# Étape 1 — Collecte & organisation du dataset 



1. Importer les dépendances

In [15]:
from dotenv import load_dotenv
import os
import kagglehub
import shutil
import random
from PIL import Image

2. Load the Kaggle API key from the .env file

In [None]:
# Read the .env file into the process environment; values from the file
# take precedence over variables that are already set (override=True).
load_dotenv(verbose=True, override=True)

# Show which directory holds the Kaggle configuration
kaggle_config_dir = os.getenv("KAGGLE_CONFIG_DIR")
print("KAGGLE_CONFIG_DIR:", kaggle_config_dir)

# Show where kagglehub will cache downloaded datasets
kagglehub_cache = os.getenv("KAGGLEHUB_CACHE")
print("KAGGLEHUB_CACHE:", kagglehub_cache)

KAGGLE_CONFIG_DIR: /home/syntaxerror/India S5/Computer vision/DL-1/svm-nature-scene-classifier
KAGGLEHUB_CACHE: /home/syntaxerror/India S5/Computer vision/DL-1/svm-nature-scene-classifier


3. Télécharger le dataset Kaggle avec kagglehub

In [2]:
import kagglehub

# Kaggle handle ("owner/dataset") of the dataset used in this notebook
dataset_handle = "prithivsakthiur/multilabel-geoscenenet-16k"

# Download the latest version of the dataset and keep the local directory
# that kagglehub returns; later cells read from `path`.
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/prithivsakthiur/multilabel-geoscenenet-16k?dataset_version_number=1...


100%|██████████| 214M/214M [03:27<00:00, 1.08MB/s] 

Extracting files...





Path to dataset files: /home/syntaxerror/India S5/Computer vision/DL-1/svm-nature-scene-classifier/datasets/prithivsakthiur/multilabel-geoscenenet-16k/versions/1


4. Déplacer les données du dossier du jeu de données Kaggle installé vers le dossier data

In [5]:
# GeoSceneNet16K folder inside the downloaded dataset; it contains the
# class sub-folders moved below.
geo_folder = os.path.join(path, "GeoSceneNet16K")

# Destination: the "data" directory next to the current working directory's
# parent (created on first use).
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
data_dir = os.path.join(project_root, "data")
os.makedirs(data_dir, exist_ok=True)

# Class folders to bring into the project
folders_to_move = ["Sea or Ocean", "Forest Area", "Desert"]

for scene_folder in folders_to_move:
    source_dir = os.path.join(geo_folder, scene_folder)
    target_dir = os.path.join(data_dir, scene_folder)

    # Nothing to move: report it and go on with the next folder
    if not os.path.exists(source_dir):
        print(f"Folder '{scene_folder}' does not exist in '{geo_folder}'")
        continue

    # Drop any previous copy so the move starts from a clean destination
    if os.path.exists(target_dir):
        shutil.rmtree(target_dir)
    shutil.move(source_dir, target_dir)
    print(f"Moved '{scene_folder}' to '{data_dir}'")

Moved 'Sea or Ocean' to '/home/syntaxerror/India S5/Computer vision/DL-1/svm-nature-scene-classifier/data'
Moved 'Forest Area' to '/home/syntaxerror/India S5/Computer vision/DL-1/svm-nature-scene-classifier/data'
Moved 'Desert' to '/home/syntaxerror/India S5/Computer vision/DL-1/svm-nature-scene-classifier/data'


In [7]:
# Delete the temporary "datasets" directory left under the project root now
# that the needed class folders live in data/.
# The isdir guard keeps the cell re-runnable: a bare shutil.rmtree raises
# FileNotFoundError once the directory has already been removed.
kaggle_data_dir = os.path.join(project_root, "datasets")
if os.path.isdir(kaggle_data_dir):
    shutil.rmtree(kaggle_data_dir)

5. Garder et renommer les dossiers selon les classes souhaitées

In [8]:
# English folder name (as shipped in the dataset) -> French class name
rename_map = {
    "Sea or Ocean": "mer",
    "Forest Area": "paysage",
    "Desert": "desert"
}

# Give every class folder inside data_dir its French name
for english_name, french_name in rename_map.items():
    current_path = os.path.join(data_dir, english_name)
    renamed_path = os.path.join(data_dir, french_name)

    # Folder missing: report it and move on
    if not os.path.exists(current_path):
        print(f"'{english_name}' not found in {data_dir}")
        continue

    os.rename(current_path, renamed_path)
    print(f"Renamed '{english_name}' → '{french_name}'")

Renamed 'Sea or Ocean' → 'mer'
Renamed 'Forest Area' → 'paysage'
Renamed 'Desert' → 'desert'


6. Renommer les images (mer1.jpg, mer2.jpg, ...)

In [9]:
# Rename the images of every class folder to <class><index><ext>
# (mer1.jpg, mer2.jpg, ...).
#
# The renaming is done in two phases. Renaming straight to the final name can
# silently overwrite a file that has not been processed yet (for example when
# the cell is re-run and "mer10.jpg" is renamed to "mer1.jpg" while the old
# "mer1.jpg" is still there), which loses images. Going through unique
# temporary names first guarantees that no final name is occupied.
for cls in rename_map.values():
    folder_path = os.path.join(data_dir, cls)

    # os.listdir returns entries in arbitrary order; sorting makes the
    # numbering deterministic for a given folder content.
    images = sorted(
        f for f in os.listdir(folder_path)
        if f.lower().endswith(('.jpg', '.jpeg', '.png'))
    )

    # Phase 1: move every image to a temporary name that cannot clash with
    # an existing file. The temporary names keep the image extension, so a
    # later run picks them up again if this one is interrupted.
    token = os.urandom(8).hex()
    staged = []
    for idx, img in enumerate(images, 1):
        ext = os.path.splitext(img)[1]
        tmp_name = f".rename-{token}-{idx}{ext}"
        os.rename(os.path.join(folder_path, img), os.path.join(folder_path, tmp_name))
        staged.append((tmp_name, f"{cls}{idx}{ext}"))

    # Phase 2: every image now has a temporary name, so the final names are free.
    for tmp_name, new_name in staged:
        os.rename(os.path.join(folder_path, tmp_name), os.path.join(folder_path, new_name))


7. Équilibrage du dataset

In [None]:
# Maximum number of images to keep per class
MAX_IMAGES = 2000
# Fixed seed: given the same folder contents, the same images are removed
# on every fresh run, which keeps the resulting dataset reproducible.
SEED = 42

# Dedicated generator so the selection does not depend on the global
# random state left behind by other cells.
rng = random.Random(SEED)

# Walk through every class folder
for folder in rename_map.values():
    folder_path = os.path.join(data_dir, folder)
    if not os.path.isdir(folder_path):
        continue

    # Collect the images; sorted because os.listdir order is arbitrary and
    # the seeded sample below must always see the same sequence.
    images = sorted(
        f for f in os.listdir(folder_path)
        if f.lower().endswith(('.jpg', '.jpeg', '.png'))
    )

    count = len(images)
    # Report the size of the class before any image is removed
    print(f"{folder}: {count} images")

    # Randomly delete the surplus when the class holds more than MAX_IMAGES
    if count > MAX_IMAGES:
        for img in rng.sample(images, count - MAX_IMAGES):
            os.remove(os.path.join(folder_path, img))



mer: 2274 images
paysage: 2271 images
desert: 2000 images


In [None]:
# Check how many images each class folder holds
for folder in rename_map.values():
    folder_path = os.path.join(data_dir, folder)

    # Skip classes whose folder is missing
    if not os.path.isdir(folder_path):
        continue

    # Count the entries that carry an image extension
    count = sum(
        1
        for entry in os.listdir(folder_path)
        if entry.lower().endswith(('.jpg', '.jpeg', '.png'))
    )
    print(f"{folder}: {count} images")

mer: 2000 images
paysage: 2000 images
desert: 2000 images


8. Vérification et redimensionnement à 128x128

In [16]:
def check_and_resize(data_dir, classes, size=(128,128)):
    """Validate the images of each class folder and resize them in place.

    For every class, each file with a .jpg/.jpeg/.png extension is checked
    with Pillow, fully decoded, converted to RGB, resized to ``size`` and
    written back to the same path. Files that fail at any step are reported
    and left out of the valid count. A summary line is printed per class.

    Files that are already RGB with the requested size are still fully
    decoded (so damaged files are detected) but are not saved again; this
    avoids re-encoding them on every run of the cell.

    Args:
        data_dir: Directory that contains one sub-folder per class.
        classes: Iterable of class folder names inside ``data_dir``.
        size: Target ``(width, height)`` in pixels.

    Returns:
        None. Results are printed.
    """
    target_size = tuple(size)
    for cls in classes:
        folder = os.path.join(data_dir, cls)
        images = [f for f in os.listdir(folder) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
        valid_images = []
        for img_name in images:
            img_path = os.path.join(folder, img_name)
            try:
                # Integrity check; the context manager closes the file handle.
                with Image.open(img_path) as probe:
                    probe.verify()
                # An image cannot be used after verify(), so open it again.
                with Image.open(img_path) as original:
                    needs_rewrite = original.mode != 'RGB' or original.size != target_size
                    # convert() decodes the whole file, which also reveals
                    # damage that verify() does not catch.
                    rgb = original.convert('RGB')
                # The source file is closed at this point, so it is safe to
                # overwrite it.
                if needs_rewrite:
                    rgb.resize(target_size).save(img_path)
                valid_images.append(img_name)
            except Exception as e:
                # Best effort: report the file together with the reason and
                # carry on with the remaining images.
                print(f"Image corrompue : {img_path} ({type(e).__name__}: {e})")
        print(f"{cls}: {len(valid_images)} images valides")

# Run the check on the French class folders (the values of rename_map)
classes = rename_map.values()
check_and_resize(data_dir=data_dir, classes=classes)


mer: 2000 images valides
paysage: 2000 images valides
desert: 2000 images valides
