In [8]:
# 1. Import Libraries
# ---------------------------------------
import os
import shutil
import hashlib
from PIL import Image
from sklearn.model_selection import train_test_split
import json
from tqdm import tqdm

In [9]:
# 2. Define Paths
# ---------------------------------------
# Root of the raw dataset; later cells list this directory and treat each
# entry as one class folder.
RAW_DATA_DIR = "EuroSAT/2750"
# Output root; the split cell writes <split>/<class>/ folders beneath it.
PROCESSED_DATA_DIR = "data/processed"

In [10]:
# 3. Check Dataset Structure
# ---------------------------------------
# Keep only sub-directories: a stray file in the dataset root (README,
# .DS_Store, ...) would otherwise be treated as a class and crash the
# per-class listing below. Sort because os.listdir() returns entries in
# arbitrary order, and every later cell iterates over `classes`.
classes = sorted(
    entry
    for entry in os.listdir(RAW_DATA_DIR)
    if os.path.isdir(os.path.join(RAW_DATA_DIR, entry))
)
print("Classes found:", classes)

# Count images per class
for cls in classes:
    folder = os.path.join(RAW_DATA_DIR, cls)
    print(f"{cls}: {len(os.listdir(folder))} images")

Classes found: ['AnnualCrop', 'Forest', 'HerbaceousVegetation', 'Highway', 'Industrial', 'Pasture', 'PermanentCrop', 'Residential', 'River', 'SeaLake']
AnnualCrop: 3000 images
Forest: 3000 images
HerbaceousVegetation: 3000 images
Highway: 2500 images
Industrial: 2500 images
Pasture: 2000 images
PermanentCrop: 2500 images
Residential: 3000 images
River: 2500 images
SeaLake: 3000 images


In [11]:
# 4. Remove Duplicates
# ---------------------------------------
def remove_duplicates(folder):
    """Delete byte-identical duplicate files in ``folder``.

    Files are compared by the MD5 digest of their full contents. Entries are
    visited in sorted name order, so within each group of identical files the
    alphabetically first one is kept and the others are deleted.

    Args:
        folder: Path of the directory to de-duplicate. Only direct entries
            are inspected; sub-directories are skipped.

    Returns:
        int: Number of files that were deleted.

    Raises:
        OSError: If ``folder`` cannot be listed or a duplicate cannot be
            deleted.
    """
    seen_hashes = set()
    duplicates = []

    # os.listdir() order is arbitrary; sorting makes the surviving copy
    # deterministic across runs and platforms.
    for img_name in sorted(os.listdir(folder)):
        img_path = os.path.join(folder, img_name)
        if not os.path.isfile(img_path):
            continue  # sub-directories and other non-files are not images
        try:
            with open(img_path, "rb") as f:
                filehash = hashlib.md5(f.read()).hexdigest()
        except OSError:
            # Best effort: an unreadable file is left alone. Only I/O errors
            # are swallowed so that Ctrl+C and real bugs still surface.
            continue
        if filehash in seen_hashes:
            duplicates.append(img_path)
        else:
            seen_hashes.add(filehash)

    # Remove duplicate images
    for dup in duplicates:
        os.remove(dup)
    return len(duplicates)

# De-duplicate each class folder in turn and keep a running total.
total_removed = 0
for cls in classes:
    folder = os.path.join(RAW_DATA_DIR, cls)
    removed = remove_duplicates(folder)
    print(f"Removed {removed} duplicates from {cls}")
    total_removed = total_removed + removed

print(f"Total duplicates removed: {total_removed}")

Removed 0 duplicates from AnnualCrop
Removed 0 duplicates from Forest
Removed 0 duplicates from HerbaceousVegetation
Removed 0 duplicates from Highway
Removed 0 duplicates from Industrial
Removed 0 duplicates from Pasture
Removed 0 duplicates from PermanentCrop
Removed 0 duplicates from Residential
Removed 0 duplicates from River
Removed 0 duplicates from SeaLake
Total duplicates removed: 0


In [12]:
# 5. Check Corrupted Images
# ---------------------------------------
def check_and_clean_images(folder):
    """Delete files in ``folder`` that Pillow cannot open or verify.

    Args:
        folder: Directory whose direct entries are checked. Sub-directories
            are skipped.

    Returns:
        list[str]: Paths of the files that failed verification and were
        deleted.

    Raises:
        OSError: If ``folder`` cannot be listed or a corrupted file cannot be
            deleted.
    """
    corrupted = []
    for img_name in tqdm(os.listdir(folder), desc=f"Checking {folder}"):
        img_path = os.path.join(folder, img_name)
        if not os.path.isfile(img_path):
            # A directory is not a corrupted image, and os.remove() would
            # fail on it anyway.
            continue
        try:
            # The context manager closes the file handle even when verify()
            # raises; an open handle blocks os.remove() on Windows.
            with Image.open(img_path) as img:
                img.verify()  # Verify image integrity
        except Exception:
            # `Exception`, not a bare except: KeyboardInterrupt/SystemExit
            # must not cause a healthy image to be deleted.
            corrupted.append(img_path)
            os.remove(img_path)
    return corrupted

# Verify each class folder in turn and collect every deleted path.
corrupted_total = []
for cls in classes:
    folder = os.path.join(RAW_DATA_DIR, cls)
    corrupted = check_and_clean_images(folder)
    print(f"Removed {len(corrupted)} corrupted images from {cls}")
    corrupted_total += corrupted

print(f"Total corrupted removed: {len(corrupted_total)}")

Checking EuroSAT/2750\AnnualCrop: 100%|██████████| 3000/3000 [00:00<00:00, 12788.07it/s]


Removed 0 corrupted images from AnnualCrop


Checking EuroSAT/2750\Forest: 100%|██████████| 3000/3000 [00:00<00:00, 13077.83it/s]


Removed 0 corrupted images from Forest


Checking EuroSAT/2750\HerbaceousVegetation: 100%|██████████| 3000/3000 [00:00<00:00, 12649.72it/s]


Removed 0 corrupted images from HerbaceousVegetation


Checking EuroSAT/2750\Highway: 100%|██████████| 2500/2500 [00:00<00:00, 13285.47it/s]


Removed 0 corrupted images from Highway


Checking EuroSAT/2750\Industrial: 100%|██████████| 2500/2500 [00:00<00:00, 13612.28it/s]


Removed 0 corrupted images from Industrial


Checking EuroSAT/2750\Pasture: 100%|██████████| 2000/2000 [00:00<00:00, 12764.75it/s]


Removed 0 corrupted images from Pasture


Checking EuroSAT/2750\PermanentCrop: 100%|██████████| 2500/2500 [00:00<00:00, 13516.71it/s]


Removed 0 corrupted images from PermanentCrop


Checking EuroSAT/2750\Residential: 100%|██████████| 3000/3000 [00:00<00:00, 13751.91it/s]


Removed 0 corrupted images from Residential


Checking EuroSAT/2750\River: 100%|██████████| 2500/2500 [00:00<00:00, 13403.95it/s]


Removed 0 corrupted images from River


Checking EuroSAT/2750\SeaLake: 100%|██████████| 3000/3000 [00:00<00:00, 14574.91it/s]

Removed 0 corrupted images from SeaLake
Total corrupted removed: 0





In [13]:
# 6. Split Train/Val/Test
# ---------------------------------------
def split_and_copy(holdout_size=0.3, test_share=0.5, random_state=42):
    """Split each class into train/val/test and copy the files.

    For every name in the global ``classes`` list, the files under
    ``RAW_DATA_DIR/<class>`` are split in two steps: first a hold-out set is
    separated from the training set, then the hold-out set is divided into
    validation and test. Files are copied (not moved) to
    ``PROCESSED_DATA_DIR/<split>/<class>/``.

    Args:
        holdout_size: Fraction of each class set aside for val + test.
        test_share: Fraction of the hold-out set that becomes the test split;
            the remainder is validation.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        None. The result is the directory tree written to disk.
    """
    for cls in classes:
        folder = os.path.join(RAW_DATA_DIR, cls)
        # os.listdir() order is arbitrary, so a fixed seed alone does not give
        # a reproducible split; sort the names first. Non-files are dropped
        # because shutil.copy() cannot copy a directory.
        images = sorted(
            name
            for name in os.listdir(folder)
            if os.path.isfile(os.path.join(folder, name))
        )

        train_imgs, holdout_imgs = train_test_split(
            images, test_size=holdout_size, random_state=random_state
        )
        val_imgs, test_imgs = train_test_split(
            holdout_imgs, test_size=test_share, random_state=random_state
        )

        splits = {"train": train_imgs, "val": val_imgs, "test": test_imgs}

        for split, split_imgs in splits.items():
            split_dir = os.path.join(PROCESSED_DATA_DIR, split, cls)
            # NOTE(review): files already present from an earlier run are not
            # cleared. If the raw data or the split parameters changed, stale
            # copies can leave the same image in two splits; delete
            # PROCESSED_DATA_DIR before re-running in that case.
            os.makedirs(split_dir, exist_ok=True)

            for img_name in split_imgs:
                src = os.path.join(folder, img_name)
                dst = os.path.join(split_dir, img_name)
                shutil.copy(src, dst)

# Perform the split; this copies every image into
# PROCESSED_DATA_DIR/<split>/<class>/ and leaves the raw folders in place.
split_and_copy()
print("✅ Train/Val/Test split completed and copied to processed folder.")

✅ Train/Val/Test split completed and copied to processed folder.


In [14]:
# 7. Save Dataset Summary
# ---------------------------------------
# Nested mapping: split name -> class name -> number of files on disk.
summary = {}

for split in ("train", "val", "test"):
    class_counts = summary.setdefault(split, {})
    for cls in classes:
        folder = os.path.join(PROCESSED_DATA_DIR, split, cls)
        count = len(os.listdir(folder))
        class_counts[cls] = count

# Serialise first, then write the finished text in one call.
with open("data/dataset_summary.json", "w") as f:
    f.write(json.dumps(summary, indent=4))

print("✅ Dataset summary saved to data/dataset_summary.json")

✅ Dataset summary saved to data/dataset_summary.json
