## 02 - Cleaning

In [2]:
import os, json, hashlib
from collections import defaultdict
from datetime import datetime
import cv2, numpy as np, pandas as pd
from pathlib import Path
from tqdm import tqdm
import random

pd.set_option('display.max_rows', None)

In [3]:
DATA_PATH = Path("/Users/amirah/Ghiras's datast/THE DATA")
WORK_DIR = Path("./plant_disease_project")
LOGS_DIR = WORK_DIR / "logs"

LOGS_DIR.mkdir(parents=True, exist_ok=True)

IMG_EXTS = [".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp"]
RANDOM_SEED = 42

random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

HELPERS

In [4]:
def is_image(p: Path) -> bool:
    #التحقق من أن الملف صورة
    return p.suffix.lower() in IMG_EXTS

def safe_imread(path: Path, flags=cv2.IMREAD_COLOR):
    """قراءة آمنة للصورة - ترجع None لو معطوبة"""
    try:
        img = cv2.imread(str(path), flags)
        if img is None or img.size == 0:
            return None
        return img
    except Exception as e:
        return None

def md5_of_file(path: Path, chunk_size=1<<20):
    """حساب MD5 hash للكشف عن التكرار"""
    try:
        h = hashlib.md5()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                h.update(chunk)
        return h.hexdigest()
    except Exception as e:
        return None

def write_json(obj, path: Path):
    """حفظ JSON"""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)

SCAN CLASSES & FILES

In [5]:
classes = sorted([d for d in DATA_PATH.iterdir() if d.is_dir()], key=lambda p: p.name)
assert classes, " No class folders found!"

all_items = []
for c in classes:
    files = [p for p in c.iterdir() if is_image(p)]
    for p in files:
        all_items.append({"disease_class": c.name, "path": p})

df_original = pd.DataFrame(all_items)
total_original = len(df_original)

print(f" Number of classes: {len(classes)}")
print(f" Total original images: {total_original}\n")

# Original class distribution
class_counts = df_original["disease_class"].value_counts().sort_index()
print("Image distribution by class (before cleaning):")
for disease, count in class_counts.items():
    print(f"  • {disease}: {count} images")
print()

 Number of classes: 96
 Total original images: 83890

Image distribution by class (before cleaning):
  • Apple_Apple_scab: 2520 images
  • Apple_Black_rot: 2484 images
  • Apple_Cedar_apple_rust: 2200 images
  • Apple_healthy: 2510 images
  • Blueberry_healthy: 1816 images
  • Cherry_Powdery_mildew: 1683 images
  • Cherry_healthy: 1826 images
  • Corn_Cercospora_leaf_spot Gray_leaf_spot: 1642 images
  • Corn_Common_rust_: 1907 images
  • Corn_Northern_Leaf_Blight: 1908 images
  • Corn_healthy: 1859 images
  • Cucumber ـBacterialـWilt: 160 images
  • Cucumber_Gummy_Stem_Blight: 160 images
  • Cucumber_Pythium_Fruit_Rot: 169 images
  • CucumberـAnthracnose: 160 images
  • CucumberـBelly Rot: 160 images
  • CucumberـDownyـMildew: 160 images
  • Cucumberـhealthy: 160 images
  • Downy_mildew_on_lettuce: 30 images
  • Grape_Black_rot: 1888 images
  • Grape_Esca_(Black_Measles): 1920 images
  • Grape___Leaf_blight_(Isariopsis_Leaf_Spot): 1722 images
  • Grape_healthy: 1692 images
  • Lemon_An

In [6]:
# Corrupted image detection

corrupted_list = []
valid_items = []

for _, row in tqdm(df_original.iterrows(), total=len(df_original), desc="Checking image integrity"):
    path = Path(row["path"])
    
    # Try to read image
    img = safe_imread(path)
    
    if img is None:
        corrupted_list.append({
            'path': str(path),
            'disease_class': row["disease_class"],
            'filename': path.name
        })
    else:
        valid_items.append(row.to_dict())

df_valid = pd.DataFrame(valid_items)
total_corrupted = len(corrupted_list)

print(f"\n Valid images: {len(df_valid)}")
print(f" Corrupted images: {total_corrupted}\n")

if corrupted_list:
    df_corrupted = pd.DataFrame(corrupted_list)
    
    print("Corrupted image list:")
    print(df_corrupted[['disease_class', 'filename']].to_string(index=False))
    print()
    
    # Save corrupted list
    write_json(corrupted_list, LOGS_DIR / "01_corrupted_images.json")
    df_corrupted.to_csv(LOGS_DIR / "01_corrupted_images.csv", index=False)
    
    # Delete corrupted images from source
    print("[Deleting corrupted images from source...]\n")
    deleted_count = 0
    for item in tqdm(corrupted_list, desc="Deleting corrupted images"):
        try:
            Path(item['path']).unlink(missing_ok=True)
            deleted_count += 1
        except Exception as e:
            print(f"Failed to delete: {item['path']}")
    
    print(f"\n Deleted {deleted_count} corrupted images from source\n")

Checking image integrity: 100%|██████████| 83890/83890 [02:50<00:00, 491.89it/s] 


 Valid images: 83890
 Corrupted images: 0






DUPLICATE DETECTION

In [7]:
duplicates_info = {}
duplicates_to_remove = []

# Check duplicates within each class
for disease in tqdm(sorted(df_valid["disease_class"].unique()), desc="Searching for duplicates"):
    sub = df_valid[df_valid["disease_class"] == disease].reset_index(drop=True)
    seen = {}  # hash -> first_path
    
    for _, row in sub.iterrows():
        path = Path(row["path"])
        file_hash = md5_of_file(path)
        
        if file_hash is None:
            continue
        
        # If the hash already exists => duplicate image
        if file_hash in seen:
            # Keep the first one, remove the later one
            first_path = seen[file_hash]
            
            key = (disease, file_hash)
            if key not in duplicates_info:
                duplicates_info[key] = {
                    'disease': disease,
                    'hash': file_hash,
                    'kept': str(first_path),
                    'duplicates': []
                }
            
            duplicates_info[key]['duplicates'].append(str(path))
            duplicates_to_remove.append({
                'path': str(path),
                'disease_class': disease,
                'filename': path.name,
                'hash': file_hash
            })
        else:
            seen[file_hash] = path

total_duplicates = len(duplicates_to_remove)

print(f"\n Number of duplicate groups: {len(duplicates_info)}")
print(f" Number of duplicate images (to remove): {total_duplicates}\n")

if duplicates_to_remove:
    df_dups = pd.DataFrame(duplicates_to_remove)
    
    print("List of duplicate images (to be removed):")
    print(df_dups[['disease_class', 'filename']].head(10).to_string(index=False))
    if len(df_dups) > 10:
        print(f"... and {len(df_dups) - 10} more images")
    print()
    
    # Save duplicate list
    write_json(duplicates_to_remove, LOGS_DIR / "02_duplicates_to_remove.json")
    df_dups.to_csv(LOGS_DIR / "02_duplicates_summary.csv", index=False)
    
    # Delete duplicate images from source
    print("[Deleting duplicate images from source...]\n")
    deleted_dup_count = 0
    for item in tqdm(duplicates_to_remove, desc="Deleting duplicate images"):
        try:
            Path(item['path']).unlink(missing_ok=True)
            deleted_dup_count += 1
        except Exception as e:
            print(f"Failed to delete: {item['path']}")
    
    print(f"\n Deleted {deleted_dup_count} duplicate images from source\n")

Searching for duplicates: 100%|██████████| 95/95 [00:26<00:00,  3.52it/s]



 Number of duplicate groups: 206
 Number of duplicate images (to remove): 222

List of duplicate images (to be removed):
disease_class                                                     filename
Apple_healthy        dc18b924-f172-445d-8fed-61445d437aaa___RS_HL 6270.JPG
Apple_healthy        acb21cc2-8d65-4880-a7bb-dcc1eab1564b___RS_HL 6272.JPG
Apple_healthy        c21cf428-bfc3-4710-b5d2-69d1c0e94748___RS_HL 6268.JPG
Apple_healthy dc18b924-f172-445d-8fed-61445d437aaa___RS_HL 6270_flipTB.JPG
Apple_healthy        13298d36-4425-437d-ae8e-c7d70e200084___RS_HL 6271.JPG
Apple_healthy        5192db55-4aa7-421c-92d4-c2dac79e7379___RS_HL 6273.JPG
Apple_healthy        fdbfa6f7-f887-442d-8df1-1f0cf839fc4d___RS_HL 6274.JPG
Apple_healthy fdbfa6f7-f887-442d-8df1-1f0cf839fc4d___RS_HL 6274_flipTB.JPG
Apple_healthy        3673d121-b5de-481c-b057-d4ee5b4959b1___RS_HL 6269.JPG
Apple_healthy c21cf428-bfc3-4710-b5d2-69d1c0e94748___RS_HL 6268_flipTB.JPG
... and 212 more images

[Deleting duplicate images f

Deleting duplicate images: 100%|██████████| 222/222 [00:00<00:00, 8552.26it/s]


 Deleted 222 duplicate images from source






RESCAN AFTER DEDUP

In [8]:
all_items_after = []
for c in classes:
    files = [p for p in c.iterdir() if is_image(p)]
    for p in files:
        all_items_after.append({"disease_class": c.name, "path": p})

df_final = pd.DataFrame(all_items_after)
total_final = len(df_final)

print(f"✓ Total images after cleaning: {total_final}\n")

# Class distribution after cleaning
class_counts_after = df_final["disease_class"].value_counts().sort_index()
print("Image distribution by class (after cleaning):")
for disease, count in class_counts_after.items():
    print(f"  • {disease}: {count} images")
print()

✓ Total images after cleaning: 83668

Image distribution by class (after cleaning):
  • Apple_Apple_scab: 2520 images
  • Apple_Black_rot: 2484 images
  • Apple_Cedar_apple_rust: 2200 images
  • Apple_healthy: 2500 images
  • Blueberry_healthy: 1816 images
  • Cherry_Powdery_mildew: 1683 images
  • Cherry_healthy: 1826 images
  • Corn_Cercospora_leaf_spot Gray_leaf_spot: 1642 images
  • Corn_Common_rust_: 1907 images
  • Corn_Northern_Leaf_Blight: 1908 images
  • Corn_healthy: 1859 images
  • Cucumber ـBacterialـWilt: 159 images
  • Cucumber_Gummy_Stem_Blight: 160 images
  • Cucumber_Pythium_Fruit_Rot: 160 images
  • CucumberـAnthracnose: 160 images
  • CucumberـBelly Rot: 160 images
  • CucumberـDownyـMildew: 160 images
  • Cucumberـhealthy: 160 images
  • Downy_mildew_on_lettuce: 30 images
  • Grape_Black_rot: 1888 images
  • Grape_Esca_(Black_Measles): 1920 images
  • Grape___Leaf_blight_(Isariopsis_Leaf_Spot): 1722 images
  • Grape_healthy: 1692 images
  • Lemon_Anthracnose: 251 im

Statistical Summary

In [9]:

summary_stats = {
    "original_total": int(total_original),
    "corrupted_removed": int(total_corrupted),
    "duplicates_removed": int(total_duplicates),
    "final_total": int(total_final),
    "num_diseases": len(classes),
    "diseases": class_counts_after.to_dict(),
    "cleaning_date": datetime.now().isoformat()
}

write_json(summary_stats, LOGS_DIR / "03_cleaning_summary.json")

Final Summary

In [10]:
print("="*80)
print(" Cleaning summary")
print("="*80)
print(f"\n Statistics:")
print(f"  • Original images: {total_original}")
print(f"  • Corrupted images (removed): {total_corrupted}")
print(f"  • Duplicate images (removed): {total_duplicates}")
print(f"  • Final clean images: {total_final}")
print(f"  • Remaining percentage: {(total_final/total_original*100):.2f}%")
print(f"\n  • Number of diseases: {len(classes)}")

print(f"\n Saved files:")
print(f"  • 01_corrupted_images.json - Corrupted image list")
print(f"  • 01_corrupted_images.csv - Corrupted images (table)")
print(f"  • 02_duplicates_to_remove.json - Duplicate image list")
print(f"  • 02_duplicates_summary.csv - Duplicates summary (table)")
print(f"  • 03_cleaning_summary.json - Full statistical summary")

print(f"\n Path: {LOGS_DIR.resolve()}")

print("\n" + "="*80)
print(" Cleaning completed successfully!")
print("Next step: apply advanced image processing")
print("="*80)

 Cleaning summary

 Statistics:
  • Original images: 83890
  • Corrupted images (removed): 0
  • Duplicate images (removed): 222
  • Final clean images: 83668
  • Remaining percentage: 99.74%

  • Number of diseases: 96

 Saved files:
  • 01_corrupted_images.json - Corrupted image list
  • 01_corrupted_images.csv - Corrupted images (table)
  • 02_duplicates_to_remove.json - Duplicate image list
  • 02_duplicates_summary.csv - Duplicates summary (table)
  • 03_cleaning_summary.json - Full statistical summary

 Path: /Users/amirah/Ghiras's datast/plant_disease_project/logs

 Cleaning completed successfully!
Next step: apply advanced image processing
