## 03 - Image Processing

Image processing techniques applied directly to the original data (in place):
- Noise Reduction (Bilateral Filter, Morphological Operations)
- Color Space Conversion (RGB ↔ HSV ↔ LAB)
- Image Enhancement 
- Edge Detection 
- Normalization 
- Aspect Ratio Preservation with Resizing

In [3]:
import os, json
from datetime import datetime
import cv2, numpy as np, pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm import tqdm
import warnings
# Silence every warning category for the rest of the session
# (note: this also hides deprecation notices from the libraries in use).
warnings.filterwarnings('ignore')

# Remove the row limit so pandas prints long DataFrames/Series in full.
pd.set_option('display.max_rows', None)

Paths & settings 

In [4]:
# --- Locations ---------------------------------------------------------------
# Dataset root. Its sub-directories are scanned as classes, and the images
# inside them are overwritten by the processing step.
DATA_PATH = Path("/Users/amirah/Ghiras's datast/THE DATA")

# Everything this notebook produces (logs, figures) is written under WORK_DIR.
WORK_DIR = Path("./plant_disease_project2")
LOGS_DIR = WORK_DIR / "logs"
PROCESSING_DIR = WORK_DIR / "processing_techniques"

# Create the output folders up front; folders that already exist are left alone.
for out_dir in (LOGS_DIR, PROCESSING_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# --- Settings ----------------------------------------------------------------
# Size of every image after processing; resize_with_padding unpacks this as
# (height, width).
TARGET_SIZE = (224, 224)
# File extensions (lower-case, with the leading dot) that count as images.
IMG_EXTS = [".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp"]

# --- Banner ------------------------------------------------------------------
banner_rule = "=" * 80
print(banner_rule)
print("Advanced processing for plant disease images (In-Place)")
# Reminder for the reader: the processing overwrites the original images.
print("Processing will be applied directly to the original data")
print(banner_rule + "\n")


Advanced processing for plant disease images (In-Place)
Processing will be applied directly to the original data



Helper

In [5]:
def is_image(p: Path) -> bool:
    """Return True when the path's extension is listed in IMG_EXTS.

    The comparison is case-insensitive (the suffix is lower-cased first).
    Only the name is inspected; the file itself is not opened.
    """
    extension = p.suffix.lower()
    return extension in IMG_EXTS

def safe_imread(path: Path, flags=cv2.IMREAD_COLOR):
    """Read an image with OpenCV, returning None instead of raising.

    Args:
        path: Location of the image file.
        flags: OpenCV imread flag; defaults to cv2.IMREAD_COLOR.

    Returns:
        The decoded image array, or None when the file cannot be read, when
        OpenCV returns None, or when the decoded array is empty.
    """
    try:
        img = cv2.imread(str(path), flags)
    except Exception:
        # Best-effort read: an error while reading means "unreadable".
        # `Exception` (not a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate, so a long batch run can still be interrupted.
        return None

    if img is None or img.size == 0:
        return None
    return img

def write_json(obj, path: Path):
    """Serialise `obj` as pretty-printed UTF-8 JSON and write it to `path`.

    Non-ASCII characters are written as-is (ensure_ascii=False) and the output
    is indented by two spaces.

    Args:
        obj: Any JSON-serialisable object.
        path: Destination file; it is created or overwritten.

    Raises:
        TypeError: If `obj` contains a value the json module cannot encode.
            In that case the destination file is left untouched.
    """
    # Serialise before opening the file: opening in "w" mode truncates it, so
    # encoding first keeps an existing file intact when encoding fails.
    payload = json.dumps(obj, ensure_ascii=False, indent=2)
    with open(path, "w", encoding="utf-8") as f:
        f.write(payload)

Step 1: Defining the image processing techniques

In [6]:
# Noise Reduction 
def denoise_bilateral(img):
    """Smooth noise with a bilateral filter (d=9, sigmaColor=75, sigmaSpace=75).

    Returns the filtered image. If the filter raises, a warning is printed and
    the input image is returned unchanged.
    """
    try:
        smoothed = cv2.bilateralFilter(img, 9, 75, 75)
    except Exception as err:
        print(f"[WARN] Bilateral Filter failed: {err}")
        return img
    return smoothed

In [7]:
# Contrast Enhancement
# Implementation: split the image into 8×8 tiles and enhance each tile independently

def enhance_contrast_clahe(img):
    """Boost local contrast with CLAHE applied to the lightness channel only.

    CLAHE = Contrast Limited Adaptive Histogram Equalization. The BGR input is
    converted to LAB, CLAHE (clipLimit=2.0, 8x8 tile grid) is applied to the
    L channel, and the channels are merged and converted back to BGR. The a/b
    channels are passed through untouched.

    Returns the enhanced BGR image. If any step raises, or the result is
    empty, the input image is returned unchanged (a warning is printed when an
    exception occurred).
    """
    try:
        # Convert to LAB and separate the channels (index 0 is lightness).
        channels = list(cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2LAB)))

        # Equalize the lightness channel only.
        equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        channels[0] = equalizer.apply(channels[0])

        # Merge the channels again and go back to BGR.
        restored = cv2.cvtColor(cv2.merge(channels), cv2.COLOR_LAB2BGR)

        # Sanity-check the result before handing it back.
        if restored is None or restored.size == 0:
            return img
        return restored
    except Exception as err:
        print(f"[WARN] CLAHE failed: {err}")
        return img

In [None]:
# Aspect-Ratio-Preserving Resize 
def resize_with_padding(img, target_size=(224, 224)):
    """Resize an image to `target_size` without distortion, padding with white.

    The image is scaled by the largest factor that still fits inside the
    target, then centred on a white canvas of exactly the target size, so the
    leaf keeps its original proportions.

    Args:
        img: Image array; only its first two dimensions (height, width) are
            used to compute the scale.
        target_size: (height, width) of the output.

    Returns:
        The padded image of the target size. If the input has a zero
        dimension, or any step raises, the input is returned unchanged (a
        warning is printed when an exception occurred).
    """
    try:
        h, w = img.shape[:2]
        target_h, target_w = target_size

        # Reject degenerate inputs.
        if h <= 0 or w <= 0:
            return img

        # Largest scale at which both sides still fit inside the target.
        scale = min(target_w / w, target_h / h)

        # Clamp to 1 px: for extreme aspect ratios (e.g. 1000x2) the short
        # side truncates to 0, which previously made the function return the
        # image un-resized instead of producing the target size.
        new_w = max(1, int(w * scale))
        new_h = max(1, int(h * scale))

        # INTER_LINEAR is used instead of INTER_AREA for speed.
        resized = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

        # Split the leftover space evenly; the odd pixel goes bottom/right.
        pad_top = (target_h - new_h) // 2
        pad_bottom = target_h - new_h - pad_top
        pad_left = (target_w - new_w) // 2
        pad_right = target_w - new_w - pad_left

        # Add the white border.
        padded = cv2.copyMakeBorder(
            resized,
            pad_top, pad_bottom, pad_left, pad_right,
            cv2.BORDER_CONSTANT,
            value=[255, 255, 255]
        )

        return padded
    except Exception as e:
        print(f"[WARN] Resize failed: {e}")
        return img

In [9]:
# Normalization 
def normalize_to_uint8(img):
    """Min-max scale an array to [0, 255] and cast it to uint8.

    Formula: (x - min) / (max - min) * 255, where min and max are taken over
    the whole array. When the array is constant (max - min is not positive)
    the values are kept as they are. The result is clipped to [0, 255] before
    the cast.

    If the scaling raises, a warning is printed and the input is simply
    clipped to [0, 255] and cast to uint8.
    """
    try:
        lowest, highest = np.min(img), np.max(img)
        value_span = highest - lowest

        as_float = img.astype(np.float32)
        # A constant image has no range to stretch, so it is left unscaled.
        scaled = (as_float - lowest) / value_span * 255 if value_span > 0 else as_float

        return np.uint8(np.clip(scaled, 0, 255))
    except Exception as err:
        print(f"[WARN] Normalization failed: {err}")
        return np.uint8(np.clip(img, 0, 255))

In [10]:
#Edge Detection (illustrations only) 
def detect_edges_canny(img):
    """Return a Canny edge map of a BGR image (used for illustrations only).

    The image is converted to grayscale and passed to cv2.Canny with
    thresholds 100 and 200. No extra blurring is applied in this function.

    Returns:
        The edge map produced by cv2.Canny. If any step raises, a warning is
        printed and an all-zero uint8 array with the input's height and width
        is returned instead.
    """
    try:
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        return cv2.Canny(gray, 100, 200)
    except Exception as e:
        print(f"[WARN] Canny edge detection failed: {e}")
        # Size the fallback from the input itself: `gray` is not bound when
        # the grayscale conversion is the step that failed.
        return np.zeros(np.shape(img)[:2], dtype=np.uint8)

# Useful for outlining diseased regions.
# Converts the image to grayscale, then applies Canny with thresholds 100 and 200 to produce the edge map.

Step 2: Initial dataset scan

In [11]:
# Every sub-directory of DATA_PATH is treated as one class; sorting by name
# keeps the class order stable.
classes = sorted([d for d in DATA_PATH.iterdir() if d.is_dir()], key=lambda p: p.name)

# One record per image file found directly inside a class directory.
all_items = [
    {"disease_class": c.name, "path": p}
    for c in classes
    for p in c.iterdir()
    if is_image(p)
]

# Name the columns explicitly so they exist even when no image was found;
# otherwise a later df_original["disease_class"] lookup raises KeyError.
df_original = pd.DataFrame(all_items, columns=["disease_class", "path"])
total_images = len(df_original)

# This counts class directories, including any that contain no images.
print(f" Number of classes: {len(classes)}") 
print(f"Total images: {total_images}\n")

 Number of classes: 96
Total images: 83668



In [12]:
# Class distribution before processing.
# Images are counted per class (sorted by class name); only the first five
# classes are printed and the remainder is summarised as a count.

class_counts_before = df_original["disease_class"].value_counts().sort_index() 
print("Image distribution (before processing):")
for disease, count in class_counts_before.head(5).items():
    print(f"  • {disease}: {count} images")
# max(0, ...) keeps the remainder from going negative with fewer than 5 classes.
print(f"  ... and {max(0, len(class_counts_before) - 5)} other classes\n")

Image distribution (before processing):
  • Apple_Apple_scab: 2520 images
  • Apple_Black_rot: 2484 images
  • Apple_Cedar_apple_rust: 2200 images
  • Apple_healthy: 2500 images
  • Blueberry_healthy: 1816 images
  ... and 90 other classes



Step 3: Applying processing to all images (in-place)

In [13]:
# NOTE: this cell overwrites the source images. Running it again applies the
# whole pipeline a second time to the already-processed files.
processed_count = 0
failed_processing = []      # paths (as str) of images that could not be read/processed/written
visualization_samples = []  # per-stage arrays for up to 9 images, used for the pipeline figure
processing_log = []         # one dict per image with its outcome

# Loop-invariant values, computed once.
max_samples = 9
sample_stride = max(1, len(df_original) // max_samples)  # keep roughly every N-th image as a sample
expected_hw = tuple(TARGET_SIZE)  # resize_with_padding treats TARGET_SIZE as (height, width)

for idx, row in enumerate(tqdm(df_original.itertuples(index=False), total=len(df_original),
                               desc="Processing images in-place")):
    path = Path(row.path)
    disease_class = row.disease_class

    # Read original image
    original = safe_imread(path)
    if original is None:
        failed_processing.append(str(path))
        # Log unreadable images too, so the CSV log covers every failure.
        processing_log.append({
            'image': path.name,
            'disease_class': disease_class,
            'status': 'failed',
            'error': 'unreadable image'
        })
        continue

    try:
        # Processing pipeline, applied in sequence:
        # Bilateral → CLAHE → Resize+Padding → Normalize

        # 1) Noise reduction (Bilateral Filter)
        denoised = denoise_bilateral(original)

        # 2) Contrast enhancement (CLAHE)
        enhanced = enhance_contrast_clahe(denoised)

        # 3) Aspect-ratio-preserving resize
        resized = resize_with_padding(enhanced, TARGET_SIZE)

        # 4) Normalization
        normalized = normalize_to_uint8(resized)

        # The helpers return their input unchanged when they fail, so verify
        # the size before the original file is destroyed.
        if normalized.shape[:2] != expected_hw:
            raise ValueError(
                f"unexpected output size {normalized.shape[:2]}, expected {expected_hw}"
            )

        # Save the processed image over the original (in-place).
        # cv2.imwrite signals failure through its return value.
        if not cv2.imwrite(str(path), normalized):
            raise IOError(f"cv2.imwrite could not write {path}")

        processed_count += 1

        # Record the shapes before/after and the success status.
        processing_log.append({
            'image': path.name,
            'disease_class': disease_class,
            'original_shape': original.shape,
            'processed_shape': normalized.shape,
            'status': 'success'
        })

        # Keep roughly 9 samples spread over the dataset, storing every
        # pipeline stage plus the edge map for the illustration figure.
        if idx % sample_stride == 0 and len(visualization_samples) < max_samples:
            visualization_samples.append({
                'disease': disease_class,
                'original': original,
                'denoised': denoised,
                'enhanced': enhanced,
                'resized': resized,
                'normalized': normalized,
                'edges': detect_edges_canny(normalized)
            })

    except Exception as e:
        # Any error for this image is logged with its message; the loop goes on.
        failed_processing.append(str(path))
        processing_log.append({
            'image': path.name,
            'disease_class': disease_class,
            'status': 'failed',
            'error': str(e)
        })

print(f"\n Successfully processed images: {processed_count}")
print(f" Failed images: {len(failed_processing)}\n")

# Persist the list of failed images (only when there are any).
if failed_processing:
    write_json({"failed_processing": failed_processing}, LOGS_DIR / "04_failed_processing.json")

Processing images in-place: 100%|██████████| 83668/83668 [09:29<00:00, 146.84it/s] 


 Successfully processed images: 83668
 Failed images: 0






Step 4: Rescanning after processing

In [14]:
# Scan the same class folders again and recompute the per-class distribution,
# to check that the image counts did not change during processing.
all_items_after = [
    {"disease_class": c.name, "path": p}
    for c in classes
    for p in c.iterdir()
    if is_image(p)
]

# Name the columns explicitly so the lookup below works even when no image
# is found (a column-less DataFrame would raise KeyError).
df_after = pd.DataFrame(all_items_after, columns=["disease_class", "path"])
class_counts_after = df_after["disease_class"].value_counts().sort_index()

# Show the first five classes of the new distribution and summarise the rest.
print("Image distribution (after processing):")
for disease, count in class_counts_after.head(5).items():
    print(f"  • {disease}: {count} images")
# max(0, ...) keeps the remainder from going negative with fewer than 5 classes.
print(f"  ... and {max(0, len(class_counts_after) - 5)} other classes\n")

Image distribution (after processing):
  • Apple_Apple_scab: 2520 images
  • Apple_Black_rot: 2484 images
  • Apple_Cedar_apple_rust: 2200 images
  • Apple_healthy: 2500 images
  • Blueberry_healthy: 1816 images
  ... and 90 other classes



Step 5: Generating illustration figures.

In [15]:
# Builds a grid with one row per collected sample and six columns, one per
# processing stage (original, denoised, enhanced, resized, normalized, edges).
if visualization_samples:
    n_rows = len(visualization_samples)  # at most 9; sized to the samples actually collected
    fig, axes = plt.subplots(n_rows, 6, figsize=(20, 14), squeeze=False)

    # Intermediate colour stages shown in columns 2-5: (sample key, panel title).
    stage_panels = [
        ('denoised', "2. Denoised\n(Bilateral Filter)"),
        ('enhanced', "3. Enhanced\n(CLAHE)"),
        ('resized', f"4. Resized\n{TARGET_SIZE}"),
        ('normalized', "5. Normalized\n[0, 255]"),
    ]

    for ax_row, sample in zip(axes, visualization_samples):
        # Original, labelled with the (truncated) class name.
        ax_row[0].imshow(cv2.cvtColor(sample['original'], cv2.COLOR_BGR2RGB))
        ax_row[0].set_title(f"1. Original\n{sample['disease'][:15]}", fontsize=8, fontweight='bold')

        # Every colour stage is stored as BGR and must be converted to RGB for
        # matplotlib. This includes the normalized image: imshow ignores
        # `cmap` for 3-channel data, so passing BGR swaps red and blue.
        for ax, (stage_key, stage_title) in zip(ax_row[1:5], stage_panels):
            ax.imshow(cv2.cvtColor(sample[stage_key], cv2.COLOR_BGR2RGB))
            ax.set_title(stage_title, fontsize=8)

        # Edge map is single-channel, so a gray colormap applies.
        ax_row[5].imshow(sample['edges'], cmap='gray')
        ax_row[5].set_title("6. Edges\n(Canny)", fontsize=8)

        for ax in ax_row:
            ax.axis('off')

    fig.suptitle('Complete Image Processing Pipeline (Applied In-Place)', 
                 fontsize=16, fontweight='bold', y=0.995)
    plt.tight_layout()
    plt.savefig(PROCESSING_DIR / "processing_pipeline_inplace.png", dpi=200, bbox_inches='tight')
    plt.close(fig)

    print(f" Saved pipeline figure: {PROCESSING_DIR / 'processing_pipeline_inplace.png'}\n")

 Saved pipeline figure: plant_disease_project2/processing_techniques/processing_pipeline_inplace.png



Step 6: Generating distribution statistics

In [16]:
# Two side-by-side panels: horizontal bar chart (left) and pie chart (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 5))
bar_ax, pie_ax = axes[0], axes[1]

# Left panel: the 20 classes with the most images.
top_20 = class_counts_after.nlargest(20)
bar_positions = range(len(top_20))
bar_ax.barh(bar_positions, top_20.values, color='steelblue')
bar_ax.set_yticks(bar_positions)
bar_ax.set_yticklabels(top_20.index, fontsize=9)
bar_ax.set_xlabel("Number of Images", fontsize=12)
bar_ax.set_title("Top 20 Diseases (After Processing)", fontsize=14, fontweight='bold')
bar_ax.grid(axis='x', alpha=0.3)

# Right panel: the 15 largest classes. Only these 15 values are passed to
# pie(), so the percentages are shares of the plotted classes.
top_15 = class_counts_after.nlargest(15)
colors = plt.cm.Set3(np.linspace(0, 1, len(top_15)))
wedges, texts, autotexts = pie_ax.pie(
    top_15.values,
    labels=top_15.index,
    autopct='%1.1f%%',
    colors=colors,
    startangle=90,
)
# Shrink both the class labels and the percentage labels.
for class_label in texts:
    class_label.set_fontsize(8)
for pct_label in autotexts:
    pct_label.set_color('black')
    pct_label.set_fontsize(8)
pie_ax.set_title("Top 15 Diseases Distribution", fontsize=14, fontweight='bold')

# Write the figure to disk and release it.
plt.tight_layout()
plt.savefig(PROCESSING_DIR / "disease_distribution_after.png", dpi=200, bbox_inches='tight')
plt.close()

print(f" Saved statistics: {PROCESSING_DIR / 'disease_distribution_after.png'}\n")

 Saved statistics: plant_disease_project2/processing_techniques/disease_distribution_after.png



Step 7: Building final report

In [17]:
# Assemble the report section by section; keys are inserted in the order they
# should appear in the JSON file.

# Run metadata.
summary = {
    "project": "Plant Disease Detection - Graduation Project",
    "phase": "Advanced Image Processing (In-Place)",
    "processing_date": datetime.now().isoformat(),
    "processing_location": "Applied directly to source data",
    "source_path": str(DATA_PATH),
}

# Counters gathered by the processing loop and the rescan.
summary["statistics"] = {
    "total_images_processed": processed_count,
    "failed_processing": len(failed_processing),
    "target_size": TARGET_SIZE,
    "total_diseases": len(class_counts_after),
}

# Description of each pipeline step, in the order the steps are applied.
summary["processing_techniques"] = {
    "1_denoising": {
        "name": "Bilateral Filter",
        "description": "Edge-preserving noise reduction",
        "formula": "Gaussian filtering in spatial and color (range) domains",
        "parameters": {"diameter": 9, "sigma_color": 75, "sigma_space": 75},
    },
    "2_contrast_enhancement": {
        "name": "CLAHE",
        "description": "Contrast Limited Adaptive Histogram Equalization",
        "formula": "Split image into 8×8 tiles and enhance each tile independently",
        "parameters": {"clip_limit": 2.0, "tile_grid_size": 8},
    },
    "3_resizing": {
        "name": "Aspect Ratio Preserving Resize",
        "description": "Resize with white padding to maintain proportions",
        "formula": "Compute optimal scale + add white padding",
        "target_size": TARGET_SIZE,
    },
    "4_normalization": {
        "name": "Min-Max Scaling",
        "description": "Normalize pixel values to [0, 255]",
        "formula": "(x - min) / (max - min) * 255",
        "output_range": "[0, 255]",
    },
}

# Per-class image counts after processing, plus the first ten log entries.
summary["disease_distribution"] = class_counts_after.to_dict()
summary["processing_log_samples"] = processing_log[:10]

write_json(summary, LOGS_DIR / "05_processing_summary_inplace.json")

In [18]:
# Persist the complete per-image processing log as CSV (without the index column).
df_processing_log = pd.DataFrame(data=processing_log)
df_processing_log.to_csv(path_or_buf=LOGS_DIR / "processing_log.csv", index=False)

Final summary 

In [19]:
# Final console report: collect every line first, then print them in one go.
divider = "=" * 80

report_lines = [
    divider,
    " In-place processing completed successfully!",
    divider,
    # Run counters.
    "\n Summary:",
    f"  • Processed images: {processed_count}",
    f"  • Failed images: {len(failed_processing)}",
    f"  • Number of classes: {len(class_counts_after)}",
    f"  • Final target size: {TARGET_SIZE}",
    "  • Final value range: [0, 255]",
    # Techniques used by the pipeline.
    "\n Applied image processing techniques:",
    "Bilateral Filter — edge-preserving denoising",
    "CLAHE — local contrast enhancement",
    "Aspect-Ratio Preserving Resize — no geometric distortion",
    "Min-Max Normalization — scale to [0, 255]",
    "Canny Edge Detection — (illustrations only)",
    # Where the results live.
    "\n Outputs:",
    f"  • Processed data location: {DATA_PATH.resolve()}",
    f"  • Illustration figures: {PROCESSING_DIR}",
    f"  • Logs: {LOGS_DIR}",
    # Files written by this notebook.
    "\n Saved files:",
    "  • processing_pipeline_inplace.png — pipeline figure",
    "  • disease_distribution_after.png — distribution stats",
    "  • 05_processing_summary_inplace.json — comprehensive summary",
    "  • processing_log.csv — processing log",
    "\n✨ The original dataset is now processed and ready for training!",
    divider,
]

print("\n".join(report_lines))

 In-place processing completed successfully!

 Summary:
  • Processed images: 83668
  • Failed images: 0
  • Number of classes: 95
  • Final target size: (224, 224)
  • Final value range: [0, 255]

 Applied image processing techniques:
Bilateral Filter — edge-preserving denoising
CLAHE — local contrast enhancement
Aspect-Ratio Preserving Resize — no geometric distortion
Min-Max Normalization — scale to [0, 255]
Canny Edge Detection — (illustrations only)

 Outputs:
  • Processed data location: /Users/amirah/Ghiras's datast/THE DATA
  • Illustration figures: plant_disease_project2/processing_techniques
  • Logs: plant_disease_project2/logs

 Saved files:
  • processing_pipeline_inplace.png — pipeline figure
  • disease_distribution_after.png — distribution stats
  • 05_processing_summary_inplace.json — comprehensive summary
  • processing_log.csv — processing log

✨ The original dataset is now processed and ready for training!
