In [1]:
import os
import cv2
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
from skimage.feature import local_binary_pattern, graycomatrix, graycoprops
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
DATASET_DIR = "PlantVillage"  # path to dataset folder
IMAGE_SIZE = (256, 256)       # resize images
LBP_RADIUS = 1
LBP_POINTS = 8 * LBP_RADIUS
LBP_METHOD = 'uniform'

In [3]:
# ==============================================================================
# 2. FEATURE EXTRACTION FUNCTIONS
# ==============================================================================

def extract_color_features(image):
    """Extracts mean/std of RGB & HSV channels, plus a 3D HSV color histogram."""
    features = []
    
    # RGB mean/std (Channels are BGR in OpenCV)
    for i in range(3):
        features.append(np.mean(image[:, :, i]))
        features.append(np.std(image[:, :, i]))
    
    # Convert to HSV and get mean/std
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    for i in range(3):
        features.append(np.mean(hsv[:, :, i]))
        features.append(np.std(hsv[:, :, i]))
    
    # 3D Color histogram in HSV space
    hist = cv2.calcHist([hsv], [0, 1, 2], None, [8, 8, 8],
                        [0, 180, 0, 256, 0, 256])
    cv2.normalize(hist, hist)
    features.extend(hist.flatten())
    
    return features

def extract_texture_features(image):
    """Extracts texture features using both LBP and GLCM."""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    
    # --- LBP Feature Extraction ---
    lbp = local_binary_pattern(gray, LBP_POINTS, LBP_RADIUS, method=LBP_METHOD)
    (lbp_hist, _) = np.histogram(lbp.ravel(),
                             bins=np.arange(0, LBP_POINTS + 3),
                             range=(0, LBP_POINTS + 2))
    lbp_hist = lbp_hist.astype("float")
    lbp_hist /= (lbp_hist.sum() + 1e-7)  # Normalize
    
    # --- GLCM Feature Extraction ---
    glcm = graycomatrix(gray, distances=[1, 3, 5], angles=[0, np.pi/4, np.pi/2],
                        levels=256, symmetric=True, normed=True)
    
    glcm_props = [np.mean(graycoprops(glcm, prop)) for prop in 
                  ['contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation']]
    
    # --- Combine features ---
    combined_texture_feats = np.hstack([lbp_hist, glcm_props])
    
    return combined_texture_feats.tolist()

def extract_shape_features(image):
    """Extracts shape features (aspect ratio, extent, solidity) from the largest contour."""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return [0, 0, 0]
    
    largest_contour = max(contours, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(largest_contour)
    
    aspect_ratio = float(w) / h if h != 0 else 0
    area = cv2.contourArea(largest_contour)
    rect_area = w * h
    extent = float(area) / rect_area if rect_area != 0 else 0
    
    hull = cv2.convexHull(largest_contour)
    hull_area = cv2.contourArea(hull)
    solidity = float(area) / hull_area if hull_area != 0 else 0
    
    return [aspect_ratio, extent, solidity]



In [4]:
if __name__ == "__main__":
    # --- Data Loading and Feature Extraction ---
    print("Starting feature extraction...")
    data = []
    labels = []
    
    class_folders = [f for f in os.listdir(DATASET_DIR) if os.path.isdir(os.path.join(DATASET_DIR, f))]
    
    for folder in class_folders:
        folder_path = os.path.join(DATASET_DIR, folder)
        
        for file in tqdm(os.listdir(folder_path),desc="Processing Images"):
            img_path = os.path.join(folder_path, file)
            image = cv2.imread(img_path)
            
            if image is None:
                print(f"Warning: Could not read image {img_path}. Skipping.")
                continue
            
            image_resized = cv2.resize(image, IMAGE_SIZE)
            
            # Extract all feature sets
            color_feats = extract_color_features(image_resized)
            texture_feats = extract_texture_features(image_resized)
            shape_feats = extract_shape_features(image_resized)
            
            # Combine all features into a single list
            features = color_feats + texture_feats + shape_feats
            
            data.append(features)
            labels.append(folder)
    
    print("Feature extraction complete.")

Starting feature extraction...


Processing Images: 100%|█████████████████████████████████████████████████████████████| 997/997 [01:57<00:00,  8.48it/s]
Processing Images: 100%|███████████████████████████████████████████████████████████| 1478/1478 [02:54<00:00,  8.46it/s]
Processing Images: 100%|███████████████████████████████████████████████████████████| 1000/1000 [02:04<00:00,  8.05it/s]
Processing Images: 100%|█████████████████████████████████████████████████████████████| 152/152 [00:18<00:00,  8.07it/s]
Processing Images: 100%|███████████████████████████████████████████████████████████| 1000/1000 [02:00<00:00,  8.27it/s]
Processing Images: 100%|███████████████████████████████████████████████████████████| 2127/2127 [04:21<00:00,  8.13it/s]
Processing Images: 100%|███████████████████████████████████████████████████████████| 1000/1000 [01:58<00:00,  8.46it/s]
Processing Images: 100%|███████████████████████████████████████████████████████████| 1591/1591 [03:18<00:00,  8.03it/s]
Processing Images: 100%|████████████████

Feature extraction complete.





In [5]:
  # --- Data Preparation for ML ---
print("Preparing data for training...")
df = pd.DataFrame(data)
le = LabelEncoder()
labels_encoded = le.fit_transform(labels)

X_train, X_test, y_train, y_test = train_test_split(
    df, 
    labels_encoded,
    test_size=0.2,
    random_state=42,
    stratify=labels_encoded
)
print(f"Data split into {len(X_train)} training samples and {len(X_test)} testing samples.")

Preparing data for training...
Data split into 16510 training samples and 4128 testing samples.


In [6]:
# --- Model Training ---
print("Training the RandomForestClassifier model...")
clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)
print("Model training complete.")

Training the RandomForestClassifier model...
Model training complete.


In [7]:

# --- Model Evaluation ---
print("\n--- Model Evaluation ---")
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))


--- Model Evaluation ---
Model Accuracy: 96.95%

Classification Report:
                                             precision    recall  f1-score   support

              Pepper__bell___Bacterial_spot       0.99      0.97      0.98       199
                     Pepper__bell___healthy       0.95      0.99      0.97       296
                      Potato___Early_blight       0.99      0.98      0.98       200
                       Potato___Late_blight       0.95      0.97      0.96       200
                           Potato___healthy       1.00      0.77      0.87        30
                      Tomato_Bacterial_spot       0.97      0.97      0.97       426
                        Tomato_Early_blight       0.97      0.86      0.92       200
                         Tomato_Late_blight       0.96      0.96      0.96       382
                           Tomato_Leaf_Mold       0.98      0.99      0.99       190
                  Tomato_Septoria_leaf_spot       0.97      0.96      0.96  

In [None]:
# --- Model Saving ---
model_filename = 'plant_disease_classifier.joblib'
joblib.dump(clf, model_filename)
joblib.dump(le, 'label_encoder.joblib') # Also save the label encoder
print(f"\nTrained model saved as '{model_filename}'")
print(f"Label encoder saved as 'label_encoder.joblib'")