In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedRandomForestClassifier

In [3]:
data = pd.read_csv("plant_disease_dataset.csv")

# Feature Engineering
data['temp_humidity'] = data['temperature'] * data['humidity']
data['temp_rain_ratio'] = data['temperature'] / (data['rainfall'] + 0.001)
data['humidity_category'] = pd.cut(data['humidity'],
                                 bins=[0, 60, 80, 100],
                                 labels=['low', 'medium', 'high'])

# Convert categorical to dummy variables
data = pd.get_dummies(data, columns=['humidity_category'], drop_first=True)

# Separate features/target
X = data.drop('disease_present', axis=1)
y = data['disease_present']


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

In [21]:
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Balanced RF": BalancedRandomForestClassifier(random_state=42),
}

# Train and evaluate
for name, model in models.items():
    model.fit(X_train_balanced, y_train_balanced)
    y_pred = model.predict(X_test_scaled)

    print(f"\n{name} Performance:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))



Random Forest Performance:
              precision    recall  f1-score   support

           0       0.89      0.91      0.90      1518
           1       0.69      0.66      0.68       482

    accuracy                           0.85      2000
   macro avg       0.79      0.78      0.79      2000
weighted avg       0.84      0.85      0.85      2000

Confusion Matrix:
[[1376  142]
 [ 164  318]]

Balanced RF Performance:
              precision    recall  f1-score   support

           0       0.89      0.91      0.90      1518
           1       0.69      0.66      0.68       482

    accuracy                           0.85      2000
   macro avg       0.79      0.78      0.79      2000
weighted avg       0.85      0.85      0.85      2000

Confusion Matrix:
[[1374  144]
 [ 162  320]]
