In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import joblib
from imblearn.over_sampling import SMOTE  # Import SMOTE

In [3]:
# Load dataset
file_path = 'Crop_Database.csv'  # Replace with the correct file path
data = pd.read_csv(file_path)


In [4]:
# EDA: Handling missing values
data = data.assign(
    temperature=data['temperature'].fillna(data['temperature'].mean()),
    humidity=data['humidity'].fillna(data['humidity'].mean()),
    rainfall=data['rainfall'].fillna(data['rainfall'].mean())
)

data = data.dropna(subset=['label'])

In [5]:
# Encode the categorical labels (assuming the label is categorical)
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

In [6]:
# Scale numerical features
scaler = StandardScaler()
features = ['temperature', 'humidity', 'rainfall']
data[features] = scaler.fit_transform(data[features])

In [7]:
# Separate features and target
X = data.drop('label', axis=1)
y = data['label']

In [8]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
# Apply SMOTE to balance the training dataset
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [10]:
# Random Forest Model
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_smote, y_train_smote)
y_pred_rf = rf_classifier.predict(X_test)

In [11]:
# Evaluate Random Forest
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf, average='weighted')
recall_rf = recall_score(y_test, y_pred_rf, average='weighted')
f1_rf = f1_score(y_test, y_pred_rf, average='weighted')
classification_rep_rf = classification_report(y_test, y_pred_rf)

print(f"Random Forest Accuracy: {accuracy_rf:.4f}")
print(f"Random Forest Precision: {precision_rf:.4f}")
print(f"Random Forest Recall: {recall_rf:.4f}")
print(f"Random Forest F1 Score: {f1_rf:.4f}")
print("\nRandom Forest Classification Report:\n", classification_rep_rf)

Random Forest Accuracy: 0.8873
Random Forest Precision: 0.8941
Random Forest Recall: 0.8873
Random Forest F1 Score: 0.8868

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.88      0.84        24
           1       0.95      0.95      0.95        21
           2       0.76      0.90      0.83        21
           3       0.91      1.00      0.95        20
           4       0.84      0.94      0.89        17
           5       0.89      0.89      0.89        19
           6       0.86      0.90      0.88        20
           7       0.71      0.71      0.71         7
           8       1.00      0.85      0.92        27
           9       1.00      1.00      1.00        15
          10       0.88      0.78      0.82        18
          11       0.76      1.00      0.86        19
          12       0.79      0.83      0.81        18
          13       1.00      0.93      0.97        15
          14       1.00    

In [12]:
# XGBoost Model
xgb_classifier = XGBClassifier(random_state=42)
xgb_classifier.fit(X_train_smote, y_train_smote)
y_pred_xgb = xgb_classifier.predict(X_test)

In [13]:
# Evaluate XGBoost
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb, average='weighted')
recall_xgb = recall_score(y_test, y_pred_xgb, average='weighted')
f1_xgb = f1_score(y_test, y_pred_xgb, average='weighted')
classification_rep_xgb = classification_report(y_test, y_pred_xgb)

print(f"XGBoost Accuracy: {accuracy_xgb:.4f}")
print(f"XGBoost Precision: {precision_xgb:.4f}")
print(f"XGBoost Recall: {recall_xgb:.4f}")
print(f"XGBoost F1 Score: {f1_xgb:.4f}")
print("\nXGBoost Classification Report:\n", classification_rep_xgb)

XGBoost Accuracy: 0.8681
XGBoost Precision: 0.8724
XGBoost Recall: 0.8681
XGBoost F1 Score: 0.8678

XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.71      0.71        24
           1       0.95      1.00      0.98        21
           2       0.78      0.86      0.82        21
           3       0.91      1.00      0.95        20
           4       0.80      0.94      0.86        17
           5       0.90      0.95      0.92        19
           6       0.95      0.90      0.92        20
           7       0.78      1.00      0.88         7
           8       0.96      0.85      0.90        27
           9       1.00      0.87      0.93        15
          10       0.80      0.67      0.73        18
          11       0.77      0.89      0.83        19
          12       0.74      0.78      0.76        18
          13       1.00      0.93      0.97        15
          14       1.00      1.00      1.00        22
   

In [14]:
# Stacking Ensemble Model
base_models = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('xgb', XGBClassifier(random_state=42))
]
meta_model = LogisticRegression()
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)
stacking_model.fit(X_train_smote, y_train_smote)
y_pred_stacking = stacking_model.predict(X_test)

In [15]:
# Evaluate Stacking Model
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
precision_stacking = precision_score(y_test, y_pred_stacking, average='weighted')
recall_stacking = recall_score(y_test, y_pred_stacking, average='weighted')
f1_stacking = f1_score(y_test, y_pred_stacking, average='weighted')
classification_rep_stacking = classification_report(y_test, y_pred_stacking)

print(f"Stacking Model Accuracy: {accuracy_stacking:.4f}")
print(f"Stacking Model Precision: {precision_stacking:.4f}")
print(f"Stacking Model Recall: {recall_stacking:.4f}")
print(f"Stacking Model F1 Score: {f1_stacking:.4f}")
print("\nStacking Model Classification Report:\n", classification_rep_stacking)

Stacking Model Accuracy: 0.8849
Stacking Model Precision: 0.8891
Stacking Model Recall: 0.8849
Stacking Model F1 Score: 0.8847

Stacking Model Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.83      0.82        24
           1       0.95      0.95      0.95        21
           2       0.76      0.90      0.83        21
           3       0.91      1.00      0.95        20
           4       0.80      0.94      0.86        17
           5       0.90      0.95      0.92        19
           6       0.86      0.95      0.90        20
           7       0.86      0.86      0.86         7
           8       0.96      0.85      0.90        27
           9       1.00      0.87      0.93        15
          10       0.81      0.72      0.76        18
          11       0.77      0.89      0.83        19
          12       0.82      0.78      0.80        18
          13       1.00      0.93      0.97        15
          14       1.0

In [16]:
# Save the best model and preprocessing objects
joblib.dump(stacking_model, 'stacking_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')

['label_encoder.pkl']