In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import xgboost as xgb
import warnings

# Sklearn Imports
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.stats import randint, uniform

# Suppress warnings
warnings.filterwarnings('ignore')



In [None]:
# 1. Load Data
# ==============================================================================
# The CSV is read relative to the notebook's working directory.
file_path = 'crop_recommendation_dataset.csv'

try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    # Report the missing file, then re-raise so the notebook stops here.
    print(f"‚ùå CRITICAL ERROR: '{file_path}' not found. Run the data repair script first.")
    raise
else:
    print(f"‚úÖ Data Loaded. Shape: {df.shape}")

# 2. Safety Check (Drop classes with < 2 samples)
# ==============================================================================
# A class with a single row cannot be placed in both halves of the stratified
# split performed later, so such classes are removed up front.
class_counts = df['label'].value_counts()
rogue_classes = class_counts.loc[class_counts.lt(2)].index

if len(rogue_classes) == 0:
    print("‚úÖ Data Check Passed: All classes are valid.")
else:
    print(f"‚ö†Ô∏è Dropping {len(rogue_classes)} classes with < 2 samples.")
    df = df.loc[~df['label'].isin(rogue_classes)].copy()

In [10]:
# 3. Feature Engineering (The Accuracy Booster)
# ==============================================================================
print("üß† Engineering Biological Features...")

def add_smart_features(data):
    """Return a copy of ``data`` with derived nutrient and climate columns added.

    Reads the columns ``N``, ``P``, ``K``, ``temperature``, ``humidity`` and
    ``rainfall`` and adds, in this order: ``total_nutrients``, ``N_ratio``,
    ``P_ratio``, ``K_ratio``, ``aridity_index`` and ``water_stress``.
    The input frame itself is not modified.
    """
    # Small offset that keeps the divisions finite when a denominator is zero.
    eps = 1e-5

    # A. Nutrient Ratios: each nutrient's share of the (offset) N+P+K total.
    nutrient_sum = data['N'] + data['P'] + data['K'] + eps

    return data.assign(
        total_nutrients=nutrient_sum,
        N_ratio=data['N'] / nutrient_sum,
        P_ratio=data['P'] / nutrient_sum,
        K_ratio=data['K'] / nutrient_sum,
        # B. Climate Interactions
        aridity_index=data['rainfall'] / (data['temperature'] + eps),
        water_stress=data['temperature'] * (100 - data['humidity']),
    )

df_engineered = add_smart_features(df)

# 4. Prepare X and y
# ==============================================================================
# 'total_nutrients' is only an intermediate for the ratio columns, so it is
# left out of the model inputs together with the target column.
X = df_engineered.drop(columns=['label', 'total_nutrients'])
y = df_engineered['label']

# SAVE FEATURE ORDER (Critical for Deployment)
# The prediction helper reloads this list to put its columns in the same order.
feature_order = list(X.columns)
joblib.dump(feature_order, 'feature_order.pkl')
print(f"‚úÖ Feature Order Saved: {feature_order}")

# 5. Encoding & Splitting
# ==============================================================================
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# 80/20 split, stratified on the encoded label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# 6. Scaling
# ==============================================================================
# The scaler is fitted on the training rows only and then applied to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("‚úÖ Data Scaled & Split.")
print(f"   Training Data: {X_train_scaled.shape}")

üß† Engineering Biological Features...
‚úÖ Feature Order Saved: ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'N_ratio', 'P_ratio', 'K_ratio', 'aridity_index', 'water_stress']
‚úÖ Data Scaled & Split.
   Training Data: (24424, 12)


In [11]:
# 7. Define Hyperparameter Search Spaces
# ==============================================================================
# Each model gets a base estimator plus a dict of lists / scipy distributions
# that the randomized search in the training step samples from.
print("‚öôÔ∏è Configuring Architectures...")

# Random Forest
rf_base = RandomForestClassifier(n_jobs=-1, random_state=42)
rf_params = dict(
    n_estimators=randint(200, 500),      # integers 200..499 (upper bound excluded)
    max_depth=[20, 25, 30, None],
    min_samples_leaf=randint(1, 4),      # integers 1..3
    max_features=['sqrt', 'log2'],
)

# KNN
knn_base = KNeighborsClassifier(n_jobs=-1)
knn_params = dict(
    n_neighbors=randint(5, 15),          # integers 5..14
    weights=['distance'],
    p=[1, 2],
)

# XGBoost
xgb_base = xgb.XGBClassifier(eval_metric='mlogloss', n_jobs=-1, random_state=42)
xgb_params = dict(
    n_estimators=randint(200, 400),      # integers 200..399
    learning_rate=uniform(0.01, 0.15),   # uniform(loc, scale): 0.01 .. 0.16
    max_depth=randint(5, 10),            # integers 5..9
    subsample=uniform(0.7, 0.3),         # uniform(loc, scale): 0.7 .. 1.0
)

print("‚úÖ Configuration Ready.")

‚öôÔ∏è Configuring Architectures...
‚úÖ Configuration Ready.


In [12]:
# 8. Training Pipeline
# ==============================================================================
def train_stacking_model(X_train, y_train, *, cv=3, random_state=42, final_estimator=None):
    """Tune the three base classifiers, then fit a stacking ensemble on them.

    Phase 1 runs one ``RandomizedSearchCV`` per base model, using the
    module-level ``rf_base``/``rf_params``, ``knn_base``/``knn_params`` and
    ``xgb_base``/``xgb_params`` objects. Phase 2 feeds the best estimator of
    each search into a ``StackingClassifier`` and fits it on the same data.

    Parameters
    ----------
    X_train, y_train
        Training features and targets; passed unchanged to every ``fit`` call.
    cv : int, default 3
        Number of folds used by each randomized search and by the stacker.
    random_state : int, default 42
        Seed for the parameter sampling of each randomized search.
    final_estimator : estimator or None, default None
        Meta-learner for the stack. ``None`` means a default
        ``LogisticRegression()``.

    Returns
    -------
    StackingClassifier
        The fitted stacking model.
    """
    print("üöÄ Starting Training Pipeline (This takes time)...")

    # (stack name, display title, base estimator, search space, n_iter)
    search_plan = [
        ('rf', 'Random Forest', rf_base, rf_params, 10),
        ('knn', 'KNN', knn_base, knn_params, 5),
        ('xgb', 'XGBoost', xgb_base, xgb_params, 10),
    ]

    # Phase 1: Optimize Base Models
    searches = {}
    for key, title, base, space, n_iter in search_plan:
        print(f"\nüîé Tuning {title}...")
        search = RandomizedSearchCV(
            base, space, n_iter=n_iter, cv=cv, n_jobs=-1, random_state=random_state
        )
        search.fit(X_train, y_train)
        searches[key] = search

    # Report the mean cross-validated score of each winning configuration.
    print()
    for key, search in searches.items():
        print(f"   {key.upper()} Best: {search.best_score_:.4f}")

    # Phase 2: Stacking (The "Boss" Model)
    print("\nüèóÔ∏è Training Stacking Meta-Model...")
    if final_estimator is None:
        # NOTE(review): LogisticRegression's default iteration cap may be hit
        # here, and the notebook silences warnings globally -- confirm the
        # meta-learner converges, or pass a configured estimator.
        final_estimator = LogisticRegression()

    stacking_clf = StackingClassifier(
        estimators=[(key, search.best_estimator_) for key, search in searches.items()],
        final_estimator=final_estimator,
        cv=cv,
        n_jobs=1  # <--- CRITICAL FIX FOR WINDOWS CRASH
    )

    stacking_clf.fit(X_train, y_train)
    return stacking_clf

# Execute: fit the whole pipeline on the scaled training split.
final_model = train_stacking_model(X_train=X_train_scaled, y_train=y_train)
print("\n‚úÖ Final Stacking Model Trained.")

üöÄ Starting Training Pipeline (This takes time)...

üîé Tuning Random Forest...

üîé Tuning KNN...

üîé Tuning XGBoost...

   RF Best: 0.9041
   KNN Best: 0.8760
   XGB Best: 0.8997

üèóÔ∏è Training Stacking Meta-Model...

‚úÖ Final Stacking Model Trained.


In [13]:
# 9. Evaluation
# ==============================================================================
# Score the fitted stack on the held-out, scaled test split.
print("üìä Evaluating on Test Set...")
y_pred = final_model.predict(X_test_scaled)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

# Per-class metrics, labelled with the original class names from the encoder.
report_text = classification_report(y_test, y_pred, target_names=le.classes_)

print(f"\nüèÜ Final Test Accuracy: {acc*100:.2f}%")
print("-" * 30)
print(report_text)

üìä Evaluating on Test Set...

üèÜ Final Test Accuracy: 93.71%
------------------------------
              precision    recall  f1-score   support

      almond       0.83      0.88      0.85        40
       apple       0.96      0.62      0.76        40
     apricot       0.91      0.97      0.94        40
   asparagus       0.97      0.95      0.96        40
      banana       0.99      1.00      1.00       147
      barley       0.95      0.89      0.92       101
    beetroot       0.88      0.90      0.89        40
 bell_pepper       0.86      0.93      0.89        40
 bittergourd       1.00      1.00      1.00        40
  black_gram       0.99      0.99      0.99       102
  blackberry       0.93      0.93      0.93        40
   blueberry       1.00      0.97      0.99        40
 bottlegourd       1.00      1.00      1.00        40
    broccoli       0.80      0.97      0.88        40
     cabbage       0.89      0.80      0.84        40
      carrot       0.88      0.93      

In [14]:
# 10. Save Artifacts
# ==============================================================================
# Persist the fitted model, scaler and label encoder next to the notebook.
# feature_order.pkl is not written here: it was already saved in step 4
# (Prepare X and y).
for _artifact, _filename in (
    (final_model, 'crop_model_final.pkl'),
    (scaler, 'scaler_final.pkl'),
    (le, 'label_encoder_final.pkl'),
):
    joblib.dump(_artifact, _filename)

print("üíæ Success! All 4 files saved (Model, Scaler, Encoder, FeatureOrder).")

# 11. Test Prediction Function (Deployment Ready)
# ==============================================================================
def test_prediction(N, P, K, temp, hum, ph, rain):
    """Predict a crop for one set of soil/climate readings using the saved artifacts.

    Loads the model, scaler, label encoder and feature order from the files
    written earlier in this notebook, builds a one-row frame, applies the same
    feature engineering as training, scales it and returns the most probable
    class.

    Parameters
    ----------
    N, P, K, temp, hum, ph, rain
        Raw readings, mapped to the columns ``N``, ``P``, ``K``,
        ``temperature``, ``humidity``, ``ph`` and ``rainfall``.

    Returns
    -------
    tuple
        ``(label, conf)``: the decoded class label and its predicted
        probability expressed as a percentage.
    """
    # Load. joblib.load unpickles the files, so only load trusted artifacts.
    model = joblib.load('crop_model_final.pkl')
    sc = joblib.load('scaler_final.pkl')
    enc = joblib.load('label_encoder_final.pkl')
    cols = joblib.load('feature_order.pkl')

    # Create DataFrame
    raw = pd.DataFrame(
        [[N, P, K, temp, hum, ph, rain]],
        columns=['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall'],
    )

    # Feature Engineering: reuse the training helper so the two paths cannot
    # drift apart. Selecting `cols` drops the helper's intermediate
    # 'total_nutrients' column and enforces the training column order.
    data = add_smart_features(raw)[cols]

    # Scale & Predict
    scaled = sc.transform(data)
    # predict_proba returns one row per sample; take the single row explicitly
    # rather than relying on argmax over the flattened 2-D array.
    class_probs = model.predict_proba(scaled)[0]
    best_idx = int(np.argmax(class_probs))
    # Probability columns follow model.classes_, so translate the column index
    # to the encoded label before decoding it.
    label = enc.inverse_transform([model.classes_[best_idx]])[0]
    conf = class_probs[best_idx] * 100

    return label, conf



üíæ Success! All 4 files saved (Model, Scaler, Encoder, FeatureOrder).


In [15]:
# Test: run the saved-artifact prediction path once on a hand-picked sample.
pred, conf = test_prediction(N=80, P=40, K=40, temp=25, hum=80, ph=7, rain=250)
print("\nüß™ Test Prediction (Rice Conditions):")
print(f"   Result: {pred} ({conf:.2f}%)")


üß™ Test Prediction (Rice Conditions):
   Result: rice (53.23%)


In [16]:
import joblib
# Reload the uncompressed model file, then overwrite it with a compressed copy.
MODEL_FILE = 'crop_model_final.pkl'

print("‚è≥ Loading fat model...")
model = joblib.load(MODEL_FILE)

# compress=3 trades a little save/load time for a smaller file on disk.
print("üíæ Compressing and re-saving...")
joblib.dump(model, MODEL_FILE, compress=3)

print("‚úÖ Done! Check the file size now.")

‚è≥ Loading fat model...
üíæ Compressing and re-saving...
‚úÖ Done! Check the file size now.
