In [2]:
import pandas as pd
import numpy as np
import joblib
import warnings

# Sklearn Imports
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import randint, uniform
import xgboost as xgb

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from
# sklearn and xgboost; narrow the filter if those ever need to be seen.
warnings.filterwarnings(action='ignore')

# 1. Load Data
# ==============================================================================
# The CSV path is relative to the notebook's working directory.
file_path = 'fertlizer_recommendation_dataset.csv'

try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    # Report the missing file, then re-raise so later cells cannot run
    # against an undefined `df`.
    print(f"‚ùå CRITICAL ERROR: '{file_path}' not found.")
    raise
else:
    print(f"‚úÖ Data Loaded. Shape: {df.shape}")
    print(f"   Columns: {df.columns.tolist()}")

‚úÖ Data Loaded. Shape: (5410, 11)
   Columns: ['Temperature', 'Moisture', 'Rainfall', 'PH', 'Nitrogen', 'Phosphorous', 'Potassium', 'Carbon', 'Soil', 'Crop', 'Fertilizer']


In [3]:
# 2. Encoding (Text -> Numbers)
# ==============================================================================
# Encode into a copy so the raw frame `df` is never overwritten. Fitting the
# encoders on `df` (always the original strings) keeps this cell idempotent:
# re-running it can no longer fit the encoders on already-encoded integers,
# which would silently strip the string classes from the saved `encoders`.
encoders = {}
df_encoded = df.copy()
print("‚öôÔ∏è Encoding Labels...")

# Encode Features (Using correct column names 'Soil' and 'Crop')
# NOTE(review): LabelEncoder assigns arbitrary integer codes; the scaled codes
# are treated as ordinal distances by KNN -- confirm this is acceptable.
for col in ['Soil', 'Crop']:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df[col])
    encoders[col] = le
    print(f"   ‚úÖ Encoded {col}: {len(le.classes_)} classes")

# Encode Target (Fertilizer)
le_target = LabelEncoder()
df_encoded['Fertilizer'] = le_target.fit_transform(df['Fertilizer'])
encoders['Target'] = le_target
print(f"   ‚úÖ Encoded Target: {len(le_target.classes_)} unique fertilizers")

# 3. Feature Setup & Ordering
# ==============================================================================
# Defining X with the exact column headers from your file
# Note: Your file has 'Moisture' and 'Rainfall', but NO 'Humidity'.
X = df_encoded[['Temperature', 'Moisture', 'Rainfall', 'PH', 'Soil', 'Crop', 'Nitrogen', 'Potassium', 'Phosphorous', 'Carbon']]
y = df_encoded['Fertilizer']

# SAVE FEATURE ORDER (Crucial for API stability)
# The scaler and models are fitted on columns in exactly this order.
feature_order = X.columns.tolist()
print(f"üîí Feature Order Locked: {feature_order}")

# 4. Train/Test Split
# ==============================================================================
# Stratify on the target so every fertilizer class keeps its proportion in
# both splits; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"‚úÖ Data Split. Training on {len(X_train)} samples.")

# 5. Scaling (StandardScaler)
# ==============================================================================
print("‚öñÔ∏è Scaling Data...")
scaler = StandardScaler()

# Fit on training data ONLY so no test-set statistics leak into training
X_train_scaled = scaler.fit_transform(X_train)
# Transform test data with the statistics learned from the training split
X_test_scaled = scaler.transform(X_test)

print("‚úÖ Data Scaled.")

‚öôÔ∏è Encoding Labels...
   ‚úÖ Encoded Soil: 5 classes
   ‚úÖ Encoded Crop: 26 classes
   ‚úÖ Encoded Target: 10 unique fertilizers
üîí Feature Order Locked: ['Temperature', 'Moisture', 'Rainfall', 'PH', 'Soil', 'Crop', 'Nitrogen', 'Potassium', 'Phosphorous', 'Carbon']
‚úÖ Data Split. Training on 4328 samples.
‚öñÔ∏è Scaling Data...
‚úÖ Data Scaled.


In [4]:
# ==============================================================================
# 6. Hyperparameter Tuning Setup (RF + XGB + KNN)
# ==============================================================================
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import randint, uniform
import xgboost as xgb # Ensure you have 'pip install xgboost' run separately!

print("‚öôÔ∏è Configuring Model Architectures for 3-Model Ensemble...")


def _make_search(estimator, param_distributions, n_iter):
    """Wrap an estimator in a RandomizedSearchCV with the shared search settings.

    Every search uses 3-fold CV, accuracy scoring, all available cores and a
    fixed seed; only the estimator, its search space and the number of sampled
    candidates differ between models.
    """
    return RandomizedSearchCV(
        estimator,
        param_distributions,
        n_iter=n_iter,
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
        random_state=42,
    )


# 1. Random Forest (RF)
rf_base = RandomForestClassifier(n_jobs=-1, random_state=42)
rf_params = {
    'n_estimators': randint(low=150, high=300),      # integers in [150, 300)
    'max_depth': [15, 20, 25, None],
    'min_samples_split': randint(low=2, high=10),    # integers in [2, 10)
    'min_samples_leaf': randint(low=1, high=4)       # integers in [1, 4)
}

# 2. K-Nearest Neighbors (KNN)
knn_base = KNeighborsClassifier(n_jobs=-1)
knn_params = {
    'n_neighbors': randint(low=3, high=15),          # integers in [3, 15)
    'weights': ['distance'],
    'p': [1, 2]
}

# 3. XGBoost (XGB)
xgb_base = xgb.XGBClassifier(eval_metric='mlogloss', n_jobs=-1, random_state=42)
xgb_params = {
    'n_estimators': randint(low=150, high=300),      # integers in [150, 300)
    # scipy's uniform(loc, scale) samples from [loc, loc + scale]
    'learning_rate': uniform(loc=0.01, scale=0.2),   # [0.01, 0.21]
    'max_depth': randint(low=3, high=10),            # integers in [3, 10)
    'subsample': uniform(loc=0.7, scale=0.3)         # [0.7, 1.0]
}

# Search objects keyed by the short model name used later in the ensemble
model_searches = {
    'rf': _make_search(rf_base, rf_params, n_iter=15),
    'knn': _make_search(knn_base, knn_params, n_iter=10),
    'xgb': _make_search(xgb_base, xgb_params, n_iter=15)
}

print("‚úÖ Tuning Configuration Ready.")

‚öôÔ∏è Configuring Model Architectures for 3-Model Ensemble...
‚úÖ Tuning Configuration Ready.


In [None]:
# ==============================================================================
# 7. Train Optimized Ensemble
# ==============================================================================
def train_and_assemble(model_searches, X_train_scaled, y_train):
    """Tune each candidate model, then fit a soft-voting ensemble of the winners.

    Parameters
    ----------
    model_searches : dict
        Maps a short model name to an unfitted search object. Each object is
        fitted here and must then expose ``best_score_`` and
        ``best_estimator_``.
    X_train_scaled : array-like
        Training features, passed unchanged to every ``fit`` call.
    y_train : array-like
        Training targets, passed unchanged to every ``fit`` call.

    Returns
    -------
    VotingClassifier
        The ensemble, fitted on the same training data.
    """
    print("üöÄ Starting 3-Model Training Pipeline...")

    # Phase 1: run every search and collect its best estimator under its name.
    tuned_members = []
    for name, search in model_searches.items():
        print(f"\nüîé Tuning {name.upper()}...")
        search.fit(X_train_scaled, y_train)
        print(f"   Best Score: {search.best_score_:.4f}")
        tuned_members += [(name, search.best_estimator_)]

    # Phase 2: combine the tuned models.
    print("\nüèóÔ∏è Assembling Final Voting Ensemble...")

    # Soft voting averages the class probabilities of the member models.
    # n_jobs stays at 1: the original author reports crashes on Windows
    # when the ensemble itself is fitted in parallel.
    ensemble = VotingClassifier(estimators=tuned_members, voting='soft', n_jobs=1)

    # Single fit of the assembled ensemble on the training data.
    ensemble.fit(X_train_scaled, y_train)
    return ensemble

# Run the tune-then-ensemble pipeline on the scaled training split
final_model = train_and_assemble(
    model_searches=model_searches,
    X_train_scaled=X_train_scaled,
    y_train=y_train,
)
print("\n‚úÖ Final 3-Model Ensemble Trained.")

üöÄ Starting 3-Model Training Pipeline...

üîé Tuning RF...
   Best Score: 0.9353

üîé Tuning KNN...
   Best Score: 0.8341

üîé Tuning XGB...
   Best Score: 0.9360

üèóÔ∏è Assembling Final Voting Ensemble...

‚úÖ Final 3-Model Ensemble Trained.


In [8]:
# 8. Evaluation
# ==============================================================================
# Score the fitted ensemble on the held-out (scaled) test split.
print("üìä Final Evaluation...")
preds = final_model.predict(X_test_scaled)
acc = accuracy_score(y_test, preds)

print(f"\nüèÜ Final Test Accuracy: {acc*100:.2f}%")
target_names = encoders['Target'].classes_.astype(str)
# LabelEncoder codes are 0..n_classes-1 in the order of `classes_`. Passing
# them explicitly keeps the report rows aligned with `target_names` and avoids
# a ValueError when a class is absent from both y_test and preds.
label_ids = list(range(len(target_names)))
print(classification_report(y_test, preds, labels=label_ids, target_names=target_names))




üìä Final Evaluation...

üèÜ Final Test Accuracy: 95.75%
                            precision    recall  f1-score   support

   Balanced Npk Fertilizer       0.98      0.98      0.98        57
                   Compost       0.99      0.98      0.99       108
                       Dap       0.97      0.99      0.98       376
General Purpose Fertilizer       0.75      0.82      0.78        11
                    Gypsum       0.95      0.90      0.93        21
                      Lime       0.81      0.87      0.84        54
         Muriate Of Potash       0.97      0.96      0.97       106
        Organic Fertilizer       0.88      0.92      0.90        38
                      Urea       0.92      0.89      0.90        62
Water Retaining Fertilizer       0.98      0.95      0.96       249

                  accuracy                           0.96      1082
                 macro avg       0.92      0.93      0.92      1082
              weighted avg       0.96      0.96      0.

In [9]:
# 9. Save Artifacts (The Deployment Package)
# ==============================================================================
# Bundle the fitted ensemble, the label encoders, the scaler and the feature
# order into one dictionary.
artifacts = {}
artifacts['model'] = final_model
artifacts['encoders'] = encoders
artifacts['scaler'] = scaler
artifacts['feature_order'] = feature_order

# compress=3 is passed to joblib to write a compressed file.
joblib.dump(value=artifacts, filename='fertilizer_model_final.pkl', compress=3)
print("\nüíæ Success! All artifacts saved to 'fertilizer_model_final.pkl'")


üíæ Success! All artifacts saved to 'fertilizer_model_final.pkl'
