In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import os

In [2]:
# --- Configuration ---
MODEL_FILE = 'fertilizer_model.pkl'
ENCODER_FILE = 'label_encoder.pkl'
CATEGORIES_FILE = 'categories.pkl'

In [3]:
# --- 1. Load Data ---
try:
    df = pd.read_csv("D:\\soilproject\\dataset\\data_core.csv")
    print("Data loaded successfully.")
except FileNotFoundError:
    print("Error: 'data_core.csv' not found. Please ensure the file is in the correct directory.")
    exit()

Data loaded successfully.


In [4]:
# Rename columns for simpler access
df.columns = df.columns.str.strip().str.replace(' ', '_')

In [5]:
# --- 2. Data Preprocessing and Feature Engineering ---
X = df.drop('Fertilizer_Name', axis=1)
y_raw = df['Fertilizer_Name']

In [6]:
# 2a. Encode the target variable (Fertilizer Name)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)
target_names = label_encoder.classes_

In [7]:
print(f"\nUnique Fertilizer Types: {target_names}")
print(f"Total samples: {len(df)}")


Unique Fertilizer Types: ['10-26-26' '14-35-14' '17-17-17' '20-20' '28-28' 'DAP' 'Urea']
Total samples: 8000


In [8]:
# 2b. Define features
numerical_features = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']
categorical_features = ['Soil_Type', 'Crop_Type']

In [9]:
# Extract unique categories for Flask App
soil_types = df['Soil_Type'].unique().tolist()
crop_types = df['Crop_Type'].unique().tolist()
print(f"Unique Soil Types: {soil_types}")
print(f"Unique Crop Types: {crop_types}")

Unique Soil Types: ['Sandy', 'Loamy', 'Black', 'Red', 'Clayey']
Unique Crop Types: ['Maize', 'Sugarcane', 'Cotton', 'Tobacco', 'Paddy', 'Barley', 'Wheat', 'Millets', 'Oil seeds', 'Pulses', 'Ground Nuts']


In [10]:
# 2c. Create a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ],
    remainder='passthrough'
)

In [11]:
# --- 3. Split Data ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")

Training set size: 6400 samples
Testing set size: 1600 samples


In [12]:
# --- 4. Model Selection and Hyperparameter Tuning ---
best_model = None
best_accuracy = 0.0
best_model_name = ""

In [13]:
# Define models and their hyperparameter grids
models_to_tune = {
    'RandomForest': {
        'model': RandomForestClassifier(random_state=42, class_weight='balanced'),
        'param_grid': {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__max_depth': [10, 20, None],
            'classifier__min_samples_split': [2, 5],
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'param_grid': {
            'classifier__n_estimators': [100, 200],
            'classifier__learning_rate': [0.05, 0.1, 0.2],
            'classifier__max_depth': [3, 5],
        }
    }
}

In [14]:
print("\nStarting Hyperparameter Tuning and Model Comparison...")


Starting Hyperparameter Tuning and Model Comparison...


In [15]:
for name, config in models_to_tune.items():
    print(f"\n--- Tuning {name} ---")

    # Create a pipeline specific to the current model
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', config['model'])
    ])

    # Use GridSearchCV for hyperparameter optimization
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=config['param_grid'],
        cv=5, # 5-fold cross-validation
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train, y_train)

    # Evaluate the best model found by the grid search on the test set
    tuned_pipeline = grid_search.best_estimator_
    y_pred = tuned_pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"\n{name} Best Parameters: {grid_search.best_params_}")
    print(f"{name} Test Accuracy: {accuracy:.4f}")

    # Check if this is the best model so far
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = tuned_pipeline
        best_model_name = name
        print(f"*** {name} is the new best model! ***")


--- Tuning RandomForest ---
Fitting 5 folds for each of 18 candidates, totalling 90 fits

RandomForest Best Parameters: {'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
RandomForest Test Accuracy: 0.1419
*** RandomForest is the new best model! ***

--- Tuning GradientBoosting ---
Fitting 5 folds for each of 12 candidates, totalling 60 fits

GradientBoosting Best Parameters: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 5, 'classifier__n_estimators': 200}
GradientBoosting Test Accuracy: 0.1369


In [16]:
# --- 5. Final Evaluation and Conclusion ---
print("\n--- Model Selection Complete ---")
print(f"The best performing model is: {best_model_name} with an Accuracy of {best_accuracy:.4f}")

if best_accuracy > 0.90:
    print("\n✅ Success! Target accuracy of >90% achieved!")
else:
    print(f"\n⚠️ Target accuracy of >90% not met. Current best is {best_accuracy:.4f}. More data or feature engineering may be required.")


--- Model Selection Complete ---
The best performing model is: RandomForest with an Accuracy of 0.1419

⚠️ Target accuracy of >90% not met. Current best is 0.1419. More data or feature engineering may be required.


In [17]:
# Print detailed report for the best model
y_pred_final = best_model.predict(X_test)
print("\nClassification Report for the Best Model:")
print(classification_report(y_test, y_pred_final, target_names=target_names))


Classification Report for the Best Model:
              precision    recall  f1-score   support

    10-26-26       0.16      0.15      0.15       226
    14-35-14       0.14      0.16      0.14       238
    17-17-17       0.12      0.12      0.12       225
       20-20       0.15      0.14      0.15       220
       28-28       0.15      0.15      0.15       224
         DAP       0.15      0.13      0.14       233
        Urea       0.12      0.14      0.13       234

    accuracy                           0.14      1600
   macro avg       0.14      0.14      0.14      1600
weighted avg       0.14      0.14      0.14      1600



In [18]:
# --- 6. Save Best Model and Encoder ---
joblib.dump(best_model, MODEL_FILE)
joblib.dump(label_encoder, ENCODER_FILE)

['label_encoder.pkl']

In [19]:
# Save categories for the Flask App
categories_data = {
    'soil_types': soil_types,
    'crop_types': crop_types
}
joblib.dump(categories_data, CATEGORIES_FILE)

print(f"\nBest Model ({best_model_name}) saved to: {MODEL_FILE}")
print(f"Label Encoder saved to: {ENCODER_FILE}")
print(f"Categories saved to: {CATEGORIES_FILE}")


Best Model (RandomForest) saved to: fertilizer_model.pkl
Label Encoder saved to: label_encoder.pkl
Categories saved to: categories.pkl
