In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
import joblib

In [26]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [27]:
import warnings
warnings.filterwarnings("ignore")

In [28]:
db = pd.read_csv('datasets/Crop_recommendation.csv')
db.head()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


In [29]:
db.isna().sum()

N              0
P              0
K              0
temperature    0
humidity       0
ph             0
rainfall       0
label          0
dtype: int64

In [30]:
X = db.drop(columns=['label'])
y = db['label']

In [31]:
le = LabelEncoder()
y = le.fit_transform(y)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)



In [33]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [34]:
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42),
    "LightGBM": LGBMClassifier(random_state=42),
    "CatBoost": CatBoostClassifier(verbose=0, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=500, multi_class="multinomial"),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "MLP": MLPClassifier(hidden_layer_sizes=(64,32), max_iter=500, random_state=42)
}

In [40]:
selection_metric = "F1-Score"

scaler_models = {"Logistic Regression", "KNN", "MLP"}

best_score = -1.0
best_model = None
best_name = None
best_metrics = None
best_needs_scaling = False

for name, model in models.items():
    X_tr = X_train_scaled if name in scaler_models else X_train
    X_te = X_test_scaled if name in scaler_models else X_test

    model.fit(X_tr, y_train)
    preds = model.predict(X_te)

    acc = accuracy_score(y_test, preds)
    pre = precision_score(y_test, preds, average='weighted', zero_division=0)
    rec = recall_score(y_test, preds, average='weighted', zero_division=0)
    f1 = f1_score(y_test, preds, average='weighted', zero_division=0)

    if selection_metric == "Composite":
        # example composite score. Adjust weights if needed.
        score = 0.5 * f1 + 0.25 * acc + 0.25 * rec
    else:
        score = {"Accuracy": acc, "Precision": pre, "Recall": rec, "F1-Score": f1}[selection_metric]

    # tie-breaker: prefer higher F1 when scores equal
    if (score > best_score) or (score == best_score and f1 > (best_metrics or {}).get("F1-Score", -1)):
        best_score = score
        best_model = model
        best_name = name
        best_metrics = {"Accuracy": acc, "Precision": pre, "Recall": rec, "F1-Score": f1}
        best_needs_scaling = name in scaler_models

# results
print(f"Selected best model: {best_name} (score={best_score:.4f})")
print(pd.DataFrame(best_metrics, index=[best_name]).T)

# save bundle (model + meta) and encoder/scaler
joblib.dump({"model": best_model, "name": best_name, "needs_scaling": best_needs_scaling, "metrics": best_metrics},
            "models/best_model_bundle.pkl")
joblib.dump(le, "models/label_encoder.pkl")
joblib.dump(scaler, "models/scaler.pkl")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000298 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1331
[LightGBM] [Info] Number of data points in the train set: 1760, number of used features: 7
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] 

['models/scaler.pkl']