In [20]:
# Cell 1 - Imports
import pandas as pd
import numpy as np
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


In [21]:
# Cell 2 - Load preprocessed data
X_train = pd.read_csv("../data/X_train_scaled.csv")
X_test = pd.read_csv("../data/X_test_scaled.csv")
y_train = pd.read_csv("../data/y_train.csv").values.ravel()
y_test = pd.read_csv("../data/y_test.csv").values.ravel()

print("✅ Data loaded successfully")
print("   X_train:", X_train.shape)
print("   X_test:", X_test.shape)


✅ Data loaded successfully
   X_train: (4000, 29)
   X_test: (1000, 29)


In [22]:
# Cell 3 - Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
}


In [23]:
# Cell 4 - Train & evaluate models
results = []

for name, model in models.items():
    print(f"🚀 Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results.append({
        "Model": name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1-score": f1
    })

    print(f"✅ {name} done. F1-score: {f1:.4f}")


🚀 Training Logistic Regression...
✅ Logistic Regression done. F1-score: 0.8895
🚀 Training Random Forest...
✅ Random Forest done. F1-score: 0.9831
🚀 Training Gradient Boosting...
✅ Gradient Boosting done. F1-score: 0.9871
🚀 Training XGBoost...
✅ XGBoost done. F1-score: 0.9960


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [24]:
# Cell 5 - Compare results
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="F1-score", ascending=False)
results_df.reset_index(drop=True, inplace=True)

print("\n📊 Model Comparison:")
display(results_df)



📊 Model Comparison:


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score
0,XGBoost,0.996,1.0,0.992048,0.996008
1,Gradient Boosting,0.987,0.988048,0.986083,0.987065
2,Random Forest,0.983,0.986,0.980119,0.983051
3,Logistic Regression,0.887,0.875,0.904573,0.889541


In [25]:
# Cell 6 - Save the best model
best_model_name = results_df.iloc[0]["Model"]
best_model = models[best_model_name]

joblib.dump(best_model, "../models/best_model.pkl")

print(f"🏆 Best model: {best_model_name} saved at ../models/best_model.pkl")


🏆 Best model: XGBoost saved at ../models/best_model.pkl


In [26]:
# Cell 7 - Detailed classification report for best model
best_model.fit(X_train, y_train)
y_pred_best = best_model.predict(X_test)

print("\n📑 Classification Report:")
print(classification_report(y_test, y_pred_best))



📑 Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       497
           1       1.00      0.99      1.00       503

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
