In [35]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

In [36]:
# Feature matrices are kept as DataFrames, read in train/test order.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/X_train.csv", "../data/X_test.csv")
)

# Label files are read the same way, then flattened to 1-D arrays.
y_train, y_test = (
    pd.read_csv(csv_path).values.ravel()
    for csv_path in ("../data/y_train.csv", "../data/y_test.csv")
)

In [37]:
# Candidate classifiers to compare, keyed by a human-readable label.
# Both tree ensembles use 200 estimators and a fixed seed of 42.
models = dict(
    LogisticRegression=LogisticRegression(max_iter=1000),
    RandomForest=RandomForestClassifier(random_state=42, n_estimators=200),
    GradientBoosting=GradientBoostingClassifier(random_state=42, n_estimators=200),
)

In [38]:
# Fit every candidate on the training split, score it by ROC-AUC on the test
# split, and keep the highest-scoring fitted estimator in `best_model`.
#
# NOTE(review): the winner is picked using the same test split whose score is
# reported, so the selected model's printed ROC-AUC is optimistically biased.
# Consider selecting on a separate validation split or via cross-validation.
best_model = None
# Start below any attainable ROC-AUC so the first scored model is always kept,
# even in the degenerate case where a score is exactly 0.
best_score = float("-inf")

for name, model in models.items():
    model.fit(X_train, y_train)
    # Column 1 of predict_proba is taken as the positive-class probability;
    # this assumes a binary classification problem — TODO confirm.
    positive_proba = model.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, positive_proba)
    print(f"{name} ROC-AUC: {score:.4f}")
    if score > best_score:
        best_score = score
        best_model = model

LogisticRegression ROC-AUC: 0.9659
RandomForest ROC-AUC: 0.9977
GradientBoosting ROC-AUC: 0.9988


In [39]:
# Serialize the selected estimator with joblib, then report its class name.
model_path = "../models/best_model.pkl"
joblib.dump(best_model, model_path)

saved_class_name = best_model.__class__.__name__
print(f"Best model saved: {saved_class_name}")

Best model saved: GradientBoostingClassifier
