In [9]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

In [10]:
# Load the CSV (its file name suggests it holds the already-selected features)
# and separate the "target" column from the remaining predictor columns.
df = pd.read_csv("../data/03_data_selected_features.csv")
y = df["target"]
X = df.drop(columns="target")

In [11]:
# Hold out 20% of the rows for final evaluation. The split is seeded so it is
# repeatable, and stratified on y so both partitions keep the class balance.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [12]:
# Tune a regularised logistic regression with an exhaustive grid search.
# The liblinear solver accepts both the l1 and l2 penalties listed in the grid.
# random_state is fixed because liblinear shuffles the data internally; without
# a seed, repeated runs of this cell are not guaranteed to give the same fit.
log_reg = LogisticRegression(max_iter=5000, solver="liblinear", random_state=42)

# C is the inverse regularisation strength; each value is tried with both penalties.
param_grid_log = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l1", "l2"]
}

# 5-fold cross-validation on the training split only, ranked by ROC-AUC.
grid_log = GridSearchCV(log_reg, param_grid_log, cv=5, scoring="roc_auc")
grid_log.fit(X_train, y_train)

print("Best Logistic Regression Params:", grid_log.best_params_)
print("Best Logistic Regression AUC:", grid_log.best_score_)

Best Logistic Regression Params: {'C': 10, 'penalty': 'l1'}
Best Logistic Regression AUC: 0.8959139893922503


In [13]:
# Candidate values for each random-forest hyperparameter; the randomized
# search below samples combinations from these lists.
param_dist_rf = {
    "n_estimators": [50, 100, 200, 300, 500],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False],
}

# Seeded base estimator so every sampled configuration is fitted repeatably.
rf = RandomForestClassifier(random_state=42)

# 20 sampled configurations, 5-fold CV each, ranked by ROC-AUC, on all cores.
search_settings = dict(
    n_iter=20,
    cv=5,
    scoring="roc_auc",
    random_state=42,
    n_jobs=-1,
)
rand_rf = RandomizedSearchCV(rf, param_distributions=param_dist_rf, **search_settings)
rand_rf.fit(X_train, y_train)

print("Best Random Forest Params:", rand_rf.best_params_)
print("Best Random Forest AUC:", rand_rf.best_score_)


Best Random Forest Params: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 5, 'bootstrap': True}
Best Random Forest AUC: 0.896495050842877


In [14]:
# Pull the refitted winners out of both searches and score them on the
# held-out test split.
best_log = grid_log.best_estimator_
best_rf = rand_rf.best_estimator_

models = {
    "Logistic Regression (tuned)": best_log,
    "Random Forest (tuned)": best_rf,
}

# Metrics computed from hard class predictions, in the order they are printed.
label_metrics = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1:", f1_score),
)

for name, model in models.items():
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the score for the second class in
    # model.classes_; ROC-AUC needs this score rather than the hard labels.
    y_proba = model.predict_proba(X_test)[:, 1]

    print(f"\n{name} Performance on Test Set:")
    for label, metric in label_metrics:
        print(label, metric(y_test, y_pred))
    print("ROC-AUC:", roc_auc_score(y_test, y_proba))


Logistic Regression (tuned) Performance on Test Set:
Accuracy: 0.8524590163934426
Precision: 0.8064516129032258
Recall: 0.8928571428571429
F1: 0.847457627118644
ROC-AUC: 0.9318181818181819

Random Forest (tuned) Performance on Test Set:
Accuracy: 0.9016393442622951
Precision: 0.8666666666666667
Recall: 0.9285714285714286
F1: 0.896551724137931
ROC-AUC: 0.9545454545454546


In [15]:
# Persist the tuned random forest selected by the randomized search.
final_model = rand_rf.best_estimator_

# Create the output folder first so the dump does not fail with
# FileNotFoundError on a checkout where ../models does not exist yet.
model_path = Path("../models/final_model.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(final_model, model_path)

print("✅ Final model saved as models/final_model.pkl")


✅ Final model saved as models/final_model.pkl
