In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import joblib
from pathlib import Path
import sys
sys.path.append("..")

In [11]:

# Input / output locations, expressed relative to the notebook's working directory.
# FINAL_TRAINING_PATH: CSV that the next cell loads as the modelling table.
# MODEL_PATH: directory the fitted estimators are written to.
FINAL_TRAINING_PATH = Path("..", "data", "processed", "final_training_data.csv")
MODEL_PATH = Path("..", "models")

In [3]:
# Load the modelling table from disk; later cells read the "CustomerId" and
# "is_high_risk" columns from it.
df = pd.read_csv(filepath_or_buffer=FINAL_TRAINING_PATH)

In [4]:
# Columns that must not be fed to the models: the target itself and CustomerId
# (presumably a row identifier rather than a predictor -- confirm).
non_feature_columns = ["CustomerId", "is_high_risk"]

# Target vector, then the feature matrix made of every remaining column.
y = df["is_high_risk"]
X = df.drop(columns=non_feature_columns)

In [5]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [6]:
# Candidate estimators, keyed by the display name used for logging, for the
# metrics table and (lower-cased, spaces -> underscores) for the saved file name.
#
# Both estimators are seeded: the grid search pins the logistic regression to the
# "liblinear" solver, which shuffles the data internally, so without a
# random_state its fits are not guaranteed to be repeatable between runs.
#
# n_estimators=200 is only the starting value for the forest; the grid search
# overrides it with the values listed in rf_param_grid.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
}

In [7]:
# Hyper-parameter grid searched for the logistic regression:
#   C       -- inverse regularisation strength
#   penalty -- L1 vs. L2 regularisation
#   solver  -- fixed to liblinear, which accepts both penalties above
log_reg_param_grid = dict(
    C=[0.1, 1, 10],
    penalty=["l1", "l2"],
    solver=["liblinear"],
)

In [8]:
# Hyper-parameter grid searched for the random forest (3 x 3 x 3 = 27 combinations):
# number of trees, maximum tree depth, and minimum samples needed to split a node.
rf_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, 20],
    min_samples_split=[2, 5, 10],
)

In [9]:
# Accumulator filled by the training loop: model display name -> {metric name: value}.
metrics = dict()

In [12]:
# Tune, evaluate and persist every candidate model.

# Explicit model-name -> grid mapping. A model added to `models` without a grid
# now fails immediately with a KeyError instead of silently being tuned with the
# random-forest grid (which is what the previous if/else fallback did).
param_grids = {
    "Logistic Regression": log_reg_param_grid,
    "Random Forest": rf_param_grid,
}

# joblib.dump does not create missing directories. Make sure the output folder
# exists *before* the expensive grid searches run, so a fresh checkout does not
# fail at the very last step.
MODEL_PATH.mkdir(parents=True, exist_ok=True)

for name, model in models.items():
    print(f"Training {name}...")

    # 5-fold cross-validated grid search, fitted on the training split only so the
    # test split stays unseen during model selection.
    # NOTE(review): candidates are ranked by accuracy; if is_high_risk is
    # imbalanced, "f1" or "roc_auc" may be a better selection metric -- confirm.
    grid_search = GridSearchCV(model, param_grids[name], cv=5, scoring="accuracy", n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # With GridSearchCV's default refit=True this is the best parameter
    # combination refitted on the whole training split.
    best_model = grid_search.best_estimator_

    # Hard labels for the threshold-based metrics, scores for ROC AUC.
    # Column 1 of predict_proba is the second entry of best_model.classes_
    # (presumably the high-risk class for a 0/1 target -- confirm).
    y_pred = best_model.predict(X_test)
    y_prob = best_model.predict_proba(X_test)[:, 1]

    model_metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_prob),
    }
    metrics[name] = model_metrics

    print(f"Metrics for {name}:")
    for metric, value in model_metrics.items():
        print(f"{metric}: {value:.4f}")

    # Persist the tuned estimator, e.g. "Random Forest" -> random_forest.joblib.
    model_filename = MODEL_PATH / f"{name.replace(' ', '_').lower()}.joblib"
    joblib.dump(best_model, model_filename)
    print(f"{name} model saved as {model_filename}")

Training Logistic Regression...




Metrics for Logistic Regression:
accuracy: 0.9279
precision: 0.9394
recall: 0.8671
f1: 0.9018
roc_auc: 0.9590
Logistic Regression model saved as ..\models\logistic_regression.joblib
Training Random Forest...
Metrics for Random Forest:
accuracy: 0.9733
precision: 0.9854
recall: 0.9441
f1: 0.9643
roc_auc: 0.9968
Random Forest model saved as ..\models\random_forest.joblib


In [13]:

# Side-by-side comparison table: one column per model, one row per metric.
metrics_df = pd.DataFrame.from_dict(metrics, orient="columns")
print("\nAll Metrics:", metrics_df, sep="\n")


All Metrics:
           Logistic Regression  Random Forest
accuracy              0.927904       0.973298
precision             0.939394       0.985401
recall                0.867133       0.944056
f1                    0.901818       0.964286
roc_auc               0.958986       0.996828
