In [2]:
#Import Libraries & Load Split Files

import pandas as pd

# Feature matrices for the train / validation / test splits, read in that order.
feature_paths = ["data/X_train.csv", "data/X_val.csv", "data/X_test.csv"]
X_train, X_val, X_test = map(pd.read_csv, feature_paths)

# Target frames for the same three splits, read in the same order.
target_paths = ["data/y_train.csv", "data/y_val.csv", "data/y_test.csv"]
y_train, y_val, y_test = map(pd.read_csv, target_paths)


In [3]:
# Display (rows, columns) for each feature split as a tuple of shape tuples.
tuple(split.shape for split in (X_train, X_val, X_test))


((13999, 18), (3000, 18), (3000, 18))

In [4]:
#Identify Numeric & Categorical Columns

# Split the feature columns by dtype: numeric columns are scaled and
# object/category columns are one-hot encoded by the preprocessing step.
num_cols = X_train.select_dtypes(include='number').columns.tolist()
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# A column whose dtype matches neither selector (e.g. bool or datetime) would
# be in neither list, and a ColumnTransformer built from these lists drops
# unlisted columns by default. Fail loudly instead of silently losing features.
unassigned_cols = [
    col for col in X_train.columns
    if col not in num_cols and col not in cat_cols
]
assert not unassigned_cols, f"Columns with unhandled dtypes: {unassigned_cols}"

num_cols, cat_cols


(['person_age',
  'person_income',
  'person_emp_exp',
  'loan_amnt',
  'loan_int_rate',
  'loan_percent_income',
  'cb_person_cred_hist_length',
  'credit_score',
  'previous_loan_defaults_on_file',
  'monthly_income',
  'monthly_payment_est',
  'payment_to_income'],
 ['person_gender',
  'person_education',
  'person_home_ownership',
  'loan_intent',
  'age_bucket',
  'credit_bucket'])

In [5]:
#Create Preprocessing Pipeline

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric columns are standardised; categorical columns are one-hot encoded,
# with categories unseen during fit ignored at transform time.
numeric_step = ("num", StandardScaler(), num_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


In [6]:
#Build Baseline Logistic Regression Model

from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Baseline model: preprocessing followed by logistic regression.
# Pipeline.fit fits the step objects it is given (no copy is made when
# `memory` is unset), so this pipeline gets its own unfitted copy of
# `preprocessor`. Otherwise any other pipeline built from the same object
# would refit this model's transformer when it is trained.
log_reg_model = Pipeline([
    ("preprocess", clone(preprocessor)),
    ("model", LogisticRegression(max_iter=1000))
])


In [7]:
# Flatten the single-column target frame to a 1-D array before fitting.
log_reg_model.fit(X_train, y_train.to_numpy().ravel())


In [8]:
#Evaluate Logistic Regression

from sklearn.metrics import classification_report, roc_auc_score

# Score the validation split: the second predict_proba column is used as the
# positive-class score for AUC, and predict gives the hard labels.
y_proba = log_reg_model.predict_proba(X_val)[:, 1]
y_pred = log_reg_model.predict(X_val)

log_reg_report = classification_report(y_val, y_pred)
log_reg_auc = roc_auc_score(y_val, y_proba)

print(log_reg_report)
print("AUC:", log_reg_auc)


              precision    recall  f1-score   support

           0       0.94      0.94      0.94      2341
           1       0.78      0.78      0.78       659

    accuracy                           0.91      3000
   macro avg       0.86      0.86      0.86      3000
weighted avg       0.91      0.91      0.91      3000

AUC: 0.9607699133802073


In [9]:
#Build a Random Forest Model (Tree-Based)

from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier

# Tree-based model: preprocessing followed by a 200-tree random forest with a
# fixed seed for reproducible results.
# Pipeline.fit fits the step objects it is given (no copy is made when
# `memory` is unset), so use an unfitted copy of `preprocessor` here. Fitting
# this pipeline then cannot refit the transformer held by any other pipeline
# built from the same object.
rf_model = Pipeline([
    ("preprocess", clone(preprocessor)),
    ("model", RandomForestClassifier(n_estimators=200, random_state=42))
])


In [10]:
# Flatten the single-column target frame to a 1-D array before fitting.
rf_model.fit(X_train, y_train.to_numpy().ravel())

In [11]:
# Score the validation split with the random forest: the second predict_proba
# column is used as the positive-class score for AUC.
y_proba_rf = rf_model.predict_proba(X_val)[:, 1]
y_pred_rf = rf_model.predict(X_val)

rf_report = classification_report(y_val, y_pred_rf)
rf_auc = roc_auc_score(y_val, y_proba_rf)

print(rf_report)
print("AUC:", rf_auc)


              precision    recall  f1-score   support

           0       0.95      0.98      0.96      2341
           1       0.91      0.81      0.86       659

    accuracy                           0.94      3000
   macro avg       0.93      0.89      0.91      3000
weighted avg       0.94      0.94      0.94      3000

AUC: 0.976716109673894


In [None]:
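| Model               | AUC   | Accuracy | Recall (weighted avg) | Precision (weighted avg) | Recall (class 1) | Precision (class 1) |
| ------------------- | ----- | -------- | --------------------- | ------------------------ | ---------------- | ------------------- |
| Logistic Regression | 0.961 | 0.91     | 0.91                  | 0.91                     | 0.78             | 0.78                |
| Random Forest       | 0.977 | 0.94     | 0.94                  | 0.94                     | 0.81             | 0.91                |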


In [13]:
#save best model

import os
import joblib

# joblib.dump does not create missing directories; make sure the output
# folder exists so this cell also works on a fresh checkout.
os.makedirs("models", exist_ok=True)

# Persist the fitted random-forest pipeline (preprocessing + model) as the
# best model.
joblib.dump(rf_model, "models/day4_best_model.pkl")


['models/day4_best_model.pkl']

In [16]:
import os

# os.listdir returns entries in arbitrary order; sort for a reproducible display.
# NOTE(review): the execution counts show this cell ran after the cell below it
# that writes rf_model.pkl and log_reg_model.pkl. On a top-to-bottom run those
# files are listed only if they already exist on disk.
sorted(os.listdir("models"))


['day4_best_model.pkl', 'log_reg_model.pkl', 'rf_model.pkl']

In [15]:
import os
import joblib

# joblib.dump does not create missing directories; make sure the output
# folder exists so this cell also works on a fresh checkout.
os.makedirs("models", exist_ok=True)

# Persist both fitted pipelines (preprocessing + model) under their own names.
joblib.dump(rf_model, "models/rf_model.pkl")
joblib.dump(log_reg_model, "models/log_reg_model.pkl")

['models/log_reg_model.pkl']