In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os

In [9]:
train_data = pd.read_csv('./cs-training.csv')
test_data = pd.read_csv('./cs-test.csv')

In [10]:
imputer = SimpleImputer(strategy = 'median')
train_data_imputed = pd.DataFrame(imputer.fit_transform(train_data))
train_data_imputed.columns = train_data.columns

In [11]:
scaler = StandardScaler()
X_train = train_data_imputed.drop('SeriousDlqin2yrs', axis=1)
y_train = train_data_imputed['SeriousDlqin2yrs']
X_train_scaled = scaler.fit_transform(X_train)

In [12]:
smote = SMOTE()
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

In [13]:
model = RandomForestClassifier(n_estimators = 100, random_state = 42)
model.fit(X_train_res,y_train_res)

In [14]:
# Step 5: Model Evaluation
y_train_pred = model.predict_proba(X_train_scaled)[:, 1]
print('Training ROC AUC Score:', roc_auc_score(y_train, y_train_pred))

Training ROC AUC Score: 1.0


In [15]:
# Confusion Matrix and Classification Report
y_train_pred_class = model.predict(X_train_scaled)
print(confusion_matrix(y_train, y_train_pred_class))
print(classification_report(y_train, y_train_pred_class))

[[139974      0]
 [     0  10026]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    139974
         1.0       1.00      1.00      1.00     10026

    accuracy                           1.00    150000
   macro avg       1.00      1.00      1.00    150000
weighted avg       1.00      1.00      1.00    150000

