In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from tensorflow import keras
from tensorflow.keras import layers, regularizers

# Load the “ready” CSV you exported in EDA notebook
df = pd.read_csv('../data/loan_data_ready.csv')




In [11]:
# Define features & target from the ready CSV
target_col = 'not.fully.paid'

# Drop raw, untransformed features
df = df.drop(columns=['days.with.cr.line', 'revol.bal'])

# Now pick up every remaining column except the target as a feature
feature_cols = [c for c in df.columns if c != target_col]

X = df[feature_cols]
y = df[target_col]

print("Features used:", feature_cols)
print("Target balance:\n", y.value_counts(normalize=True))






Features used: ['credit.policy', 'int.rate', 'installment', 'log.annual.inc', 'dti', 'fico', 'revol.util', 'inq.last.6mths', 'delinq.2yrs', 'pub.rec', 'revol_bal_log', 'cr_line_years', 'cr_line_years_log', 'pub_rec_flag', 'inquiry_rate', 'purpose_credit_card', 'purpose_debt_consolidation', 'purpose_educational', 'purpose_home_improvement', 'purpose_major_purchase', 'purpose_small_business']
Target balance:
 not.fully.paid
0    0.839946
1    0.160054
Name: proportion, dtype: float64


In [12]:
from sklearn.model_selection import train_test_split

# Use an 80/20 split, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    test_size=0.2,
    random_state=42
)

print("Train target balance:\n", y_train.value_counts(normalize=True))
print("Test  target balance:\n", y_test.value_counts(normalize=True))





Train target balance:
 not.fully.paid
0    0.83999
1    0.16001
Name: proportion, dtype: float64
Test  target balance:
 not.fully.paid
0    0.83977
1    0.16023
Name: proportion, dtype: float64


In [13]:
from imblearn.over_sampling import SMOTE
import numpy as np

# 1) Apply SMOTE on training data only
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# 2) Cast to float32 for TensorFlow
X_train_res = X_train_res.astype(np.float32)
y_train_res = y_train_res.astype(np.float32)
X_test      = X_test.astype(np.float32)
y_test      = y_test.astype(np.float32)

print("Resampled train balance:\n", y_train_res.value_counts(normalize=True))
print("Shapes:", X_train_res.shape, X_test.shape)




Resampled train balance:
 not.fully.paid
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64
Shapes: (12872, 21) (1916, 21)




In [14]:
from tensorflow import keras
from tensorflow.keras import layers, regularizers

def make_model(input_dim):
    model = keras.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(64, activation='relu',
                     kernel_regularizer=regularizers.l2(1e-4)),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu',
                     kernel_regularizer=regularizers.l2(1e-4)),
        layers.Dropout(0.2),
        layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=[keras.metrics.AUC(name='auc'),
                 keras.metrics.Recall(name='sensitivity')]
    )
    return model

input_dim = X_train_res.shape[1]
model     = make_model(input_dim)
model.summary()




Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 64)                1408      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dropout_3 (Dropout)         (None, 32)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                                 
Total params: 3,521
Trainable params: 3,521
Non-trainable params: 0
_________________________________________________________________


In [15]:
history = model.fit(
    X_train_res, y_train_res,
    validation_split=0.2,
    epochs=20,
    batch_size=256
)




Epoch 1/20


2025-05-28 14:58:07.748734: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [16]:
loss, auc, sensitivity = model.evaluate(X_test, y_test, verbose=0)
print(f"Test loss={loss:.4f}, AUC={auc:.4f}, Sensitivity={sensitivity:.4f}")



Test loss=0.5141, AUC=0.6587, Sensitivity=0.2866
