# Predict credit default

- Dataset: https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients
- Inspiration for model: https://www.kaggle.com/code/mahyar511/payment-default-prediction-neural-network

# 1. Setup

In [1]:
# ! conda install keras -y
# ! conda install tensorflow -y

In [2]:
# ! conda install xlrd -y

In [3]:
# imports
import pandas as pd
import numpy as np

from sklearn.utils import class_weight
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical


In [4]:
# get the dataset from UCI ML Repository
# ! curl -o default.xls https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls

In [5]:
# load the dataset
df = pd.read_excel('data/default.xls', header=1)
df.shape

(30000, 25)

In [6]:
# examine
df.head(2)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1


In [7]:
# distribution of the target (credit default)
print(df['default payment next month'].dtypes)
df['default payment next month'].value_counts()

int64


0    23364
1     6636
Name: default payment next month, dtype: int64

In [8]:
# Calculating default Ratio
non_default = len(df[df['default payment next month']==0])
default = len(df[df['default payment next month']==1])
ratio = float(default/(non_default+default))
print('Default Ratio: ', ratio)

Default Ratio:  0.2212


## Feature engineering

In [9]:
# split into input (X) and output (y) variables
X = df.drop(['ID', 'default payment next month'], axis=1)
print(X.shape[1])

23


In [10]:
# y = to_categorical(df['default payment next month'])
y = to_categorical(df['default payment next month'])
y[:5]

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [11]:
print(X.shape, y.shape)

(30000, 23) (30000, 2)


## Train test split

In [12]:
# split into 67% for train and 33% for test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [13]:
# Standardize the predictors
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
# how many features are there? (this will be the input dimensions)
print(X_train.shape)
print(y_train.shape)

(20100, 23)
(20100, 2)


In [15]:
# how many in each category now?
print('y_train')
unique, counts = np.unique(y_train, return_counts=True)
for x in [0,1]:
    print(unique[x],counts[x])
print('y_test')
unique, counts = np.unique(y_test, return_counts=True)
for x in [0,1]:
    print(unique[x],counts[x])


y_train
0.0 20100
1.0 20100
y_test
0.0 9900
1.0 9900


# 2. Define Keras Model

In [16]:
# parameters
n_cols = X_train.shape[1]
early_stopping_monitor = EarlyStopping(patience=2)
class_weight = {0:ratio, 1:1-ratio}
print(n_cols)

23


In [17]:
# keras has two APIs: Sequential and Functional.
model = Sequential()

2022-09-01 21:47:08.179124: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
# input layer and first hidden layer
# relu = rectified linear unit
model.add(Dense(12, activation='relu', 
                input_shape = (n_cols,))
         )

In [19]:
# second hidden layer
model.add(Dense(12, activation='relu'))

In [20]:
# output layer
model.add(Dense(2, activation='softmax'))

# 3. Compile Keras Model

In [21]:
# compile the keras model
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy']
             )

# 4. Fit the model

In [40]:
# fit the keras model on the dataset

model.fit(X_train, 
          y_train, 
          validation_data=(X_test,y_test), 
          epochs=150, 
          batch_size=10,
          class_weight=class_weight,
          callbacks = [early_stopping_monitor],
          verbose = 0
         )

<tensorflow.python.keras.callbacks.History at 0x7f68395ea850>

# 5. Evaluate

In [41]:
# evaluate the keras model
_, accuracy = model.evaluate(X_test, y_test)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 73.61


In [42]:
# make probability predictions with the model (they come in pairs)
y_probs = model.predict(X_test)
y_probs[:10]

array([[0.58149236, 0.4185076 ],
       [0.8068527 , 0.19314726],
       [0.6559563 , 0.3440437 ],
       [0.65173787, 0.3482621 ],
       [0.4241021 , 0.57589793],
       [0.53040284, 0.46959725],
       [0.3386971 , 0.66130286],
       [0.4001633 , 0.5998367 ],
       [0.7540156 , 0.2459843 ],
       [0.6246552 , 0.3753448 ]], dtype=float32)

In [43]:
# express these as booleans
y_probs > 0.7

array([[False, False],
       [ True, False],
       [False, False],
       ...,
       [ True, False],
       [False, False],
       [ True, False]])

In [44]:
# make class predictions with the model
y_preds = (y_probs > 0.5).astype(int)
y_preds[:10]

array([[1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0]])

In [45]:
# check out the first ten
print(y_preds[:10].tolist())
print(y_test[:10].tolist())

[[1, 0], [1, 0], [1, 0], [1, 0], [0, 1], [1, 0], [0, 1], [0, 1], [1, 0], [1, 0]]
[[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]]


In [46]:

# Evaluate the model
print(metrics.classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.89      0.76      0.82      7742
           1       0.43      0.65      0.52      2158

   micro avg       0.74      0.74      0.74      9900
   macro avg       0.66      0.71      0.67      9900
weighted avg       0.79      0.74      0.75      9900
 samples avg       0.74      0.74      0.74      9900



# 6. Make Predictions on new data


In [47]:
new_data = X_test[[0]]
model.predict(new_data)

array([[0.58149236, 0.41850758]], dtype=float32)

# 7. Save the model
https://machinelearningmastery.com/save-load-keras-deep-learning-models/

In [30]:
model.save("credit-model-1.h5")

In [31]:
# load model
from keras.models import load_model
model2 = load_model('credit-model-1.h5')

In [32]:
# make class predictions with the model
predictions = (model.predict(X) > 0.5).astype(int)
predictions[:10]

array([[0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0]])