In [1]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from sklearn.svm import SVC
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Read the undersampled dataset from 'uci_undersampled.xlsx'
data = pd.read_excel("uci_undersampled.xlsx", engine='openpyxl')


In [2]:
data = data.drop('SEX', axis=1)

# Splitting the data into features (X) and target (y)
X = data.drop("default", axis=1)
y = data["default"]

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:
log_reg = LogisticRegression(max_iter=100)
start_time = time.time()
log_reg.fit(X_train, y_train)
elapsed_time = time.time() - start_time

y_pred_proba = log_reg.predict_proba(X_test)[:, 1]
y_pred = log_reg.predict(X_test)
# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)


print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1)
print('AUC: ', auc)
print(f'  Elapsed Time: {elapsed_time:.2f} seconds')  


Accuracy:  0.6103500761035008
Precision:  0.6078286558345642
Recall:  0.6253799392097265
F1 Score:  0.6164794007490637
AUC:  0.6416240779524057
  Elapsed Time: 0.05 seconds


In [4]:
grad_boost = GradientBoostingClassifier()
start_time = time.time()
grad_boost.fit(X_train, y_train)
elapsed_time = time.time() - start_time

y_pred_grad_boost = grad_boost.predict(X_test)
y_pred_proba_grad_boost = grad_boost.predict_proba(X_test)[:, 1]

# Calculate evaluation metrics
accuracy_grad_boost = accuracy_score(y_test, y_pred_grad_boost)
precision_grad_boost = precision_score(y_test, y_pred_grad_boost)
recall_grad_boost = recall_score(y_test, y_pred_grad_boost)
f1_grad_boost = f1_score(y_test, y_pred_grad_boost)
auc_grad_boost = roc_auc_score(y_test, y_pred_proba_grad_boost)

print("Gradient Boosting accuracy: ", accuracy_grad_boost)
print("Gradient Boosting precision: ", precision_grad_boost)
print("Gradient Boosting recall: ", recall_grad_boost)
print("Gradient Boosting F1 Score: ", f1_grad_boost)
print("Gradient Boosting AUC: ", auc_grad_boost)
print(f"Gradient Boosting Elapsed Time: {elapsed_time:.2f} seconds")


Gradient Boosting accuracy:  0.7039573820395738
Gradient Boosting precision:  0.740608228980322
Gradient Boosting recall:  0.6291793313069909
Gradient Boosting F1 Score:  0.6803615447822514
Gradient Boosting AUC:  0.7703670004262733
Gradient Boosting Elapsed Time: 2.79 seconds


In [5]:
# Convert the target variable to categorical
y_train_cat = to_categorical(y_train)
y_test_cat = to_categorical(y_test)

# Build the deep learning model
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
start_time = time.time()
model.fit(X_train, y_train_cat, epochs=10, batch_size=32, verbose=1)
elapsed_time = time.time() - start_time

# Predict on the test set
y_pred_dl = np.argmax(model.predict(X_test), axis=-1)

# Calculate evaluation metrics
accuracy_dl = accuracy_score(y_test, y_pred_dl)
precision_dl = precision_score(y_test, y_pred_dl)
recall_dl = recall_score(y_test, y_pred_dl)
f1_dl = f1_score(y_test, y_pred_dl)

# Calculate AUC
y_pred_proba_dl = model.predict(X_test)[:, 1]
auc_dl = roc_auc_score(y_test, y_pred_proba_dl)

print("Deep Learning accuracy: ", accuracy_dl)
print("Deep Learning precision: ", precision_dl)
print("Deep Learning recall: ", recall_dl)
print("Deep Learning F1 Score: ", f1_dl)
print("Deep Learning AUC: ", auc_dl)
print(f"Deep Learning Elapsed Time: {elapsed_time:.2f} seconds")


Metal device set to: Apple M2
Epoch 1/10
  1/329 [..............................] - ETA: 1:03 - loss: 11043.4277 - accuracy: 0.5625

2023-05-05 04:08:00.649923: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Deep Learning accuracy:  0.534627092846271
Deep Learning precision:  0.694560669456067
Deep Learning recall:  0.12613981762917933
Deep Learning F1 Score:  0.2135048231511254
Deep Learning AUC:  0.5666329972570242
Deep Learning Elapsed Time: 16.80 seconds


In [6]:
svc = SVC(kernel='rbf', probability=True)
start_time = time.time()
svc.fit(X_train, y_train)
elapsed_time = time.time() - start_time

y_pred_svc = svc.predict(X_test)

# Calculate evaluation metrics
accuracy_svc = accuracy_score(y_test, y_pred_svc)
precision_svc = precision_score(y_test, y_pred_svc)
recall_svc = recall_score(y_test, y_pred_svc)
f1_svc = f1_score(y_test, y_pred_svc)

# Calculate AUC
y_pred_proba_svc = svc.predict_proba(X_test)[:, 1]
auc_svc = roc_auc_score(y_test, y_pred_proba_svc)

print("Support Vector Machine accuracy: ", accuracy_svc)
print("Support Vector Machine precision: ", precision_svc)
print("Support Vector Machine recall: ", recall_svc)
print("Support Vector Machine F1 Score: ", f1_svc)
print("Support Vector Machine AUC: ", auc_svc)
print(f"Support Vector Machine Elapsed Time: {elapsed_time:.2f} seconds")


Support Vector Machine accuracy:  0.610730593607306
Support Vector Machine precision:  0.5993220338983051
Support Vector Machine recall:  0.6717325227963525
Support Vector Machine F1 Score:  0.6334647079899677
Support Vector Machine AUC:  0.660510995070057
Support Vector Machine Elapsed Time: 16.75 seconds


In [52]:
df = pd.read_excel("uci_undersampled.xlsx", engine='openpyxl')

# Custom function to handle cases where the number of unique quantiles is less than the desired number of bins
def custom_qcut(series, num_bins, labels=None):
    quantiles = series.quantile([i / num_bins for i in range(1, num_bins)])
    unique_quantiles = quantiles.unique()
    if labels is None:
        labels = range(1, len(unique_quantiles) + 2)
    return pd.cut(series, bins=[-float('inf'), *unique_quantiles, float('inf')], labels=labels)

# Assuming your data is in a pandas DataFrame called 'df'
categorical_variables = ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'default']
continuous_variables = ['LIMIT_BAL','AGE','BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']

# Number of bins you want to create
num_bins = 10

# Convert continuous variables into categorical variables based on percentiles and store them in a new DataFrame
cat_df = df[categorical_variables].copy()
for var in continuous_variables:
    cat_df[var] = custom_qcut(df[var], num_bins)

# Display the updated DataFrame with the new categorical variables
print(cat_df.head())


   SEX  EDUCATION  MARRIAGE  PAY_0  PAY_2  PAY_3  PAY_4  PAY_5  PAY_6   
0    2          2         1      2      2      2      0      0      0  \
1    2          1         2      2      2      2      2      2      2   
2    2          1         2     -1     -1     -1      0     -1     -1   
3    2          1         2     -1     -1     -2     -2     -2     -2   
4    2          1         1      0      0      0      0      0     -1   

   default  ... BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2   
0        1  ...         5         5         6         6        4        1  \
1        1  ...        10        10        10        10        8        8   
2        0  ...         4         4         4         4        7        8   
3        1  ...         1         1         1         1        2        1   
4        0  ...         5         6         3         4        4        6   

  PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6  
0        5        3        3        3  
1        8        

In [53]:
def calculate_woe_iv(data, feature, target):
    lst = []
    for i in range(data[feature].nunique()):
        val = list(data[feature].unique())[i]
        lst.append({
            'Value': val,
            'All': data[data[feature] == val].count()[feature],
            'Good': data[(data[feature] == val) & (data[target] == 0)].count()[feature],
            'Bad': data[(data[feature] == val) & (data[target] == 1)].count()[feature]
        })

    dset = pd.DataFrame(lst)
    dset['Distr_Good'] = dset['Good'] / dset['Good'].sum()
    dset['Distr_Bad'] = dset['Bad'] / dset['Bad'].sum()
    dset['WoE'] = np.log(dset['Distr_Good'] / dset['Distr_Bad'])
    dset['IV'] = (dset['Distr_Good'] - dset['Distr_Bad']) * dset['WoE']
    iv = dset['IV'].sum()

    return dset, iv


In [54]:
print(cat_df.head())
cat_df['PAY_0_merged'] = cat_df['PAY_0'].apply(lambda x: x if x < 7 else 7)
cat_df['PAY_3_merged'] = cat_df['PAY_3'].apply(lambda x: x if x < 7 else 7)
cat_df['PAY_4_merged'] = cat_df['PAY_4'].apply(lambda x: x if x < 6 else 6)
cat_df['PAY_5_merged'] = cat_df['PAY_5'].apply(lambda x: x if x < 6 else 6)
cat_df['PAY_6_merged'] = cat_df['PAY_6'].apply(lambda x: x if x < 7 else 7)
cat_df['EDUCATION_merged'] = cat_df['EDUCATION'].apply(lambda x: x if x > 1 else 1)
features = ['SEX', 'EDUCATION','EDUCATION_merged', 'MARRIAGE', 'AGE','PAY_0','PAY_0_merged', 'PAY_2', 'PAY_3','PAY_3_merged', 'PAY_4','PAY_4_merged', 'PAY_5','PAY_5_merged', 'PAY_6','PAY_6_merged', 'LIMIT_BAL', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
for feature in features:
    print(f"Calculating WoE and IV for {feature}")
    dset, iv = calculate_woe_iv(cat_df, feature, 'default')
    print(dset)
    print(f"IV score: {iv}\n")
print(cat_df.head())

   SEX  EDUCATION  MARRIAGE  PAY_0  PAY_2  PAY_3  PAY_4  PAY_5  PAY_6   
0    2          2         1      2      2      2      0      0      0  \
1    2          1         2      2      2      2      2      2      2   
2    2          1         2     -1     -1     -1      0     -1     -1   
3    2          1         2     -1     -1     -2     -2     -2     -2   
4    2          1         1      0      0      0      0      0     -1   

   default  ... BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2   
0        1  ...         5         5         6         6        4        1  \
1        1  ...        10        10        10        10        8        8   
2        0  ...         4         4         4         4        7        8   
3        1  ...         1         1         1         1        2        1   
4        0  ...         5         6         3         4        4        6   

  PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6  
0        5        3        3        3  
1        8        

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


    Value   All  Good   Bad  Distr_Good  Distr_Bad       WoE        IV
0       2  2516   547  1969    0.084154   0.296715 -1.260125  0.267854
1      -1  2260  1334   926    0.205231   0.139542  0.385770  0.025341
2      -2  1708   951   757    0.146308   0.114075  0.248858  0.008021
3       0  6380  3629  2751    0.558308   0.414557  0.297700  0.042795
4       3   159    21   138    0.003231   0.020796 -1.862024  0.032706
5       4    53     9    44    0.001385   0.006631 -1.566258  0.008216
6       7    23     1    22    0.000154   0.003315 -3.070335  0.009707
7       5    14     2    12    0.000308   0.001808 -1.771052  0.002658
8       6    18     4    14    0.000615   0.002110 -1.232056  0.001841
9       1     3     2     1    0.000308   0.000151  0.713854  0.000112
10      8     2     0     2    0.000000   0.000301      -inf       inf
IV score: inf

Calculating WoE and IV for PAY_3_merged
   Value   All  Good   Bad  Distr_Good  Distr_Bad       WoE        IV
0      2  2516   547  1

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


   Value   All  Good   Bad  Distr_Good  Distr_Bad       WoE        IV
0      0  6743  3674  3069    0.565231   0.462477  0.200636  0.020616
1      2  1797   396  1401    0.060923   0.211121 -1.242820  0.186669
2     -1  2269  1294   975    0.199077   0.146926  0.303763  0.015842
3     -2  2088  1107   981    0.170308   0.147830  0.141544  0.003182
4      3   137    19   118    0.002923   0.017782 -1.805538  0.026828
5      4    38     7    31    0.001077   0.004671 -1.467370  0.005275
6      7    41     1    40    0.000154   0.006028 -3.668172  0.021546
7      6    15     1    14    0.000154   0.002110 -2.618350  0.005121
8      5     8     1     7    0.000154   0.001055 -1.925203  0.001735
IV score: 0.2868131107906914

Calculating WoE and IV for LIMIT_BAL
   Value   All  Good   Bad  Distr_Good  Distr_Bad       WoE        IV
0      1  1368   473   895    0.072769   0.134870 -0.617021  0.038318
1      7  1329   756   573    0.116308   0.086347  0.297863  0.008924
2      9  1277   788   

In [55]:
# Define WoE values for each categorical variable
print(cat_df.head())
woe_sex = {1: -0.102092, 2: 0.071482}
woe_age = {3: 0.102418, 2: 0.087342, 7: 0.071138, 9: -0.071038, 1: -0.249049, 6: 0.023524, 5: 0.145256, 8: -0.030432, 10: -0.149301, 4: 0.186996}


woe_education = {0: 0, 1: 0.178919, 2: -0.096400, 3: -0.160483, 4: 1.508784, 5: 1.378831, 6: 0.506215}
woe_marriage = {0: 0.713854, 1: -0.080641, 2: 0.076584, 3: -0.315765}
woe_pay_0_merged = {-2: 0.644372, -1: 0.303661, 0: 0.660552, 1: -0.542309, 2: -2.052248, 3: -2.643248, 4: -3.237389, 5: -1.445630, 6: -1.365587, 7:-1.483370, 8: -1.483370 }
woe_pay_2 = {
2: -1.453598,
-1: 0.375932,
0: 0.405314,
-2: 0.265893,
4: -1.493421,
3: -1.816862,
5: -1.994196,
6: -2.176517,
7: -1.077905,
1: 0.896176
}
woe_pay_3 = {
2: -1.260125,
-1: 0.38577,
-2: 0.248858,
0: 0.2977,
3: -1.862024,
4: -1.566258,
7: -3.157347,
8: -3.157347,
5: -1.771052,
6: -1.232056,
1: 0.713854
}
woe_pay_4 = {0: 0.244911,
2: -1.318902,
-2: 0.198638,
-1: 0.348769,
3: -1.735334,
6: -3.911118,
7: -3.911118,
8: -3.911118,
4: -1.323028,
5: -1.260227,
1: 0.020707
}
woe_pay_5 = {
    0: 0.205634,
    2: -1.443758,
    -1: 0.341287,
    -2: 0.179476,
    3: -1.487805,
    6: -3.930537,
    7: -3.930537,
    8: -3.930537,
    4: -1.272061,
    5: -1.588731
}
woe_pay_6 = {
    0: 0.200636,
    2: -1.242820,
    -1: 0.303763,
    -2: 0.141544,
    3: -1.805538,
    4: -1.467370,
    7: -3.668172,
    8: -3.668172,
    6: -2.618350,
    5: -1.925203
}

woe_limit_bal = {1: -0.617021, 7: 0.297863, 9: 0.497843, 6: 0.156923, 2: -0.409329, 5: -0.176410, 8: 0.395140, 10: 0.594256, 4: -0.171083, 3: -0.413051}

woe_bill_amt1 = {5: -0.173350, 10: 0.200797, 4: -0.046289, 3: 0.107586, 6: -0.159382, 8: 0.072469, 7: 0.016137, 9: 0.110638, 2: -0.015827, 1: -0.113436}

woe_bill_amt2 = {5: -0.170275, 10: 0.170148, 3: 0.113691, 1: -0.041683, 6: -0.202436, 8: 0.075515, 7: -0.029570, 9: 0.086229, 4: 0.029840, 2: -0.031625}

woe_bill_amt3 = {5: -0.148776, 10: 0.203866, 4: 0.048106, 1: -0.055375, 6: -0.110378, 8: 0.023751, 7: -0.118129, 2: -0.038301, 9: 0.092330, 3: 0.106125}

woe_bill_amt4 = {5: -0.069224, 10: 0.164027, 4: 0.112095, 1: -0.045984, 6: -0.104265, 7: -0.136508, 9: 0.061840, 3: 0.077168, 2: -0.078250, 8: 0.008530}

woe_bill_amt5 = {6: -0.134857, 10: 0.148736, 4: 0.231530, 1: -0.093064, 3: 0.150363, 7: -0.170275, 8: 0.002442, 9: 0.055745, 2: -0.057185, 5: -0.118129}

woe_bill_amt6 = {6: -0.128733, 10: 0.179334, 4: 0.279171, 1: -0.027393, 7: -0.219589, 8: -0.012781, 5: -0.138163, 9: 0.016137, 2: -0.022522, 3: 0.077168}

woe_pay_amt1 = {4: -0.063526, 8: 0.506830, 7: 0.373478, 2: -0.077791, 3: -0.003427, 6: -0.032295, 1: -0.650945, 5: 0.054745, 9: 0.652818}

woe_pay_amt2 = {1: -0.559548, 8: 0.401480, 6: 0.043557, 4: -0.081228, 5: 0.113742, 2: -0.130868, 7: 0.222300, 9: 0.783166, 3: -0.092890}

woe_pay_amt3 = {5: 0.013103, 8: 0.335167, 7: 0.255017, 1: -0.501926, 4: -0.079834, 9: 0.662910, 6: 0.089489, 2: -0.002874, 3: -0.094806}

woe_pay_amt4 = {3: -0.055159, 8: 0.398322, 7: 0.243007, 1: -0.462733, 5: -0.076004, 4: -0.089121, 9: 0.550248, 6: 0.198883, 2: 0.050486}

woe_pay_amt5 = {3: -0.147887, 1: -0.360932, 9: 0.609441, 7: 0.187611, 6: 0.109835, 4: -0.061490, 5: -0.090732, 2: 0.071351, 8: 0.329302}

woe_pay_amt6 = {3: -0.136782, 8: 0.339642, 9: 0.627045, 6: 0.095931, 2: 0.052747, 5: -0.080915, 1: -0.344668, 4: -0.085453, 7: 0.245573}

# Replace categorical variables with their WoE values
cat_df['EDUCATION'] = cat_df['EDUCATION'].map(woe_education)
cat_df['AGE'] = cat_df['AGE'].map(woe_age)
cat_df['MARRIAGE'] = cat_df['MARRIAGE'].map(woe_marriage)
cat_df['PAY_0'] = cat_df['PAY_0'].map(woe_pay_0_merged)
cat_df['PAY_2'] = cat_df['PAY_2'].map(woe_pay_2)
cat_df['PAY_3'] = cat_df['PAY_3'].map(woe_pay_3)
cat_df['PAY_4'] = cat_df['PAY_4'].map(woe_pay_4)
cat_df['PAY_5'] = cat_df['PAY_5'].map(woe_pay_5)
cat_df['PAY_6'] = cat_df['PAY_6'].map(woe_pay_6)
cat_df['LIMIT_BAL'] = cat_df['LIMIT_BAL'].map(woe_limit_bal)
cat_df['BILL_AMT1'] = cat_df['BILL_AMT1'].map(woe_bill_amt1)
cat_df['BILL_AMT2'] = cat_df['BILL_AMT2'].map(woe_bill_amt2)
cat_df['BILL_AMT3'] = cat_df['BILL_AMT3'].map(woe_bill_amt3)
cat_df['BILL_AMT4'] = cat_df['BILL_AMT4'].map(woe_bill_amt4)
cat_df['BILL_AMT5'] = cat_df['BILL_AMT5'].map(woe_bill_amt5)
cat_df['BILL_AMT6'] = cat_df['BILL_AMT6'].map(woe_bill_amt6)
cat_df['PAY_AMT1'] = cat_df['PAY_AMT1'].map(woe_pay_amt1)
cat_df['PAY_AMT2'] = cat_df['PAY_AMT2'].map(woe_pay_amt2)
cat_df['PAY_AMT3'] = cat_df['PAY_AMT3'].map(woe_pay_amt3)
cat_df['PAY_AMT4'] = cat_df['PAY_AMT4'].map(woe_pay_amt4)
cat_df['PAY_AMT5'] = cat_df['PAY_AMT5'].map(woe_pay_amt5)
cat_df['PAY_AMT6'] = cat_df['PAY_AMT6'].map(woe_pay_amt6)

print(cat_df.head())

final_data = cat_df[['EDUCATION', 'MARRIAGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'default','LIMIT_BAL','AGE','BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']]

# Split the data into training and testing sets
X = final_data.drop(['default'], axis=1)
y = final_data['default']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


   SEX  EDUCATION  MARRIAGE  PAY_0  PAY_2  PAY_3  PAY_4  PAY_5  PAY_6   
0    2          2         1      2      2      2      0      0      0  \
1    2          1         2      2      2      2      2      2      2   
2    2          1         2     -1     -1     -1      0     -1     -1   
3    2          1         2     -1     -1     -2     -2     -2     -2   
4    2          1         1      0      0      0      0      0     -1   

   default  ... PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 PAY_0_merged PAY_3_merged   
0        1  ...        5        3        3        3            2            2  \
1        1  ...        8        8        1        8            2            2   
2        0  ...        7        7        9        9           -1           -1   
3        1  ...        1        1        1        6           -1           -2   
4        0  ...        7        1        7        2            0            0   

  PAY_4_merged PAY_5_merged PAY_6_merged EDUCATION_merged  
0            0

In [56]:
log_reg = LogisticRegression(max_iter=100)
start_time = time.time()
log_reg.fit(X_train, y_train)
elapsed_time = time.time() - start_time

y_pred_proba = log_reg.predict_proba(X_test)[:, 1]
y_pred = log_reg.predict(X_test)
# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)


print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1)
print('AUC: ', auc)
print(f'  Elapsed Time: {elapsed_time:.2f} seconds')  


Accuracy:  0.6940639269406392
Precision:  0.7509803921568627
Recall:  0.5820668693009119
F1 Score:  0.6558219178082192
AUC:  0.7629761402809697
  Elapsed Time: 0.08 seconds


In [57]:
grad_boost = GradientBoostingClassifier()
start_time = time.time()
grad_boost.fit(X_train, y_train)
elapsed_time = time.time() - start_time

y_pred_grad_boost = grad_boost.predict(X_test)
y_pred_proba_grad_boost = grad_boost.predict_proba(X_test)[:, 1]

# Calculate evaluation metrics
accuracy_grad_boost = accuracy_score(y_test, y_pred_grad_boost)
precision_grad_boost = precision_score(y_test, y_pred_grad_boost)
recall_grad_boost = recall_score(y_test, y_pred_grad_boost)
f1_grad_boost = f1_score(y_test, y_pred_grad_boost)
auc_grad_boost = roc_auc_score(y_test, y_pred_proba_grad_boost)

print("Gradient Boosting accuracy: ", accuracy_grad_boost)
print("Gradient Boosting precision: ", precision_grad_boost)
print("Gradient Boosting recall: ", recall_grad_boost)
print("Gradient Boosting F1 Score: ", f1_grad_boost)
print("Gradient Boosting AUC: ", auc_grad_boost)
print(f"Gradient Boosting Elapsed Time: {elapsed_time:.2f} seconds")


Gradient Boosting accuracy:  0.7035768645357686
Gradient Boosting precision:  0.7470101195952162
Gradient Boosting recall:  0.6170212765957447
Gradient Boosting F1 Score:  0.6758218893050354
Gradient Boosting AUC:  0.7698289462710357
Gradient Boosting Elapsed Time: 1.00 seconds


In [58]:
# Convert the target variable to categorical
y_train_cat = to_categorical(y_train)
y_test_cat = to_categorical(y_test)

# Build the deep learning model
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
start_time = time.time()
model.fit(X_train, y_train_cat, epochs=10, batch_size=32, verbose=1)
elapsed_time = time.time() - start_time

# Predict on the test set
y_pred_dl = np.argmax(model.predict(X_test), axis=-1)

# Calculate evaluation metrics
accuracy_dl = accuracy_score(y_test, y_pred_dl)
precision_dl = precision_score(y_test, y_pred_dl)
recall_dl = recall_score(y_test, y_pred_dl)
f1_dl = f1_score(y_test, y_pred_dl)

# Calculate AUC
y_pred_proba_dl = model.predict(X_test)[:, 1]
auc_dl = roc_auc_score(y_test, y_pred_proba_dl)

print("Deep Learning accuracy: ", accuracy_dl)
print("Deep Learning precision: ", precision_dl)
print("Deep Learning recall: ", recall_dl)
print("Deep Learning F1 Score: ", f1_dl)
print("Deep Learning AUC: ", auc_dl)
print(f"Deep Learning Elapsed Time: {elapsed_time:.2f} seconds")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Deep Learning accuracy:  0.6990106544901066
Deep Learning precision:  0.7333333333333333
Deep Learning recall:  0.6268996960486323
Deep Learning F1 Score:  0.6759524784924211
Deep Learning AUC:  0.7709623929683445
Deep Learning Elapsed Time: 17.33 seconds


In [59]:
svc = SVC(kernel='rbf', probability=True)
start_time = time.time()
svc.fit(X_train, y_train)
elapsed_time = time.time() - start_time

y_pred_svc = svc.predict(X_test)

# Calculate evaluation metrics
accuracy_svc = accuracy_score(y_test, y_pred_svc)
precision_svc = precision_score(y_test, y_pred_svc)
recall_svc = recall_score(y_test, y_pred_svc)
f1_svc = f1_score(y_test, y_pred_svc)

# Calculate AUC
y_pred_proba_svc = svc.predict_proba(X_test)[:, 1]
auc_svc = roc_auc_score(y_test, y_pred_proba_svc)

print("Support Vector Machine accuracy: ", accuracy_svc)
print("Support Vector Machine precision: ", precision_svc)
print("Support Vector Machine recall: ", recall_svc)
print("Support Vector Machine F1 Score: ", f1_svc)
print("Support Vector Machine AUC: ", auc_svc)
print(f"Support Vector Machine Elapsed Time: {elapsed_time:.2f} seconds")


Support Vector Machine accuracy:  0.700152207001522
Support Vector Machine precision:  0.7608695652173914
Support Vector Machine recall:  0.5851063829787234
Support Vector Machine F1 Score:  0.661512027491409
Support Vector Machine AUC:  0.7582448546037512
Support Vector Machine Elapsed Time: 15.28 seconds


In [60]:
#Only IV higher that 0.01
final_data = cat_df[['EDUCATION', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'default','LIMIT_BAL','AGE','BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']]



In [61]:
log_reg = LogisticRegression(max_iter=100)
start_time = time.time()
log_reg.fit(X_train, y_train)
elapsed_time = time.time() - start_time

y_pred_proba = log_reg.predict_proba(X_test)[:, 1]
y_pred = log_reg.predict(X_test)
# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)


print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1)
print('AUC: ', auc)
print(f'  Elapsed Time: {elapsed_time:.2f} seconds')  


Accuracy:  0.6940639269406392
Precision:  0.7509803921568627
Recall:  0.5820668693009119
F1 Score:  0.6558219178082192
AUC:  0.7629761402809697
  Elapsed Time: 0.06 seconds


In [62]:
grad_boost = GradientBoostingClassifier()
start_time = time.time()
grad_boost.fit(X_train, y_train)
elapsed_time = time.time() - start_time

y_pred_grad_boost = grad_boost.predict(X_test)
y_pred_proba_grad_boost = grad_boost.predict_proba(X_test)[:, 1]

# Calculate evaluation metrics
accuracy_grad_boost = accuracy_score(y_test, y_pred_grad_boost)
precision_grad_boost = precision_score(y_test, y_pred_grad_boost)
recall_grad_boost = recall_score(y_test, y_pred_grad_boost)
f1_grad_boost = f1_score(y_test, y_pred_grad_boost)
auc_grad_boost = roc_auc_score(y_test, y_pred_proba_grad_boost)

print("Gradient Boosting accuracy: ", accuracy_grad_boost)
print("Gradient Boosting precision: ", precision_grad_boost)
print("Gradient Boosting recall: ", recall_grad_boost)
print("Gradient Boosting F1 Score: ", f1_grad_boost)
print("Gradient Boosting AUC: ", auc_grad_boost)
print(f"Gradient Boosting Elapsed Time: {elapsed_time:.2f} seconds")


Gradient Boosting accuracy:  0.7035768645357686
Gradient Boosting precision:  0.7470101195952162
Gradient Boosting recall:  0.6170212765957447
Gradient Boosting F1 Score:  0.6758218893050354
Gradient Boosting AUC:  0.7698422673104011
Gradient Boosting Elapsed Time: 1.09 seconds


In [63]:
# Convert the target variable to categorical
y_train_cat = to_categorical(y_train)
y_test_cat = to_categorical(y_test)

# Build the deep learning model
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
start_time = time.time()
model.fit(X_train, y_train_cat, epochs=10, batch_size=32, verbose=1)
elapsed_time = time.time() - start_time

# Predict on the test set
y_pred_dl = np.argmax(model.predict(X_test), axis=-1)

# Calculate evaluation metrics
accuracy_dl = accuracy_score(y_test, y_pred_dl)
precision_dl = precision_score(y_test, y_pred_dl)
recall_dl = recall_score(y_test, y_pred_dl)
f1_dl = f1_score(y_test, y_pred_dl)

# Calculate AUC
y_pred_proba_dl = model.predict(X_test)[:, 1]
auc_dl = roc_auc_score(y_test, y_pred_proba_dl)

print("Deep Learning accuracy: ", accuracy_dl)
print("Deep Learning precision: ", precision_dl)
print("Deep Learning recall: ", recall_dl)
print("Deep Learning F1 Score: ", f1_dl)
print("Deep Learning AUC: ", auc_dl)
print(f"Deep Learning Elapsed Time: {elapsed_time:.2f} seconds")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Deep Learning accuracy:  0.6921613394216134
Deep Learning precision:  0.7497536945812808
Deep Learning recall:  0.5782674772036475
Deep Learning F1 Score:  0.652938652938653
Deep Learning AUC:  0.7636775219623397
Deep Learning Elapsed Time: 17.13 seconds


In [64]:
svc = SVC(kernel='rbf', probability=True)
start_time = time.time()
svc.fit(X_train, y_train)
elapsed_time = time.time() - start_time

y_pred_svc = svc.predict(X_test)

# Calculate evaluation metrics
accuracy_svc = accuracy_score(y_test, y_pred_svc)
precision_svc = precision_score(y_test, y_pred_svc)
recall_svc = recall_score(y_test, y_pred_svc)
f1_svc = f1_score(y_test, y_pred_svc)

# Calculate AUC
y_pred_proba_svc = svc.predict_proba(X_test)[:, 1]
auc_svc = roc_auc_score(y_test, y_pred_proba_svc)

print("Support Vector Machine accuracy: ", accuracy_svc)
print("Support Vector Machine precision: ", precision_svc)
print("Support Vector Machine recall: ", recall_svc)
print("Support Vector Machine F1 Score: ", f1_svc)
print("Support Vector Machine AUC: ", auc_svc)
print(f"Support Vector Machine Elapsed Time: {elapsed_time:.2f} seconds")


Support Vector Machine accuracy:  0.700152207001522
Support Vector Machine precision:  0.7608695652173914
Support Vector Machine recall:  0.5851063829787234
Support Vector Machine F1 Score:  0.661512027491409
Support Vector Machine AUC:  0.7582494880087479
Support Vector Machine Elapsed Time: 15.73 seconds


In [65]:
#Only IV higher that 0.1
final_data = cat_df[['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'default','LIMIT_BAL','PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4']]



In [66]:
log_reg = LogisticRegression(max_iter=100)
start_time = time.time()
log_reg.fit(X_train, y_train)
elapsed_time = time.time() - start_time

y_pred_proba = log_reg.predict_proba(X_test)[:, 1]
y_pred = log_reg.predict(X_test)
# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)


print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1)
print('AUC: ', auc)
print(f'  Elapsed Time: {elapsed_time:.2f} seconds')  


Accuracy:  0.6940639269406392
Precision:  0.7509803921568627
Recall:  0.5820668693009119
F1 Score:  0.6558219178082192
AUC:  0.7629761402809697
  Elapsed Time: 0.08 seconds


In [67]:
grad_boost = GradientBoostingClassifier()
start_time = time.time()
grad_boost.fit(X_train, y_train)
elapsed_time = time.time() - start_time

y_pred_grad_boost = grad_boost.predict(X_test)
y_pred_proba_grad_boost = grad_boost.predict_proba(X_test)[:, 1]

# Calculate evaluation metrics
accuracy_grad_boost = accuracy_score(y_test, y_pred_grad_boost)
precision_grad_boost = precision_score(y_test, y_pred_grad_boost)
recall_grad_boost = recall_score(y_test, y_pred_grad_boost)
f1_grad_boost = f1_score(y_test, y_pred_grad_boost)
auc_grad_boost = roc_auc_score(y_test, y_pred_proba_grad_boost)

print("Gradient Boosting accuracy: ", accuracy_grad_boost)
print("Gradient Boosting precision: ", precision_grad_boost)
print("Gradient Boosting recall: ", recall_grad_boost)
print("Gradient Boosting F1 Score: ", f1_grad_boost)
print("Gradient Boosting AUC: ", auc_grad_boost)
print(f"Gradient Boosting Elapsed Time: {elapsed_time:.2f} seconds")


Gradient Boosting accuracy:  0.7035768645357686
Gradient Boosting precision:  0.7470101195952162
Gradient Boosting recall:  0.6170212765957447
Gradient Boosting F1 Score:  0.6758218893050354
Gradient Boosting AUC:  0.7698301046222848
Gradient Boosting Elapsed Time: 1.04 seconds


In [68]:
# Convert the target variable to categorical
y_train_cat = to_categorical(y_train)
y_test_cat = to_categorical(y_test)

# Build the deep learning model
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
start_time = time.time()
model.fit(X_train, y_train_cat, epochs=10, batch_size=32, verbose=1)
elapsed_time = time.time() - start_time

# Predict on the test set
y_pred_dl = np.argmax(model.predict(X_test), axis=-1)

# Calculate evaluation metrics
accuracy_dl = accuracy_score(y_test, y_pred_dl)
precision_dl = precision_score(y_test, y_pred_dl)
recall_dl = recall_score(y_test, y_pred_dl)
f1_dl = f1_score(y_test, y_pred_dl)

# Calculate AUC
y_pred_proba_dl = model.predict(X_test)[:, 1]
auc_dl = roc_auc_score(y_test, y_pred_proba_dl)

print("Deep Learning accuracy: ", accuracy_dl)
print("Deep Learning precision: ", precision_dl)
print("Deep Learning recall: ", recall_dl)
print("Deep Learning F1 Score: ", f1_dl)
print("Deep Learning AUC: ", auc_dl)
print(f"Deep Learning Elapsed Time: {elapsed_time:.2f} seconds")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Deep Learning accuracy:  0.6993911719939118
Deep Learning precision:  0.7335701598579041
Deep Learning recall:  0.6276595744680851
Deep Learning F1 Score:  0.6764946764946764
Deep Learning AUC:  0.7715143473385722
Deep Learning Elapsed Time: 17.49 seconds


In [69]:
svc = SVC(kernel='rbf', probability=True)
start_time = time.time()
svc.fit(X_train, y_train)
elapsed_time = time.time() - start_time

y_pred_svc = svc.predict(X_test)

# Calculate evaluation metrics
accuracy_svc = accuracy_score(y_test, y_pred_svc)
precision_svc = precision_score(y_test, y_pred_svc)
recall_svc = recall_score(y_test, y_pred_svc)
f1_svc = f1_score(y_test, y_pred_svc)

# Calculate AUC
y_pred_proba_svc = svc.predict_proba(X_test)[:, 1]
auc_svc = roc_auc_score(y_test, y_pred_proba_svc)

print("Support Vector Machine accuracy: ", accuracy_svc)
print("Support Vector Machine precision: ", precision_svc)
print("Support Vector Machine recall: ", recall_svc)
print("Support Vector Machine F1 Score: ", f1_svc)
print("Support Vector Machine AUC: ", auc_svc)
print(f"Support Vector Machine Elapsed Time: {elapsed_time:.2f} seconds")


Support Vector Machine accuracy:  0.700152207001522
Support Vector Machine precision:  0.7608695652173914
Support Vector Machine recall:  0.5851063829787234
Support Vector Machine F1 Score:  0.661512027491409
Support Vector Machine AUC:  0.7582448546037512
Support Vector Machine Elapsed Time: 14.74 seconds
