# **This notebook contains code to classify whether a candidate is a credit risk or not.**
I have implemented the following algorithms, using grid search with cross-validation to find the optimal hyperparameters:

1. SVM
2. Decision Tree with Pruning
3. K nearest neighbor
4. AdaBoost
5. Neural Net using MLP Classifier
6. Custom Neural Net


In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import re
import seaborn
import time
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

# Plots and stats
import matplotlib.pyplot as plt
import seaborn as sns

# Label Encoding & Scaling
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder

# Model Building
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score,precision_score
from sklearn.pipeline import Pipeline

# Models
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV,StratifiedKFold,KFold,train_test_split,cross_val_score,cross_val_predict
from keras.models import Sequential
from keras.layers import Dense

#Oversampling
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import itertools

In [None]:
# Seed NumPy's global random number generator so that steps drawing from it
# are repeatable when the notebook is run top to bottom.
np.random.seed(42)

In [None]:
# Load the raw credit-risk dataset.
# The configured location is a Kaggle dataset folder; pd.read_csv needs a
# file, so when the path is a directory the first CSV inside it (in sorted
# order) is used. A path that already points at a file is read unchanged.
data_path = '/kaggle/input/credit-risk-customers'
if os.path.isdir(data_path):
    csv_names = sorted(
        name for name in os.listdir(data_path) if name.lower().endswith('.csv')
    )
    if not csv_names:
        raise FileNotFoundError(f"No CSV file found in {data_path!r}")
    data_path = os.path.join(data_path, csv_names[0])
df2 = pd.read_csv(data_path)

In [None]:
# Preview the first rows of the freshly loaded data.
df2.head()

In [None]:
# Derive a dedicated 'gender' column from the text in 'personal_status'.
# The pattern is matched case-insensitively and the first occurrence wins;
# rows without a match get the placeholder 'Unknown'.
gender_pattern = re.compile(r'(male|female)', flags=re.IGNORECASE)

# Vectorised extraction replaces the previous iterrows() loop. That loop also
# assigned to row['personal_status'], which only modified a temporary copy of
# the row and never reached the DataFrame, so that dead write is dropped;
# 'personal_status' itself is deliberately left untouched here.
df2['gender'] = (
    df2['personal_status']
    .str.extract(gender_pattern.pattern, flags=re.IGNORECASE, expand=False)
    .fillna('Unknown')
)

def drop_words(s, words):
    """Return *s* with every occurrence of each entry of *words* removed.

    Entries are removed one after another in the order given, so an earlier
    removal can affect what a later entry still finds. Leading and trailing
    whitespace is stripped from the final result.
    """
    remaining = s
    for unwanted in words:
        remaining = remaining.replace(unwanted, '')
    trimmed = remaining.strip()
    return trimmed

# Strip the gender tokens out of 'personal_status'. The tokens are removed in
# list order: 'male' first (which turns 'female' into 'fe'), then the
# leftover 'fe'.
words_to_drop = ['male', 'female', 'fe']
df2['personal_status'] = df2['personal_status'].apply(drop_words, args=(words_to_drop,))

In [None]:
# Encode the target as integers: 'bad' -> 0, 'good' -> 1.
# Values not present in the mapping become NaN.
class_map = dict(bad=0, good=1)
df2['class'] = df2['class'].map(class_map)

In [None]:
def object_to_categorical(df):
    """Convert every object-dtype column of *df* to the 'category' dtype.

    The conversion is done in place on the DataFrame that was passed in, and
    that same DataFrame is returned for convenience.
    """
    for name in df.select_dtypes(include='object').columns:
        as_category = df[name].astype('category')
        df[name] = as_category
    return df
# The helper converts in place and returns its argument, so `df` and `df2`
# refer to the same DataFrame object from here on.
df = object_to_categorical(df2)

In [None]:
# Print the number of distinct (non-null) values found in each column.
for column, unique_values in df.nunique().items():
    print(f"{column}: {unique_values}")

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Sub-frame containing only the float64 / int64 columns; its column names
# drive the outlier filter in the next cell.
numeric_columns = df.select_dtypes(include=['float64', 'int64'])

In [None]:
# Remove outliers with the 1.5 * IQR rule: a row is dropped when ANY numeric
# column falls outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
#
# The quartiles are computed on the numeric columns only. Calling
# DataFrame.quantile on the full frame fails on pandas >= 2.0 because the
# frame also holds categorical columns and `numeric_only` now defaults to
# False.
numeric_cols = numeric_columns.columns
Q1 = df[numeric_cols].quantile(0.25)
Q3 = df[numeric_cols].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outlier_mask = (
    (df[numeric_cols] < lower_bound) | (df[numeric_cols] > upper_bound)
).any(axis=1)
# .copy() gives later cells an independent frame to add / delete columns on.
df = df[~outlier_mask].copy()

In [None]:
# Correlation heatmap of the numeric features before feature engineering.
# Only numeric columns are correlated: DataFrame.corr on a frame that still
# contains categorical columns raises on pandas >= 2.0. The matrix is computed
# once and reused for the plot instead of being recomputed.
corr_matrix = df.select_dtypes(include='number').corr()
plt.figure(figsize=(8, 5))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation before feature Engg')
plt.savefig('Correlation before feature Engg')
plt.show()

# FEATURE ENGINEERING

In [None]:
# Credit amount divided by (duration * installment commitment).
df['debt_to_income_ratio'] = df['credit_amount'].div(
    df['duration'].mul(df['installment_commitment'])
)

# Bucket age into labelled bands, then discard the raw age column.
labels = ['0-20', '21-30', '31-40', '41-50', '51-60', '61-70', '70+']
bins = [0, 20, 30, 40, 50, 60, 70, 120]
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, include_lowest=True)
df.drop(columns=['age'], inplace=True)

# Credit amount per existing credit line.
df['credit_utilization'] = df['credit_amount'].div(df['existing_credits'])

In [None]:
# Correlation heatmap of the numeric features after feature engineering.
# Only numeric columns are correlated: DataFrame.corr on a frame that still
# contains categorical columns raises on pandas >= 2.0. The matrix is computed
# once and reused for the plot instead of being recomputed.
corr_matrix = df.select_dtypes(include='number').corr()
plt.figure(figsize=(8, 5))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation after feature Engg')
plt.savefig('Correlation_after_feature_engg')
plt.show()

In [None]:
# Remove the two columns that are not used as model features.
df.drop(columns=['num_dependents', 'foreign_worker'], inplace=True)

In [None]:
# Rescale the continuous features with MinMaxScaler, overwriting the columns
# in place.
# NOTE(review): the scaler is fitted on the whole DataFrame, i.e. before the
# train/test split done later in the notebook, so test rows contribute to the
# min/max used for scaling. Consider fitting on the training rows only.
scaler = MinMaxScaler()
num_cols = ['duration', 'credit_amount', 'installment_commitment', 'residence_since',
            'debt_to_income_ratio' ,'credit_utilization', 'existing_credits' ]
df[num_cols] = scaler.fit_transform(df[num_cols])

In [None]:
# Categorical features to be replaced by integer codes.
cat_cols = [
    'checking_status', 'credit_history', 'purpose', 'savings_status',
    'employment', 'personal_status', 'other_parties', 'property_magnitude',
    'other_payment_plans', 'housing', 'job', 'own_telephone',
    'gender', 'age_group',
]

# One encoder instance is re-fitted for every column, so after the loop `le`
# only remembers the classes of the last column.
le = LabelEncoder()
for col in cat_cols:
    encoded = le.fit_transform(df[col])
    df[col] = encoded

In [None]:
smote = SMOTE()
X = df.drop('class', axis=1)
y = df['class']
X, y = smote.fit_resample(X, y)
df = pd.concat([X, y], axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('class', axis=1), df['class'], test_size=0.2, random_state=42)

In [None]:
# ros = RandomOverSampler()
# X_train, y_train = ros.fit_resample(X_train, y_train)

In [None]:
# Per-model accuracy lists; one entry is appended to each list for every model
# evaluated in the cells below, in evaluation order.
train_results = []
test_results = []

In [None]:
# Elapsed time in seconds recorded for each evaluated model.
execution_times = []
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit *model* and score it on the training and test data.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.

    Returns
    -------
    tuple
        ``(train_acc, test_acc, test_preds)``: accuracy on the training data,
        accuracy on the test data, and the predictions for ``X_test``.

    Side effects
    ------------
    Appends the elapsed time in seconds to the module-level
    ``execution_times`` list. The timed span covers fitting, both prediction
    passes and the accuracy computation.
    """
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)

    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)
    train_acc = accuracy_score(y_train, train_preds)
    test_acc = accuracy_score(y_test, test_preds)

    execution_times.append(time.perf_counter() - start_time)

    return train_acc, test_acc, test_preds

In [None]:
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix on the current matplotlib figure.

    Parameters
    ----------
    cm : 2-D array of counts; rows are true labels, columns are predictions.
    classes : tick labels, given in the same order as the rows/columns of *cm*.
    normalize : when True, each row is divided by its sum before plotting so
        the cells show per-true-class proportions.
    title : text placed above the plot.
    cmap : matplotlib colormap used for the cell colours.

    The function only draws; saving and showing the figure is left to the
    caller.
    """
    # Normalise BEFORE drawing so that the image colours, the colour bar and
    # the printed cell values all describe the same matrix.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Proportions are rounded to two decimals; raw values are shown as-is.
        cell_text = f'{cm[i, j]:.2f}' if normalize else cm[i, j]
        plt.text(j, i, cell_text, horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

GRID SEARCH

In [None]:
# Decision tree: exhaustive grid search with 5-fold cross-validation.
import pandas as pd
param_grid = dict(
    max_depth=[2, 3, 5, 7, 9, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

clf = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(clf, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Keep the tuned parameters and the mean CV score, give the columns readable
# headers, and list the best combination first.
column_labels = {
    'param_max_depth': 'Max Depth',
    'param_min_samples_split': 'Min Samples Split',
    'param_min_samples_leaf': 'Min Samples Leaf',
    'param_criterion': 'Criterion',
    'mean_test_score': 'Mean Test Score',
}
results = (
    pd.DataFrame(grid_search.cv_results_)[list(column_labels)]
    .rename(columns=column_labels)
    .sort_values(by='Mean Test Score', ascending=False)
)
results.to_csv('grid_search_results_2.csv', index=False)

print("Best Hyperparameters:", grid_search.best_params_, sep='\n')

# Score the refitted best estimator on the held-out test data.
best_model = grid_search.best_estimator_
accuracy = best_model.score(X_test, y_test)
print(f'The accuracy of the best model is: {accuracy}')


In [None]:
# KNN: exhaustive grid search with 5-fold cross-validation.
import pandas as pd
param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    p=[1, 2],
)

clf = KNeighborsClassifier()
grid_search = GridSearchCV(clf, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Keep the tuned parameters and the mean CV score, give the columns readable
# headers, and list the best combination first.
column_labels = {
    'param_n_neighbors': 'Number of Neighbors',
    'param_weights': 'Weights',
    'param_p': 'p',
    'mean_test_score': 'Mean Test Score (Accuracy)',
}
results = (
    pd.DataFrame(grid_search.cv_results_)[list(column_labels)]
    .rename(columns=column_labels)
    .sort_values(by='Mean Test Score (Accuracy)', ascending=False)
)
results.to_csv('knn_grid_search_results_2.csv', index=False)

print("Best Hyperparameters:", grid_search.best_params_, sep='\n')

# Score the refitted best estimator on the held-out test data.
best_model = grid_search.best_estimator_
accuracy = best_model.score(X_test, y_test)
print(f'The accuracy of the best model is: {accuracy}')


In [None]:
# SVM: exhaustive grid search with 5-fold cross-validation.
param_grid = dict(
    # C=[0.1, 1, 10],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    gamma=['scale', 'auto', 0.1, 0.01, 0.001],
)

clf = SVC(random_state=42)
grid_search = GridSearchCV(clf, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Keep the tuned parameters and the mean CV score, give the columns readable
# headers, and list the best combination first.
column_labels = {
    'param_kernel': 'Kernel',
    'param_gamma': 'Gamma',
    'mean_test_score': 'Mean Test Score (Accuracy)',
}
results = (
    pd.DataFrame(grid_search.cv_results_)[list(column_labels)]
    .rename(columns=column_labels)
    .sort_values(by='Mean Test Score (Accuracy)', ascending=False)
)
results.to_csv('svm_grid_search_results_2.csv', index=False)

# Score the refitted best estimator on the held-out test data.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = best_model.score(X_test, y_test)
print(f'The accuracy of the best model is: {accuracy}')
print("Best Hyperparameters:", grid_search.best_params_, sep='\n')

In [None]:
# AdaBoost: exhaustive grid search with 5-fold cross-validation.
base_classifier = DecisionTreeClassifier(random_state=42)
clf = AdaBoostClassifier(base_classifier)

# scikit-learn 1.2 renamed AdaBoost's `base_estimator` parameter to
# `estimator` (the old name was later removed). A hard-coded
# 'base_estimator__max_depth' key therefore fails on current releases, so the
# prefix is taken from the parameters the installed version actually exposes.
estimator_key = 'estimator' if 'estimator' in clf.get_params(deep=False) else 'base_estimator'
depth_param = f'{estimator_key}__max_depth'

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.001, 0.01, 0.1],
    depth_param: [1, 2, 3],
}

grid_search = GridSearchCV(clf, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Keep the tuned parameters and the mean CV score, give the columns readable
# headers, and list the best combination first.
results = pd.DataFrame(grid_search.cv_results_)
results = results[['param_n_estimators', 'param_learning_rate', f'param_{depth_param}', 'mean_test_score']]
results = results.rename(columns={
    'param_n_estimators': 'Number of Estimators',
    'param_learning_rate': 'Learning Rate',
    f'param_{depth_param}': 'Base Estimator Max Depth',
    'mean_test_score': 'Mean Test Score (Accuracy)'
})
results = results.sort_values(by='Mean Test Score (Accuracy)', ascending=False)

results.to_csv('adaboost_grid_search_results_2.csv', index=False)

# Score the refitted best estimator on the held-out test data.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = best_model.score(X_test, y_test)
print(f'The accuracy of the best model is: {accuracy}')
print("Best Hyperparameters:")
print(grid_search.best_params_)


In [None]:
# MLP classifier: exhaustive grid search with 5-fold cross-validation.
param_grid = dict(
    hidden_layer_sizes=[(5,), (10,), (15,), (20,), (30,), (40,), (50,), (60,), (80,)],
    learning_rate_init=[0.0001, 0.001, 0.01],
)

clf = MLPClassifier(random_state=42)
grid_search = GridSearchCV(clf, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Keep the tuned parameters and the mean CV score, give the columns readable
# headers, and list the best combination first.
column_labels = {
    'param_hidden_layer_sizes': 'Hidden Layer Sizes',
    'param_learning_rate_init': 'Learning Rate Init',
    'mean_test_score': 'Mean Test Score (Accuracy)',
}
results = (
    pd.DataFrame(grid_search.cv_results_)[list(column_labels)]
    .rename(columns=column_labels)
    .sort_values(by='Mean Test Score (Accuracy)', ascending=False)
)
results.to_csv('mlp_classifier_grid_search_results_2.csv', index=False)

best_model = grid_search.best_estimator_
print("Best Hyperparameters:", grid_search.best_params_, sep='\n')

# Score the refitted best estimator on the held-out test data.
y_pred = best_model.predict(X_test)
accuracy = best_model.score(X_test, y_test)
print(f'The accuracy of the best model is: {accuracy}')

APPLY RESULT TO MODELS

In [None]:
# SVM - Support Vector Machines (SVMs)
# Train the RBF-kernel SVM, record its accuracies and report test metrics.

# model_1 = SVC(gamma = 'scale', kernel = 'rbf')
model_1 = SVC(gamma = 0.1, kernel = 'rbf', random_state=42)
train_acc, test_acc, y_pred = evaluate_model(model_1, X_train, y_train, X_test, y_test)
train_results.append(train_acc)
test_results.append(test_acc)
cm = confusion_matrix(y_test, y_pred)
acc_SVM=accuracy_score(y_test,y_pred)
f1_SVM=f1_score(y_test,y_pred)
clf_SVM=classification_report(y_test,y_pred)

print('***********SVM***********')
print('\n')
print('Accuracy : ',acc_SVM)
print('F1 Score : ',f1_SVM)
print(10*'=====')
print('Confusion Matrix :\n',cm)
print(10*'=====')
print('Classification Report :\n',clf_SVM)
print(30*'========')

# confusion_matrix orders rows/columns by ascending label and class_map
# encodes bad=0, good=1, so the tick labels must read ['Bad', 'Good'].
# The previous ['Good', 'Bad'] labelled every cell with the wrong class.
plot_confusion_matrix(cm, classes=['Bad', 'Good'], title=model_1.__class__.__name__)
plt.savefig('confusion_matrix_svm_2.png')
plt.show()


In [None]:
#KNN
# Train k-nearest-neighbours with one fixed parameter set, record its
# accuracies and report test metrics.
knn_params = {
    'n_neighbors': 3,
    'p': 1,
    'weights': 'distance'
}

model_2 = KNeighborsClassifier(**knn_params)
train_acc, test_acc, y_pred = evaluate_model(model_2, X_train, y_train, X_test, y_test)
train_results.append(train_acc)
test_results.append(test_acc)
cm = confusion_matrix(y_test, y_pred)
acc_knn = accuracy_score(y_test, y_pred)
f1_knn = f1_score(y_test, y_pred)
clf_knn = classification_report(y_test, y_pred)

print('***********KNN with Single Set of Parameters***********')
print('\n')
print('Accuracy : ', acc_knn)
print('F1 Score : ', f1_knn)
print(10 * '=====')
print('Confusion Matrix :\n', cm)
print(10 * '=====')
print('Classification Report :\n', clf_knn)
print(30 * '========')

# confusion_matrix orders rows/columns by ascending label and class_map
# encodes bad=0, good=1, so the tick labels must read ['Bad', 'Good'].
# The previous ['Good', 'Bad'] labelled every cell with the wrong class.
plot_confusion_matrix(cm, classes=['Bad', 'Good'], title=model_2.__class__.__name__)
# Replaces the title set by the helper above.
plt.title(f'Confusion Matrix for KNN with Single Set of Parameters')
plt.savefig('confusion_matrix_knn_2.png')
plt.show()

In [None]:
# Decision tree
# Train a cost-complexity-pruned decision tree, record its accuracies and
# report test metrics.
# tree_params = {
#     'criterion': 'gini',
#     'max_depth': 10,
#     'min_samples_leaf': 2,
#     'min_samples_split': 2,
#     'ccp_alpha': 0.01
# }
tree_params = {
    'criterion': 'entropy',
    'max_depth': 15,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'ccp_alpha': 0.01
}
model_3 = DecisionTreeClassifier(**tree_params, random_state=42)
train_acc, test_acc, y_pred = evaluate_model(model_3, X_train, y_train, X_test, y_test)
train_results.append(train_acc)
test_results.append(test_acc)
cm = confusion_matrix(y_test, y_pred)
acc_dt = accuracy_score(y_test, y_pred)
f1_dt = f1_score(y_test, y_pred)
clf_dt = classification_report(y_test, y_pred)

print('***********Decision Tree with Pruned Parameters and Early Stopping***********')
print('\n')
print('Accuracy : ', acc_dt)
print('F1 Score : ', f1_dt)
print(10 * '=====')
print('Confusion Matrix :\n', cm)
print(10 * '=====')
print('Classification Report :\n', clf_dt)
print(30 * '========')

# confusion_matrix orders rows/columns by ascending label and class_map
# encodes bad=0, good=1, so the tick labels must read ['Bad', 'Good'].
# The previous ['Good', 'Bad'] labelled every cell with the wrong class.
plot_confusion_matrix(cm, classes=['Bad', 'Good'], title=model_3.__class__.__name__)
# Replaces the title set by the helper above.
plt.title(f'Confusion Matrix for Pruned Parameters and Early Stopping')
plt.savefig('confusion_matrix_pruned_early_stopping_2.png')
plt.show()


In [None]:
# Boosting with AdaBoost
# Train AdaBoost over depth-3 decision trees, record its accuracies and
# report test metrics.
base_classifier = DecisionTreeClassifier(max_depth=3)

model_4 = AdaBoostClassifier(
    base_classifier,
    n_estimators=100,
    learning_rate=0.1, # 0.1
    random_state=42
)

train_acc, test_acc, y_pred = evaluate_model(model_4, X_train, y_train, X_test, y_test)
train_results.append(train_acc)
test_results.append(test_acc)
cm = confusion_matrix(y_test, y_pred)
acc_b = accuracy_score(y_test, y_pred)
f1_b = f1_score(y_test, y_pred)
clf_b = classification_report(y_test, y_pred)

print('***********AdaBoost***********')
print('\n')
print('Accuracy : ', acc_b)
print('F1 Score : ', f1_b)
print(10 * '=====')
print('Confusion Matrix :\n', cm)
print(10 * '=====')
print('Classification Report :\n', clf_b)
print(30 * '========')

# confusion_matrix orders rows/columns by ascending label and class_map
# encodes bad=0, good=1, so the tick labels must read ['Bad', 'Good'].
# The previous ['Good', 'Bad'] labelled every cell with the wrong class.
plot_confusion_matrix(cm, classes=['Bad', 'Good'], title=model_4.__class__.__name__)
plt.savefig('confusion_matrix AdaBoost 2.png')
plt.show()

In [None]:
# Neural Network
# Train scikit-learn's MLPClassifier with one 60-unit hidden layer, record its
# accuracies and report test metrics.
model_5 = MLPClassifier(hidden_layer_sizes =  (60,),learning_rate_init = 0.01, random_state=42, alpha=0.001)
# model_5 = MLPClassifier(hidden_layer_sizes =  (40,),learning_rate_init = 0.001)

train_acc, test_acc, y_pred = evaluate_model(model_5, X_train, y_train, X_test, y_test)
train_results.append(train_acc)
test_results.append(test_acc)
cm = confusion_matrix(y_test, y_pred)
acc_nn=accuracy_score(y_test,y_pred)
f1_nn=f1_score(y_test,y_pred)
clf_nn=classification_report(y_test,y_pred)

print('***********Neural Network***********')
print('\n')
print('Accuracy : ',acc_nn)
print('F1 Score : ',f1_nn)
print(10*'=====')
print('Confusion Matrix :\n',cm)
print(10*'=====')
print('Classification Report :\n',clf_nn)
print(30*'========')

# confusion_matrix orders rows/columns by ascending label and class_map
# encodes bad=0, good=1, so the tick labels must read ['Bad', 'Good'].
# The previous ['Good', 'Bad'] labelled every cell with the wrong class.
plot_confusion_matrix(cm, classes=['Bad', 'Good'], title=model_5.__class__.__name__)
plt.savefig('confusion_matrix_mlp2.png')
plt.show()

In [None]:
# Neural Network
class NeuralNetwork(tf.keras.Model):
    """Small feed-forward binary classifier.

    Architecture: one 100-unit ReLU hidden layer followed by a single
    sigmoid output unit.
    """

    def __init__(self):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(units=100, activation='relu')
        self.dense2 = tf.keras.layers.Dense(units=1, activation='sigmoid')

    def call(self, inputs):
        """Run *inputs* through the hidden layer and return the sigmoid output."""
        hidden = self.dense1(inputs)
        output = self.dense2(hidden)
        return output

# Build, train and evaluate the custom Keras network.
model = NeuralNetwork()
learning_rate = 0.01
custom_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

model.compile(optimizer=custom_optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# The timed span covers training plus the test-set evaluation.
start_time = time.time()
# 20% of the training data is held out by Keras as a validation split.
history = model.fit(X_train, y_train, epochs=20, validation_split=0.2)

loss, accuracy = model.evaluate(X_test, y_test)

end_time = time.time()
execution_time = end_time - start_time
execution_times.append(execution_time)

# Learning curves: loss on the left, accuracy on the right.
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Training Loss', color='b')
plt.plot(history.history['val_loss'], label='Validation Loss', color='r')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Training Accuracy', color='b')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy', color='r')
plt.axhline(y=accuracy, color='g', linestyle='-', label='Test Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.tight_layout()
plt.show()

final_val_loss = history.history['val_loss'][-1]
final_val_accuracy = history.history['val_accuracy'][-1]
final_train_accuracy = history.history['accuracy'][-1]

print("Validation Loss:", final_val_loss)
print("Validation Accuracy:", final_val_accuracy)

# train_results is plotted as "Training Accuracy" in the model comparison and
# holds training accuracy for every other model, so the last-epoch TRAINING
# accuracy is stored here (previously the validation accuracy was appended).
train_results.append(final_train_accuracy)
test_results.append(accuracy)

print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

# Threshold the sigmoid outputs at 0.5 to obtain hard class labels.
y_pred = model.predict(X_test)
y_pred_binary = (y_pred >= 0.5).astype(int)

cm = confusion_matrix(y_test, y_pred_binary)
acc_Cnn = accuracy_score(y_test, y_pred_binary)
f1_Cnn = f1_score(y_test, y_pred_binary)
clf_Cnn = classification_report(y_test, y_pred_binary)

print('***********Custom Neural Network ***********')
print('\n')
print('Accuracy : ', acc_Cnn)
print('F1 Score : ', f1_Cnn)
print(10 * '=====')
print('Confusion Matrix :\n', cm)
print(10 * '=====')
print('Classification Report :\n', clf_Cnn)
print(30 * '========')

# confusion_matrix orders rows/columns by ascending label and class_map
# encodes bad=0, good=1, so the tick labels must read ['Bad', 'Good'].
# The previous ['Good', 'Bad'] labelled every cell with the wrong class.
plot_confusion_matrix(cm, classes=['Bad', 'Good'], title=model.__class__.__name__)
plt.savefig('Custom NN confusion_matrix2.png')
plt.show()


MODEL COMPARISON


In [None]:
# Line plot comparing the recorded train and test accuracy of every model.
# The names are listed in the order the models were evaluated above.
model_names = ["SVM","KNN", "Decision Trees","AdaBoost","MLPClassifier","Custom Neural_Network"]

plt.figure(figsize=(10,5))
for scores, legend_label, line_color in (
    (train_results, "Training Accuracy", 'b'),
    (test_results, "Testing Accuracy", 'r'),
):
    plt.plot(model_names, scores, 'o-', label=legend_label, color=line_color)
plt.title("Model Comparisons - Accuracy")
plt.ylabel("Accuracy")
plt.xticks(rotation=45)
plt.legend()
plt.savefig("Model Comparisons - Accuracy2")
plt.show()

In [None]:
# Line plot of the elapsed time recorded for each model, in evaluation order.
model_names = [ "SVM","KNN_1", "Decision Trees","AdaBoost","MLP Classifier","Custom Neural_Network"]

plt.figure(figsize=(10,5))
plt.plot(model_names, execution_times, 'o-', color='b', label="Wall Clock Time")
plt.title("Model Comparisons - Wall clock time")
plt.ylabel("Time")
plt.xticks(rotation=45)
plt.legend()
plt.savefig("Model Comparisons - Wall clock time2")
plt.show()

In [None]:
# Summary table with the test accuracy and F1 score of every model.
tbl = pd.DataFrame({
    'Model': ['SVM', 'Decision Tree', 'KNN', 'AdaBoost', 'MLP Classifier', 'Custom Neural Network'],
    'Accuracy': [acc_SVM, acc_dt, acc_knn, acc_b, acc_nn, acc_Cnn],
    'F1_Score': [f1_SVM, f1_dt, f1_knn, f1_b, f1_nn, f1_Cnn],
})
# Displayed as the cell output; `tbl` itself keeps its default integer index.
tbl.set_index('Model')