In [10]:
# Import libraries
import pandas as pd
import numpy as np
import warnings
## Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Configure libraries
warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize'] = (10, 10)
plt.style.use('seaborn')

In [11]:
# Load dataset
df_bank = pd.read_csv('loan_data_set2.csv')

# print(df_bank.info())
print('Shape of dataframe:', df_bank.shape)
df_bank.head()

Shape of dataframe: (614, 12)


Unnamed: 0,Loa0_ID,Ge0der,Married,Depe0de0ts,Self_Emplo1ed,Applica0tI0come,Coapplica0tI0come,Loa0Amou0t,Loa0_Amou0t_Term,Credit_Histor1,Propert1_Area,Loa0_Status
0,LP001002,1.0,0.0,0.0,0.0,5849,0.0,,360.0,1.0,1,1
1,LP001003,1.0,1.0,1.0,0.0,4583,1508.0,128.0,360.0,1.0,3,0
2,LP001005,1.0,1.0,0.0,1.0,3000,0.0,66.0,360.0,1.0,1,1
3,LP001006,1.0,1.0,0.0,0.0,2583,2358.0,120.0,360.0,1.0,1,1
4,LP001008,1.0,0.0,0.0,0.0,6000,0.0,141.0,360.0,1.0,1,1


In [12]:
# class distribution
df_bank['Married'].value_counts()

1.0    398
0.0    213
Name: Married, dtype: int64

In [13]:
# handling missing values
df_bank.isnull().sum()

Loa0_ID               0
Ge0der               13
Married               3
Depe0de0ts           15
Self_Emplo1ed        32
Applica0tI0come       0
Coapplica0tI0come     0
Loa0Amou0t           22
Loa0_Amou0t_Term     14
Credit_Histor1       50
Propert1_Area         0
Loa0_Status           0
dtype: int64

In [14]:
# scaling numeric data
from sklearn.preprocessing import StandardScaler

# Copying original dataframe
df_bank_ready = df_bank.copy()

scaler = StandardScaler()
num_cols = ['Ge0der', 'Married', 'Depe0de0ts', 'Self_Emplo1ed', 'Applica0tI0come', 'Coapplica0tI0come','Loa0Amou0t']
df_bank_ready[num_cols] = scaler.fit_transform(df_bank[num_cols])

df_bank_ready.head()

Unnamed: 0,Loa0_ID,Ge0der,Married,Depe0de0ts,Self_Emplo1ed,Applica0tI0come,Coapplica0tI0come,Loa0Amou0t,Loa0_Amou0t_Term,Credit_Histor1,Propert1_Area,Loa0_Status
0,LP001002,0.47858,-1.366947,-0.752131,-0.404969,0.072991,-0.554487,,360.0,1.0,1,1
1,LP001003,0.47858,0.731557,0.233704,-0.404969,-0.134412,-0.038732,-0.215309,360.0,1.0,3,0
2,LP001005,0.47858,0.731557,-0.752131,2.469324,-0.393747,-0.554487,-0.940328,360.0,1.0,1,1
3,LP001006,0.47858,0.731557,-0.752131,-0.404969,-0.462062,0.25198,-0.30886,360.0,1.0,1,1
4,LP001008,0.47858,-1.366947,-0.752131,-0.404969,0.097728,-0.554487,-0.063289,360.0,1.0,1,1


In [16]:
# Encoding categorical data
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False)
cat_cols = ['Ge0der', 'Married', 'Depe0de0ts', 'Self_Emplo1ed', 'Applica0tI0come', 'Coapplica0tI0come','Loa0Amou0t']

# Encode Categorical Data
df_encoded = pd.DataFrame(encoder.fit_transform(df_bank_ready[cat_cols]))

# Replace Categotical Data with Encoded Data
df_bank_ready = df_bank_ready.drop(cat_cols ,axis=1)
df_bank_ready = pd.concat([df_encoded, df_bank_ready], axis=1)

# Encode target value
df_bank_ready['Loa0_Status'] = df_bank_ready['Loa0_Status'].apply(lambda x: 1 if x == 'yes' else 0)

print('Shape of dataframe:', df_bank_ready.shape)
df_bank_ready.head()

Shape of dataframe: (614, 1015)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1005,1006,1007,1008,1009,Loa0_ID,Loa0_Amou0t_Term,Credit_Histor1,Propert1_Area,Loa0_Status
0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,LP001002,360.0,1.0,1,0
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,LP001003,360.0,1.0,3,0
2,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,LP001005,360.0,1.0,1,0
3,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,LP001006,360.0,1.0,1,0
4,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,LP001008,360.0,1.0,1,0


In [18]:
# Split dataset into training and testing

# Select Features
feature = df_bank_ready.drop('Loa0_Status', axis=1)

# Select Target
target = df_bank_ready['Loa0_Status']

# Set Training and Testing Data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(feature , target, shuffle = True, test_size=0.2, random_state=1)

# Show the Training and Testing Data
print('Shape of training feature:', X_train.shape)
print('Shape of testing feature:', X_test.shape)
print('Shape of training label:', y_train.shape)
print('Shape of training label:', y_test.shape)

Shape of training feature: (491, 1014)
Shape of testing feature: (123, 1014)
Shape of training label: (491,)
Shape of training label: (123,)


In [19]:
# Modelling

def evaluate_model(model, x_test, y_test):
    from sklearn import metrics

    # Predict Test Data 
    y_pred = model.predict(x_test)

    # Calculate accuracy, precision, recall, f1-score, and kappa score
    acc = metrics.accuracy_score(y_test, y_pred)
    prec = metrics.precision_score(y_test, y_pred)
    rec = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    kappa = metrics.cohen_kappa_score(y_test, y_pred)

    # Calculate area under curve (AUC)
    y_pred_proba = model.predict_proba(x_test)[::,1]
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
    auc = metrics.roc_auc_score(y_test, y_pred_proba)

    # Display confussion matrix
    cm = metrics.confusion_matrix(y_test, y_pred)

    return {'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'kappa': kappa, 
            'fpr': fpr, 'tpr': tpr, 'auc': auc, 'cm': cm}

In [20]:
from sklearn import tree

# Building Decision Tree model 
dtc = tree.DecisionTreeClassifier(random_state=0)
dtc.fit(X_train, y_train)

# Evaluate Model
dtc_eval = evaluate_model(dtc, X_test, y_test)

# Print result
print('Accuracy:', dtc_eval['acc'])
print('Precision:', dtc_eval['prec'])
print('Recall:', dtc_eval['rec'])
print('F1 Score:', dtc_eval['f1'])
print('Cohens Kappa Score:', dtc_eval['kappa'])
print('Area Under Curve:', dtc_eval['auc'])
print('Confusion Matrix:\n', dtc_eval['cm'])

TypeError: Feature names are only supported if all input features have string names, but your input has ['int', 'str'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.

In [21]:
from sklearn.ensemble import RandomForestClassifier

# Building Random Forest model 
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

# Evaluate Model
rf_eval = evaluate_model(rf, X_test, y_test)

# Print result
print('Accuracy:', rf_eval['acc'])
print('Precision:', rf_eval['prec'])
print('Recall:', rf_eval['rec'])
print('F1 Score:', rf_eval['f1'])
print('Cohens Kappa Score:', rf_eval['kappa'])
print('Area Under Curve:', rf_eval['auc'])
print('Confusion Matrix:\n', rf_eval['cm'])

TypeError: Feature names are only supported if all input features have string names, but your input has ['int', 'str'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.