In [85]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score # to split the data
from sklearn.metrics import f1_score, accuracy_score, log_loss, confusion_matrix, classification_report, fbeta_score #To evaluate our model
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer

# Algorithms (models) to be compared
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [23]:
import numpy as np
import pandas as pd 
# Load the German credit dataset; the CSV's first column becomes the row index.
DATA_PATH = "data/german_credit_data.csv"
df_credit = pd.read_csv(DATA_PATH, index_col=0)


In [25]:
# Swap the job codes 0-3 for descriptive labels via an explicit mapping.
job_labels = {
    0: 'unskilled/non-res',
    1: 'unskilled/res',
    2: 'skilled',
    3: 'highlyskilled',
}
df_credit['Job'] = df_credit['Job'].replace(job_labels)

In [61]:
# Features are every column except the target; the target column is
# label-encoded into integer class ids.
target_column = 'Risk'
X = df_credit.drop(columns=target_column)
risk_encoder = LabelEncoder()
y = risk_encoder.fit_transform(df_credit[target_column])

In [54]:
# Preview the first five rows of the feature frame.
X.head(n=5)

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,67,male,skilled,own,,little,1169,6,radio/TV
1,22,female,skilled,own,little,moderate,5951,48,radio/TV
2,49,male,unskilled/res,own,little,,2096,12,education
3,45,male,skilled,free,little,little,7882,42,furniture/equipment
4,53,male,skilled,free,little,little,4870,24,car


In [48]:
# Partition the feature columns by dtype: object columns are handled as
# categorical, every other column as numerical.
cat_features = X.select_dtypes(include=['object']).columns
num_features = X.select_dtypes(exclude=['object']).columns

In [50]:
# Numerical columns: fill missing values with the column median, then standardize.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: turn missing values into an explicit 'missing' category,
# then one-hot encode. handle_unknown='ignore' encodes a category that was not
# present at fit time as all zeros instead of raising during predict/transform
# (e.g. a rare category that is absent from the training split or a CV fold).
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer. The transformer names
# (including the 'catigorical' spelling) are kept unchanged because they are
# part of the nested parameter names used e.g. by grid search.
preprocessor = ColumnTransformer(
    [
        ("catigorical", cat_transformer, cat_features),
        ('numerical', num_transformer, num_features)
    ]
)

In [63]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train.shape, X_test.shape

((800, 9), (200, 9))

In [92]:
# NOTE: unused draft helper, kept commented out for reference.
# def model_evaluation(true, predicted):
#     accuracy=accuracy_score(true, predicted)
#     f1score=f1_score(true, predicted)
#     confusionmatrix=confusion_matrix(true,predicted)
    
#     return accuracy,f1score,confusionmatrix

In [66]:
# Candidate classifiers, keyed by the name used in the printed reports and the
# summary table. Estimators that use randomness while fitting get a fixed seed
# so that Restart & Run All reproduces the same scores; the other estimators
# keep their original settings.
SEED = 42
classifiers = {
    'LogisticRegression': LogisticRegression(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=SEED),
    'RandomForestClassifier': RandomForestClassifier(random_state=SEED),
    'SVC': SVC(gamma='auto'),
    'XGBClassifier': XGBClassifier(random_state=SEED),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=SEED),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=SEED),
}

In [109]:
# Model names in dictionary order, plus empty accumulators for the test-set metrics.
classifier_list = [*classifiers]
accuracy_score_list, f1_score_list = [], []

In [111]:
# Fit every candidate behind the shared preprocessing step and report
# train/test accuracy, F1 and the test-set confusion matrix.

# Reset the accumulators in this cell so that re-running it does not append a
# second set of scores to lists left over from an earlier run; the summary
# table zips these lists with the model names and would otherwise silently
# keep showing the first run's numbers.
accuracy_score_list = []
f1_score_list = []

# Iterate name/estimator pairs directly instead of rebuilding
# list(classifiers.values()) and indexing into it on every pass.
for classifier_name, classifier in classifiers.items():
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', classifier)])
    pipe.fit(X_train, y_train)
    y_train_pred = pipe.predict(X_train)
    y_test_pred = pipe.predict(X_test)

    # Evaluate Train and Test dataset.
    # f1_score is called with its defaults, so it scores the class encoded as 1.
    # NOTE(review): confirm that label 1 is the class of interest for this task.
    print('{} performanced on training set'.format(classifier_name))
    train_accuracy_score = accuracy_score(y_train, y_train_pred)
    print('train_accuracy_Score: {}'.format(train_accuracy_score))
    train_f1_score = f1_score(y_train, y_train_pred)
    print('train_F1_Score: {}'.format(train_f1_score))

    print('----------------------------------')

    print('Model performance for Test set')
    test_accuracy_score = accuracy_score(y_test, y_test_pred)
    print('test_accuracy_Score: {}'.format(test_accuracy_score))
    test_f1_score = f1_score(y_test, y_test_pred)
    print('test_F1_Score: {}'.format(test_f1_score))
    conf_matrix = confusion_matrix(y_test, y_test_pred)
    print(conf_matrix)

    # Only the test-set metrics are kept for the summary table.
    accuracy_score_list.append(test_accuracy_score)
    f1_score_list.append(test_f1_score)

    print('='*35)
    print('\n')


LogisticRegression performanced on training set
train_accuracy_Score: 0.74125
train_F1_Score: 0.8285004142502072
----------------------------------
Model performance for Test set
test_accuracy_Score: 0.76
test_F1_Score: 0.8421052631578947
[[ 24  35]
 [ 13 128]]


KNeighborsClassifier performanced on training set
train_accuracy_Score: 0.78625
train_F1_Score: 0.8561816652649286
----------------------------------
Model performance for Test set
test_accuracy_Score: 0.735
test_F1_Score: 0.8295819935691319
[[ 18  41]
 [ 12 129]]


DecisionTreeClassifier performanced on training set
train_accuracy_Score: 1.0
train_F1_Score: 1.0
----------------------------------
Model performance for Test set
test_accuracy_Score: 0.665
test_F1_Score: 0.7490636704119851
[[ 33  26]
 [ 41 100]]


RandomForestClassifier performanced on training set
train_accuracy_Score: 1.0
train_F1_Score: 1.0
----------------------------------
Model performance for Test set
test_accuracy_Score: 0.755
test_F1_Score: 0.83934426229



GradientBoostingClassifier performanced on training set
train_accuracy_Score: 0.875
train_F1_Score: 0.9156829679595279
----------------------------------
Model performance for Test set
test_accuracy_Score: 0.76
test_F1_Score: 0.8431372549019608
[[ 23  36]
 [ 12 129]]




In [113]:
# Combine model names with their test metrics and rank by accuracy, using F1
# as the tie-breaker (best first).
summary_rows = list(zip(classifier_list, accuracy_score_list, f1_score_list))
summary_columns = ['Model Name', 'Accuracy_Score', 'F1_Score']
results_summary = pd.DataFrame(summary_rows, columns=summary_columns)
results_summary.sort_values(by=["Accuracy_Score", "F1_Score"], ascending=False)

Unnamed: 0,Model Name,Accuracy_Score,F1_Score
7,GradientBoostingClassifier,0.76,0.843137
0,LogisticRegression,0.76,0.842105
3,RandomForestClassifier,0.755,0.839344
4,SVC,0.75,0.847561
5,XGBClassifier,0.745,0.828283
1,KNeighborsClassifier,0.735,0.829582
6,AdaBoostClassifier,0.735,0.828479
2,DecisionTreeClassifier,0.665,0.749064
