In [2]:
import helper_cleaning
# Data Visualization
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
# Machine Learning
from sklearn.svm import  SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from custom_confusion_matrix import make_confusion_matrix
from scipy.stats import ks_2samp, kstest
# SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.inspection import permutation_importance
from imblearn.under_sampling import OneSidedSelection
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.metrics import geometric_mean_score
from scipy.stats import spearmanr
from scipy.cluster import hierarchy
from collections import defaultdict
from scipy.stats.mstats import gmean
import math
import warnings
warnings.filterwarnings('ignore') 

In [3]:
cols = ["Status", "Duration_in_month", "Credit_history", "Purpose",
        "Credit_amount", "Savings_account", "Present_employment_since", "Installment_rate",
        "Personal_status", "Other_debtors", "Present_residence", "Property",
        "Age", "Other_installment", "Housing", "Number_of_existing_credits",
        "Job", "Number_of_people", "Telephone", "foreign_worker", "pred"]
len(cols)

21

In [4]:
dataframe = pd.read_csv("german.data", delimiter=' ', header=None, names=cols)

In [5]:
dataframe.head()

Unnamed: 0,Status,Duration_in_month,Credit_history,Purpose,Credit_amount,Savings_account,Present_employment_since,Installment_rate,Personal_status,Other_debtors,...,Property,Age,Other_installment,Housing,Number_of_existing_credits,Job,Number_of_people,Telephone,foreign_worker,pred
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [6]:
dataframe["pred"] = dataframe["pred"].map({1: 0, 2:1})

In [7]:
X = dataframe.drop(["pred"], axis=1)
y = dataframe["pred"]

In [8]:
oversample = RandomOverSampler(sampling_strategy='minority')
X_over, y_over = oversample.fit_resample(X, y)
print(np.bincount(y_over))
undersample=RandomUnderSampler(sampling_strategy='majority')
X_under, y_under = undersample.fit_resample(X, y)
print(np.bincount(y_under))
numeric_cols = dataframe.select_dtypes(include='number')
X = numeric_cols.drop(["pred"], axis=1)
y = numeric_cols["pred"]
# Apply One-Sided Selection
oss = OneSidedSelection(random_state=42)
oss.fit(X, y)
X_oss, y_oss = oss.fit_resample(X, y)
print(np.bincount(y_oss))
X_SMOTE, y_SMOTE = SMOTE(k_neighbors=5).fit_resample(X, y)
print(np.bincount(y_SMOTE))
X_ADASYN, y_ADASYN = ADASYN(n_neighbors=5).fit_resample(X, y)
print(np.bincount(y_ADASYN))
list_of_X=[X_over,X_under,X_oss,X_SMOTE,X_ADASYN]
list_of_y=[y_over,y_under,y_oss,y_SMOTE,y_ADASYN]
names = ["over","under","oss","smote","adasyn"]
resample_dic={}
for i in names:
    for X,y in zip(list_of_X,list_of_y):
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=y,
                                                    shuffle=True)
        numerical_features = X_train.select_dtypes(include='number').columns.tolist()
        categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()
        numeric_pipeline = Pipeline(steps=[('Scale', MinMaxScaler())])
        categorical_pipeline = Pipeline(steps=[('One-Hot', OneHotEncoder(handle_unknown='ignore', sparse=False))])
        full_processor = ColumnTransformer(transformers=[('number', numeric_pipeline, numerical_features),('category', categorical_pipeline, categorical_features)])
        X_train_transformed = full_processor.fit_transform(X_train)
        X_test_transformed = full_processor.transform(X_test)
        corr = spearmanr(X_train_transformed).correlation
        corr_linkage = hierarchy.ward(corr)
        cluster_ids = hierarchy.fcluster(corr_linkage,2, criterion='distance')
        cluster_id_to_feature_ids = defaultdict(list)
        for idx, cluster_id in enumerate(cluster_ids):
            cluster_id_to_feature_ids[cluster_id].append(idx)
        selected_features = [v[0] for v in cluster_id_to_feature_ids.values()]
        X_train_sel = X_train_transformed[:, selected_features]
        X_test_sel = X_test_transformed[:, selected_features]
        resample_dic[f"X_train_transformed-{i}"] = X_train_sel
        resample_dic[f"X_test_transformed-{i}"] = X_test_sel
        resample_dic[f"y_train-{i}"] = y_train
        resample_dic[f"y_test-{i}"] = y_test
        
        
        
        
    

[700 700]
[300 300]
[587 300]
[700 700]
[700 712]


## Train Classifiers

In [9]:
svm_clf = SVC(C=1.0, kernel='poly', tol=0.001)
lr_clf = LogisticRegression(random_state=42)
dt_clf = DecisionTreeClassifier(random_state=42)
rf_clf = RandomForestClassifier(random_state=42)
nb_clf = GaussianNB()
knn_clf = KNeighborsClassifier()
qda_clf = QuadraticDiscriminantAnalysis()
lda_clf = LinearDiscriminantAnalysis()
ann_clf = MLPClassifier(solver='adam', hidden_layer_sizes=(5, 2), random_state=1, max_iter=1000)
list_of_classifiers = [svm_clf,lr_clf,dt_clf,rf_clf,nb_clf,knn_clf,qda_clf,lda_clf,ann_clf]

In [10]:
name_of_classifiers = ["SVC", "LogisticRegression","RandomForestClassifier","DecisionTreeClassifier", "GaussianNB", "QuadraticDiscriminantAnalysis","LinearDiscriminantAnalysis", "MLPClassifier", "KNeighborsClassifier"]
y_pred_dic = {}
for i in names:
    for pred,cname in zip(list_of_classifiers,name_of_classifiers):
        #print(i)
        pred.fit(resample_dic.get(f"X_train_transformed-{i}"),resample_dic.get(f"y_train-{i}"))
        y_pred = pred.predict(resample_dic.get(f"X_test_transformed-{i}"))
        y_pred_dic[f"y_pred_{i}_{cname}"] = y_pred
        
        

In [21]:
cm_dic = {}
for k,cname in zip(y_pred_dic.keys(),name_of_classifiers):
    #print(k)
    for i in names:
        #print(i)
        cm = confusion_matrix(resample_dic.get(f"y_test-{i}"), y_pred_dic[k])
        cm_dic[f"cm_{i}_{cname}"] = cm
        
#print(cm_dic) 

cm2=[]
gmean_dic = {}
new_gmean = {}
for k in cm_dic.keys():
    #print(cm_dic)
    cm2 = cm_dic[k]
    TP = cm2[0][0]
    FN = cm2[1][0]
    FP = cm2[0][1]
    TN = cm2[1][1]
    TPR=TP/(TP+FN)
    TNR=TN/(TN+FP)
    AUC=(TPR+TNR)/2
    precision = TP/(TP+FP)
    arr1 = gmean([TPR,TNR])
    gmean_dic[f"gmean_{k}"] =arr1
    #(1+0.05*(TPR-TNR))*math.sqrt(TPR*TNR)

key_list = list(gmean_dic.keys())
value_list=list(gmean_dic.values())
print('classification    ',*names)
print()
svm_key_list=key_list[0:5]
svm_value_list=value_list[0:5]
#print(svm_value_list)
#print(svm_key_list)
print(gmean_dic.values())


classification     over under oss smote adasyn

dict_values([0.5929342992932694, 0.5929342992932694, 0.5929342992932694, 0.5929342992932694, 0.5929342992932694, 0.5456458150327773, 0.5456458150327773, 0.5456458150327773, 0.5456458150327773, 0.5456458150327773, 0.6456517757373194, 0.6456517757373194, 0.6456517757373194, 0.6456517757373194, 0.6456517757373194, 0.6266827743827366, 0.6266827743827366, 0.6266827743827366, 0.6266827743827366, 0.6266827743827366, 0.5471216549024156, 0.5471216549024156, 0.5471216549024156, 0.5471216549024156, 0.5471216549024156, 0.6165578357761262, 0.6165578357761262, 0.6165578357761262, 0.6165578357761262, 0.6165578357761262, 0.5471216549024156, 0.5471216549024156, 0.5471216549024156, 0.5471216549024156, 0.5471216549024156, 0.5386873388894104, 0.5386873388894104, 0.5386873388894104, 0.5386873388894104, 0.5386873388894104, 0.5456458150327773, 0.5456458150327773, 0.5456458150327773, 0.5456458150327773, 0.5456458150327773])
