In [None]:
import helper_cleaning
# Data Visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Machine Learning
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from custom_confusion_matrix import make_confusion_matrix
from scipy.stats import ks_2samp, kstest
# Oversampling (imbalanced-learn)
from imblearn.over_sampling import RandomOverSampler
from sklearn.inspection import permutation_importance
import warnings
warnings.filterwarnings('ignore') 

In [None]:
# Column names for the headerless data file: 20 attribute columns followed by
# the target column "pred".
cols = [
    "Status",
    "Duration_in_month",
    "Credit_history",
    "Purpose",
    "Credit_amount",
    "Savings_account",
    "Present_employment_since",
    "Installment_rate",
    "Personal_status",
    "Other_debtors",
    "Present_residence",
    "Property",
    "Age",
    "Other_installment",
    "Housing",
    "Number_of_existing_credits",
    "Job",
    "Number_of_people",
    "Telephone",
    "foreign_worker",
    "pred",
]
len(cols)

In [None]:
# The file is space-separated and has no header row, so the column names are
# supplied from `cols`.
dataframe = pd.read_csv("german.data", names=cols, header=None, sep=' ')

In [None]:
# Preview the first rows to check how the columns were parsed.
dataframe.head()

In [None]:
# Recode the target: original label 1 becomes 0 and label 2 becomes 1.
# NOTE: this overwrites the column in place; re-running the cell on already
# recoded data would map 0 to NaN because 0 is not a key of the mapping.
target_recode = {1: 0, 2: 1}
dataframe["pred"] = dataframe["pred"].map(target_recode)

In [None]:
# Separate the target column from the feature columns.
y = dataframe["pred"]
X = dataframe.drop(columns=["pred"])

In [None]:
# Random over-sampling duplicates minority-class rows until both classes have
# the same count. The sampler is seeded so the duplicated rows (and everything
# computed from them) are the same on every Restart & Run All.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)

X_over, y_over = oversample.fit_resample(X, y)
# Class counts after resampling (index 0 = label 0, index 1 = label 1).
print(np.bincount(y_over))

In [None]:
# Split the ORIGINAL data first. Splitting the already over-sampled data puts
# duplicated copies of the same minority rows into both the training and the
# test set, which leaks test information and inflates every reported metric.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=y,
                                                    shuffle=True)

# Balance the classes on the training portion only; the test set keeps the
# class ratio of the original data.
X_train, y_train = oversample.fit_resample(X_train, y_train)

In [None]:
# Partition the feature columns by dtype: numeric columns get scaled, all
# remaining columns get one-hot encoded.
numerical_features = list(X_train.select_dtypes(include='number').columns)
categorical_features = list(X_train.select_dtypes(exclude='number').columns)

In [None]:
# Numeric columns: rescale with MinMaxScaler.
numeric_pipeline = Pipeline(steps=[
    ('Scale', MinMaxScaler())
])

# Categorical columns: dense one-hot encoding; categories unseen during fit are
# ignored at transform time. The `sparse` keyword was renamed to `sparse_output`
# in scikit-learn 1.2 and removed in 1.4, so try the new name first and fall
# back to the old one on releases that do not know it.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

categorical_pipeline = Pipeline(steps=[
    ('One-Hot', one_hot_encoder)
])

In [None]:
# Route each group of columns through its own preprocessing pipeline. The
# transformer names "number" and "category" are passed again later when the
# output feature names are looked up.
full_processor = ColumnTransformer(
    transformers=[
        ('number', numeric_pipeline, numerical_features),
        ('category', categorical_pipeline, categorical_features),
    ]
)

In [None]:
# Fit the preprocessing on the training split only, then apply the already
# fitted transformer to the test split.
X_train_transformed = full_processor.fit_transform(X_train)
X_test_transformed = full_processor.transform(X_test)

## Train Classifiers

In [None]:
# Candidate models, mostly with library defaults; a seed is set wherever the
# estimator accepts one.

# Linear / kernel models. probability=True makes SVC expose predict_proba,
# which the AUC/KS evaluation calls.
lr_clf = LogisticRegression(random_state=42)
svm_clf = SVC(random_state=42, probability=True)

# Tree-based models.
dt_clf = DecisionTreeClassifier(random_state=42)
rf_clf = RandomForestClassifier(random_state=42)

# Generative / discriminant-analysis models.
nb_clf = GaussianNB()
lda_clf = LinearDiscriminantAnalysis()
qda_clf = QuadraticDiscriminantAnalysis()

# Instance-based model.
knn_clf = KNeighborsClassifier()

# Small neural network with two hidden layers of 5 and 2 units.
ann_clf = MLPClassifier(solver='adam', hidden_layer_sizes=(5, 2), random_state=1, max_iter=1000)

In [None]:
# Keep every display name next to its model so the two parallel lists below
# cannot get out of step.
classifier_registry = [
    ("SVC", svm_clf),
    ("LogisticRegression", lr_clf),
    ("RandomForestClassifier", rf_clf),
    ("DecisionTreeClassifier", dt_clf),
    ("GaussianNB", nb_clf),
    ("QuadraticDiscriminantAnalysis", qda_clf),
    ("LinearDiscriminantAnalysis", lda_clf),
    ("MLPClassifier", ann_clf),
    ("KNeighborsClassifier", knn_clf),
]
list_of_classifiers = [model for _, model in classifier_registry]
name_of_classifiers = [label for label, _ in classifier_registry]

In [None]:
# Cross-validate every model on the transformed training data (default
# splitter and default scorer) and record the mean and standard deviation of
# the fold scores, rounded to two decimals.
results_mean, results_std = {}, {}
for name, classifier in zip(name_of_classifiers, list_of_classifiers):
    fold_scores = cross_val_score(classifier, X_train_transformed, y_train)
    results_mean[name] = np.round(fold_scores.mean(), decimals=2)
    results_std[name] = np.round(fold_scores.std(), decimals=2)

In [None]:
# Bar chart of the mean cross-validated score per model, with the fold
# standard deviation drawn as error bars.
fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(results_mean.keys(), results_mean.values(), yerr=results_std.values(), color="gray")

# Write each bar's value slightly below the top of the bar.
for xloc, mean_score in zip(ax.get_xticks(), results_mean.values()):
    ax.text(xloc, mean_score - 0.05, str(mean_score), ha='center', color="white")

ax.tick_params(axis="x", labelrotation=90)
ax.set_xlabel("Models")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy Score")
ax.set_ylim(0.5, 0.9)
fig.tight_layout()
plt.show()

In [None]:
# Fit every model on the full training split and evaluate it on the test split
# with two ranking metrics:
#   * ROC AUC of the predicted probability of class 1
#   * KS statistic: the two-sample Kolmogorov-Smirnov distance between the
#     score distribution of the actual class-1 rows and that of the actual
#     class-0 rows (larger = better separation).
# Comparing the raw 0/1 labels against the scores, as a plain kstest on
# (y_test, scores) does, measures something else and is not a separation metric.
auc_scores = []
ks = []
y_test_labels = np.asarray(y_test)  # positional array so the masks below align with the scores
for classifier, name in zip(list_of_classifiers, name_of_classifiers):
    clf = classifier.fit(X_train_transformed, y_train)
    # Probability of class 1, computed once and reused for both metrics.
    positive_scores = clf.predict_proba(X_test_transformed)[:, 1]
    auc_scores.append(roc_auc_score(y_test, positive_scores))
    ks_score = ks_2samp(positive_scores[y_test_labels == 1],
                        positive_scores[y_test_labels == 0])
    ks.append(ks_score.statistic)

In [None]:
# Bar chart of the test-set ROC AUC per model. The bars are labelled with
# name_of_classifiers because auc_scores was built in that order.
fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(name_of_classifiers, auc_scores, color="lightcoral")

# Write each bar's value (two decimals) slightly below the top of the bar.
for xloc, auc_value in zip(ax.get_xticks(), np.round(auc_scores, decimals=2)):
    ax.text(xloc, auc_value - 0.03, str(auc_value), ha='center', color="white")

ax.tick_params(axis="x", labelrotation=90)
ax.set_xlabel("Models")
# The plotted quantity is AUC, not accuracy.
ax.set_ylabel("AUC")
ax.set_title("AUC Score")
ax.set_ylim(0.5, 1)
fig.tight_layout()
plt.show()

In [None]:
# Bar chart of the test-set KS statistic per model. The bars are labelled with
# name_of_classifiers because ks was built in that order.
fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(name_of_classifiers, ks, color="lightblue")

# Write each bar's value (three decimals) slightly below the top of the bar.
for xloc, ks_value in zip(ax.get_xticks(), np.round(ks, decimals=3)):
    ax.text(xloc, ks_value - 0.03, str(ks_value), ha='center', color="black")

ax.tick_params(axis="x", labelrotation=90)
ax.set_xlabel("Models")
# The plotted quantity is the KS statistic, not an accuracy.
ax.set_ylabel("KS statistic")
ax.set_title("KS Score")
fig.tight_layout()
plt.show()

In [None]:
from collections import defaultdict

from scipy.stats import spearmanr
from scipy.cluster import hierarchy

In [None]:
# Impurity-based (MDI) importances of the random forest. rf_clf must already be
# fitted at this point; the AUC/KS evaluation loop fits every model in
# list_of_classifiers, which includes rf_clf.
importances = rf_clf.feature_importances_

# Output column names of the fitted ColumnTransformer, looked up through the
# project helper using the transformer names "number" and "category".
# NOTE(review): the helper presumably returns the names in transformed-column
# order - confirm, since they are used to label the importances.
feature_names = helper_cleaning.get_feature_names_mixture_column_transformer(
    full_processor, categorical_features, "number", "category"
)

In [None]:
# Label the importances with their feature names and order them from most to
# least important.
forest_importances = pd.Series(importances, index=feature_names).sort_values(ascending=False)
# 1 where the importance exceeds 0.07, else 0 (used to highlight bars).
importance_mask = np.where(forest_importances > .07, 1, 0)

In [None]:
# One bar color per feature: highlighted when the mask flags it, gray otherwise.
color_important = ["lightcoral" if flag == 1 else "gray" for flag in importance_mask]

In [None]:
# Bar chart of the sorted MDI importances, with the flagged features highlighted.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(
    x=forest_importances.index,
    y=forest_importances.values,
    palette=color_important,
    ax=ax,
)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Importance")
ax.tick_params(axis="x", labelrotation=90)
fig.tight_layout()
plt.show()

In [None]:
# Permutation importance of the fitted random forest, measured on the training
# data with 10 shuffles per feature.
result = permutation_importance(
    rf_clf,
    X_train_transformed,
    y_train,
    n_repeats=10,
    random_state=42,
)

# Mean score decrease per feature, ordered from largest to smallest.
forest_importances_permutation = pd.Series(
    result.importances_mean, index=feature_names
).sort_values(ascending=False)

In [None]:
# Flag features whose PERMUTATION importance exceeds 0.03. The mask has to be
# built from forest_importances_permutation (the series that is plotted with
# these colors); building it from the MDI series would color the bars by a
# different metric in a different order.
permutation_importance_mask = np.where(forest_importances_permutation > .03, 1, 0)
# One bar color per feature: highlighted when flagged, gray otherwise.
color_important_permutation = ["lightcoral" if important == 1 else "gray"
                               for important in permutation_importance_mask]

In [None]:
# Bar chart of the sorted permutation importances.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(
    x=forest_importances_permutation.index,
    y=forest_importances_permutation.values,
    palette=color_important_permutation,
    ax=ax,
)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
ax.tick_params(axis="x", labelrotation=90)
fig.tight_layout()
plt.show()

In [None]:
# Spearman rank correlation between every pair of transformed features, then
# Ward hierarchical clustering built from that matrix.
# NOTE(review): hierarchy.ward receives the square correlation matrix itself,
# not a condensed distance matrix - confirm this is the intended input.
corr = spearmanr(X_train_transformed).correlation
corr_linkage = hierarchy.ward(corr)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))

# Left panel: dendrogram of the feature clustering.
dendro = hierarchy.dendrogram(
    corr_linkage,
    labels=feature_names,
    ax=ax1,
    leaf_rotation=90,
)
leaf_order = dendro['leaves']
leaf_labels = dendro['ivl']
dendro_idx = np.arange(0, len(leaf_labels))

# Right panel: correlation matrix with rows and columns reordered to follow
# the dendrogram leaves.
ax2.imshow(corr[leaf_order, :][:, leaf_order])
ax2.set_xticks(dendro_idx)
ax2.set_yticks(dendro_idx)
ax2.set_xticklabels(leaf_labels, rotation='vertical')
ax2.set_yticklabels(leaf_labels)
fig.tight_layout()
plt.show()

In [None]:
# Cut the clustering at distance 2 and group the feature column indices by the
# cluster they fall into.
cluster_ids = hierarchy.fcluster(corr_linkage, t=2, criterion='distance')
cluster_id_to_feature_ids = defaultdict(list)
for feature_idx, cluster_label in enumerate(cluster_ids):
    cluster_id_to_feature_ids[cluster_label].append(feature_idx)

# Keep one representative per cluster: the first feature index that was seen.
selected_features = [members[0] for members in cluster_id_to_feature_ids.values()]

# Restrict both splits to the representative columns.
X_train_sel = X_train_transformed[:, selected_features]
X_test_sel = X_test_transformed[:, selected_features]

# Refit a random forest on the reduced feature set and report its test accuracy.
clf_sel = RandomForestClassifier(n_estimators=100, random_state=42)
clf_sel.fit(X_train_sel, y_train)
reduced_accuracy = clf_sel.score(X_test_sel, y_test)
print("Accuracy on Validation data with features removed: {:.2f} %".format(
      reduced_accuracy * 100))

In [None]:
# Number of features kept after choosing one representative per cluster.
print("Selected Features: {}".format(len(selected_features)))

In [None]:
# Number of columns produced by the preprocessing, before feature selection.
print("Original Features: {}".format(X_train_transformed.shape[1]))

In [None]:
!pip install mlxtend

# DT, ADA, RF

In [None]:
from mlxtend.classifier import StackingCVClassifier
# AdaBoostClassifier is used below but is not among the sklearn.ensemble names
# imported at the top of the notebook; without this import the cell raises NameError.
from sklearn.ensemble import AdaBoostClassifier


# Fresh (unfitted) base learners for the stacking experiments. These rebind the
# earlier svm_clf / knn_clf / rf_clf / dt_clf names.
svm_clf = SVC(random_state=42, probability=True)
knn_clf = KNeighborsClassifier(n_neighbors=5)
rf_clf = RandomForestClassifier(random_state=1)
ada_clf = AdaBoostClassifier(random_state=1)
dt_clf = DecisionTreeClassifier(random_state=1,
                              max_depth=None)

lr = LogisticRegression(random_state=1)

# Stack 1: decision tree, AdaBoost and random forest as base learners, with a
# decision tree as meta-learner trained on 10-fold out-of-fold predictions.
# random_state seeds the shuffled CV split so the result is reproducible.
sclf_1 = StackingCVClassifier(classifiers=[dt_clf, ada_clf, rf_clf],
                          meta_classifier=dt_clf, cv=10, random_state=42)


sclf_1.fit(X_train_sel, y_train)
print("Train Accuracy: %0.4f" % sclf_1.score(X_train_sel, y_train))
# Score the test split once and reuse the value for printing and storing.
dt_ada_rf_acc = sclf_1.score(X_test_sel, y_test)
print("Test Accuracy: %0.4f" % dt_ada_rf_acc)

# SVM, RF, DT

In [None]:
# Stack 2: SVM and random forest as base learners, decision tree as
# meta-learner, 10-fold out-of-fold predictions. random_state seeds the
# shuffled CV split so the result is reproducible.
sclf_2 = StackingCVClassifier(classifiers=[svm_clf, rf_clf],
                          meta_classifier=dt_clf, cv=10, random_state=42)


sclf_2.fit(X_train_sel, y_train)
print("Train Accuracy: %0.4f" % sclf_2.score(X_train_sel, y_train))
# Score the test split once and reuse the value for printing and storing.
svm_rf_dt = sclf_2.score(X_test_sel, y_test)
print("Test Accuracy: %0.4f" % svm_rf_dt)

# LDA, QDA, ANN

In [None]:
# Stack 3: LDA and QDA as base learners, the small MLP as meta-learner,
# 10-fold out-of-fold predictions. random_state seeds the shuffled CV split so
# the result is reproducible.
sclf_3 = StackingCVClassifier(classifiers=[lda_clf, qda_clf],
                          meta_classifier=ann_clf, cv=10, random_state=42)


sclf_3.fit(X_train_sel, y_train)
print("Train Accuracy: %0.4f" % sclf_3.score(X_train_sel, y_train))
# Score the test split once and reuse the value for printing and storing.
lda_qda_ann = sclf_3.score(X_test_sel, y_test)
print("Test Accuracy: %0.4f" % lda_qda_ann)

In [None]:
# Evaluate the three stacked models on the reduced-feature test split.
# The stacks were already fitted in the cells that define them, so they are
# reused here instead of being fitted a second time; this keeps AUC/KS
# consistent with the accuracies printed for the same fitted models.
stacking_classifiers = [sclf_1, sclf_2, sclf_3]
name_of_stacking = ["DT, ADA, RF", "SVM, RF, DT", "LDA, QDA, ANN"]
stacking_auc_scores = []
stacking_ks = []
y_test_labels = np.asarray(y_test)  # positional array so the masks below align with the scores
for classifier in stacking_classifiers:
    # Probability of class 1, computed once and reused for both metrics.
    positive_scores = classifier.predict_proba(X_test_sel)[:, 1]
    stacking_auc_scores.append(roc_auc_score(y_test, positive_scores))
    # KS statistic: two-sample distance between the scores of the actual
    # class-1 rows and the scores of the actual class-0 rows.
    ks_score = ks_2samp(positive_scores[y_test_labels == 1],
                        positive_scores[y_test_labels == 0])
    stacking_ks.append(ks_score.statistic)

In [None]:
# Bar chart of the test-set ROC AUC of each stacked model.
fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(name_of_stacking, stacking_auc_scores, color="lightcoral")

# Write each bar's value (two decimals) slightly below the top of the bar.
for xloc, auc_value in zip(ax.get_xticks(), np.round(stacking_auc_scores, decimals=2)):
    ax.text(xloc, auc_value - 0.03, str(auc_value), ha='center', color="white")

ax.tick_params(axis="x", labelrotation=90)
ax.set_xlabel("Models")
# The plotted quantity is AUC, not accuracy.
ax.set_ylabel("AUC")
# Take the feature count from the data so the title cannot go stale when the
# feature selection changes.
ax.set_title("{} Features Stacking AUC Score".format(len(selected_features)))
ax.set_ylim(0.5, 1)
fig.tight_layout()
plt.show()

In [None]:
# Bar chart of the test-set KS statistic of each stacked model.
fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(name_of_stacking, stacking_ks, color="lightblue")

# Write each bar's value (four decimals) slightly below the top of the bar.
for xloc, ks_value in zip(ax.get_xticks(), np.round(stacking_ks, decimals=4)):
    ax.text(xloc, ks_value - 0.03, str(ks_value), ha='center', color="black")

ax.tick_params(axis="x", labelrotation=90)
ax.set_xlabel("Models")
# The plotted quantity is the KS statistic, not an accuracy.
ax.set_ylabel("KS statistic")
# Take the feature count from the data so the title cannot go stale when the
# feature selection changes.
ax.set_title("{} Features Stacking KS Score".format(len(selected_features)))
fig.tight_layout()
plt.show()