In [1]:
%%html
<script>
  function code_toggle() {
    if (code_shown){
      $('div.input').hide('500');
      $('#toggleButton').val('Show Code')
    } else {
      $('div.input').show('500');
      $('#toggleButton').val('Hide Code')
    }
    code_shown = !code_shown
  }

  $( document ).ready(function(){
    code_shown=false;
    $('div.input').hide()
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>

In [2]:
%%html
<script>
  $(document).ready(function(){
    $('div.prompt').hide();
    $('div.back-to-top').hide();
    $('nav#menubar').hide();
    $('.breadcrumb').hide();
    $('.hidden-print').hide();
  });
</script>

<footer id="attribution" style="float:right; color:#999; background:#fff;">
</footer>

In [85]:
# !pip install pandas
# !pip install seaborn
# !pip install plotly
# !pip install psutil
# !pip install category_encoders
# !pip install chart_studio
# !pip install numpy Cython
# !pip install -U pymrmr
# !pip install kfda


In [86]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn
import plotly.io as pio
import plotly
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from scipy.spatial import distance
import category_encoders as ce
import os
import shutil
import math
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA, KernelPCA
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.manifold import TSNE
from sklearn.metrics import mean_squared_error
from scipy.cluster import hierarchy as hc
# for mRMR implementation
import pymrmr
# for kfda implementation
import kfda

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

if not os.path.exists("./report/images"):
    os.mkdir("./report/images")
images_folder = "./report/images"
colors = plotly.colors.DEFAULT_PLOTLY_COLORS

pd.set_option('display.max_columns', None) # prevents abbreviation (with '...') of columns in prints

SAVE_IMAGES = True
random_state = 42

In [5]:
def get_layout(title, height=1200, width=1100, y_title=None, x_title=None, barmode=None, showlegend=False, x_tickangle=0):
    return go.Layout(
    title=dict(text=title, xref="paper", x=0.5),
    autosize=True,
    width=width,
    height=height,
    margin=dict(t=50,l=10,b=10,r=10),
    showlegend=showlegend,
    yaxis=dict(
        title=y_title,
        automargin=True,
    ),
    xaxis=dict(
        title=x_title,
        tickangle=x_tickangle,
        automargin=True
    ),
    barmode=barmode,
)

def get_boxplot_trace(name, data, color = 'rgb(8,81,156)' , x=None, boxpoints='suspectedoutliers'):
    return go.Box(
        y=data,
        x=x,
        name=name,
        boxpoints=boxpoints,
        marker=dict(
        color=color,
        outliercolor='rgba(219, 64, 82, 0.6)',
        line=dict(
            outliercolor='rgba(219, 64, 82, 0.6)',
            outlierwidth=2)),
    line_color=color) 

# Reading csv

In [6]:
# reading csv
df = pd.read_csv('../SouthGermanCredit/SouthGermanCredit.asc')

# small changes to dataset
df["credit_history"] = df["credit_history"].map(lambda x : x+1)
df["purpose"] = df["purpose"].map(lambda x : x+1)
df["installment_rate"] = df["installment_rate"].map(lambda x: 5-x)
df["credit_risk"] = df["credit_risk"].map(lambda x : 1-x) # inverting class labels

# print data stats
df.describe()

Unnamed: 0,status,duration,credit_history,purpose,amount,savings,employment_duration,installment_rate,personal_status_sex,other_debtors,present_residence,property,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,credit_risk
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,2.577,20.903,3.545,3.828,3271.248,2.105,3.384,2.027,2.682,1.145,2.845,2.358,35.542,2.675,1.928,1.407,2.904,1.845,1.404,1.963,0.3
std,1.257638,12.058814,1.08312,2.744439,2822.75176,1.580023,1.208306,1.118715,0.70808,0.477706,1.103718,1.050209,11.35267,0.705601,0.530186,0.577654,0.653614,0.362086,0.490943,0.188856,0.458487
min,1.0,4.0,1.0,1.0,250.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,19.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,1.0,12.0,3.0,2.0,1365.5,1.0,3.0,1.0,2.0,1.0,2.0,1.0,27.0,3.0,2.0,1.0,3.0,2.0,1.0,2.0,0.0
50%,2.0,18.0,3.0,3.0,2319.5,1.0,3.0,2.0,3.0,1.0,3.0,2.0,33.0,3.0,2.0,1.0,3.0,2.0,1.0,2.0,0.0
75%,4.0,24.0,5.0,4.0,3972.25,3.0,5.0,3.0,3.0,1.0,4.0,3.0,42.0,3.0,2.0,2.0,3.0,2.0,2.0,2.0,1.0
max,4.0,72.0,5.0,11.0,18424.0,5.0,5.0,4.0,4.0,3.0,4.0,4.0,75.0,3.0,3.0,4.0,4.0,2.0,2.0,2.0,1.0


In [7]:
mappings = {1: "bad", 0:"good"}

y =  df["credit_risk"].value_counts()
data = [go.Bar(x=[mappings[x] for x in y.index], y=y.values, marker = dict(color = colors[:len(y.index)]))]
layout = get_layout("Credit risk", y_title='#samples', height=500, width=500)
fig = go.Figure(data=data, layout=layout)
fig.show()
if SAVE_IMAGES:
    pio.write_image(fig, f"{images_folder}/class.png")

# Encoding features

In [8]:
#binary encoding for "purpose" feature
encoder = ce.BinaryEncoder(cols=["purpose"], drop_invariant=True)
df_binary = encoder.fit_transform(df["purpose"])

# one-hot-enoding for the other categorical features with more than 2 labels
features = ["other_debtors", "other_installment_plans", "housing", "personal_status_sex"]
pre_df = pd.get_dummies(df, columns=features)
pre_df.drop("purpose", inplace=True, axis=1)
pre_df = pre_df.join(df_binary)

# other_debtors = 'none' is encoded with both other_debtors_1 and other_debtors_2 set to 0
pre_df.drop("other_debtors_1", inplace=True, axis=1)
pre_df.rename(columns={"other_debtors_2":"other_debtors_1", "other_debtors_3":"other_debtors_2"}, inplace=True)

# Trick to put the class feature as last column for convenience
pre_df = pre_df.join(pre_df.pop("credit_risk"))

pre_df.describe() 



is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead



Unnamed: 0,status,duration,credit_history,amount,savings,employment_duration,installment_rate,present_residence,property,age,number_credits,job,people_liable,telephone,foreign_worker,other_debtors_1,other_debtors_2,other_installment_plans_1,other_installment_plans_2,other_installment_plans_3,housing_1,housing_2,housing_3,personal_status_sex_1,personal_status_sex_2,personal_status_sex_3,personal_status_sex_4,purpose_1,purpose_2,purpose_3,purpose_4,credit_risk
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,2.577,20.903,3.545,3271.248,2.105,3.384,2.027,2.845,2.358,35.542,1.407,2.904,1.845,1.404,1.963,0.041,0.052,0.139,0.047,0.814,0.179,0.714,0.107,0.05,0.31,0.548,0.092,0.071,0.417,0.374,0.453,0.3
std,1.257638,12.058814,1.08312,2822.75176,1.580023,1.208306,1.118715,1.103718,1.050209,11.35267,0.577654,0.653614,0.362086,0.490943,0.188856,0.198389,0.222138,0.34612,0.211745,0.389301,0.383544,0.452115,0.309268,0.218054,0.462725,0.49794,0.289171,0.256953,0.49331,0.484106,0.498035,0.458487
min,1.0,4.0,1.0,250.0,1.0,1.0,1.0,1.0,1.0,19.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,12.0,3.0,1365.5,1.0,3.0,1.0,2.0,1.0,27.0,1.0,3.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,18.0,3.0,2319.5,1.0,3.0,2.0,3.0,2.0,33.0,1.0,3.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4.0,24.0,5.0,3972.25,3.0,5.0,3.0,4.0,3.0,42.0,2.0,3.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
max,4.0,72.0,5.0,18424.0,5.0,5.0,4.0,4.0,4.0,75.0,4.0,4.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Features distribution

In [9]:
numerical_features = ["age", "duration", "amount"]
titles = tuple(df.keys()[0:-1])
layout = get_layout("Features' distributions")
fig = make_subplots(rows=5, cols=4, subplot_titles=titles, vertical_spacing=0.04, horizontal_spacing=0.04)
for index,feature in enumerate(titles):
    if feature not in numerical_features:
        y = df[feature].value_counts()
        fig.add_trace(go.Bar(x=[x for x in y.index], y=y.values, marker = dict(color = colors[0]), name=feature) ,row=index//4+1, col=index%4+1)
    else:
        trace = go.Histogram(x=df[feature], nbinsx=20, marker = dict(color = colors[0]), name=feature )
        fig.add_trace(trace ,row=index//4+1, col=index%4+1)
fig.update_layout(layout)
fig.show()     
if SAVE_IMAGES: 
    fig.write_image(f"{images_folder}/distributions.png")

In [10]:
numerical_features = ["age", "duration", "amount"]
titles = tuple(pre_df.keys()[0:-1])
rows = math.ceil(len(titles)/4)
layout = get_layout("Features distribution", height=1200/4*rows)
fig = make_subplots(rows=rows, cols=4, subplot_titles=titles, vertical_spacing=0.03, horizontal_spacing=0.04)
for index,feature in enumerate(titles):
    if feature not in numerical_features:
        y = pre_df[feature].value_counts()
        fig.add_trace(go.Bar(x=[x for x in y.index], y=y.values, marker = dict(color = colors[0])) ,row=index//4+1, col=index%4+1)
    else:
        trace = go.Histogram(x=pre_df[feature], nbinsx=20, marker = dict(color = colors[0]) )
        fig.add_trace(trace ,row=index//4+1, col=index%4+1)
fig.update_layout(layout)
fig.show()      
if SAVE_IMAGES:
    pio.write_image(fig, f"{images_folder}/distributions_preprocessed.png")

## Features distribution per class

In [11]:
numerical_features = ["age", "duration", "amount"]
titles = tuple(df.keys()[0:-1])
layout = get_layout("Features distribution per class", barmode='overlay', showlegend=True)
fig = make_subplots(rows=5, cols=4, subplot_titles=titles, vertical_spacing=0.04, horizontal_spacing=0.04)
for index,feature in enumerate(titles):
    if feature not in numerical_features:
        feature_risk_df = df.groupby([feature, "credit_risk"]).size().unstack()
        fig.add_trace(go.Bar(x=feature_risk_df.index, y=feature_risk_df[0], marker = dict(color = colors[0]), legendgroup="good", showlegend=(feature==titles[0]),name="good"), row=index//4+1, col=index%4+1)
        fig.add_trace(go.Bar(x=feature_risk_df.index, y=feature_risk_df[1], marker = dict(color = colors[1]),legendgroup="bad", showlegend=(feature==titles[0]), name="bad",opacity=0.8), row=index//4+1, col=index%4+1)
    else:
        trace = go.Histogram(x=df[df["credit_risk"] == 0][feature], nbinsx=20, marker = dict(color = colors[0]),legendgroup="good", showlegend=False, name="good")
        fig.add_trace(trace ,row=index//4+1, col=index%4+1)
        trace = go.Histogram(x=df[df["credit_risk"] == 1][feature], nbinsx=20, marker = dict(color = colors[1]),legendgroup="bad", showlegend=False, name="bad", opacity=0.8)
        fig.add_trace(trace ,row=index//4+1, col=index%4+1)
fig.update_layout(layout)
fig.show()      
if SAVE_IMAGES:
    pio.write_image(fig, f"{images_folder}/distributions_by_class.png")

# Missing values

In [12]:
print("There are " + ("some" if df.isnull().values.any() else "no")  + " missing values in the dataset.")

There are no missing values in the dataset.


# Outliers detection

In [13]:
titles = tuple(df.keys()[0:-1])
layout = get_layout("Box plots")
fig = make_subplots(rows=5, cols=4, subplot_titles = titles, vertical_spacing=0.04, horizontal_spacing=0.04)
for index,feature in enumerate(titles):
    fig.add_trace(get_boxplot_trace(feature, df[feature]) ,row=index//4+1, col=index%4+1)
fig.update_layout(layout)
fig.show()     
if SAVE_IMAGES:
    fig.write_image(f"{images_folder}/boxplots.png")
 

In [14]:
# boxplot with class


titles = ["duration", "amount", "age"]
label = "credit_risk"
cols = len(titles)
fig = make_subplots(rows=1, cols=cols, subplot_titles = titles, vertical_spacing=0.05)
layout = get_layout("Box Plots", height=400, width=1100)
for index,feature in enumerate(titles):
    fig.add_trace( get_boxplot_trace(feature, df[feature], color=colors[0]), row=1, col = index+1)
    
fig.update_layout(layout)
fig.show() 
   
if SAVE_IMAGES:
    # pio.write_image(fig, f"{images_folder}/significant_boxplots.png")
    fig.write_image(f"{images_folder}/significant_boxplots.png")

In [15]:
# boxplot with class

titles = ["duration", "amount", "age"]
label = "credit_risk"

cols = len(titles)
rows=1
fig = make_subplots(rows=rows, cols=cols, subplot_titles = titles, vertical_spacing=0.05)
layout = get_layout("Box plots with class", width=350*cols, height=500)
for index,feature in enumerate(titles):
    tmp_df = df[[feature, label]]
    fig.add_trace( get_boxplot_trace(feature, tmp_df[tmp_df[label] == 0][feature], x=np.repeat("good", 600), color=colors[0]), col=index+1, row=1)
    fig.add_trace( get_boxplot_trace(feature, tmp_df[tmp_df[label] == 1][feature], x=np.repeat("bad", 300), color=colors[1]), col=index+1, row=1)
   
fig.update_layout(layout)
fig.show() 
   
if SAVE_IMAGES:
    fig.write_image(f"{images_folder}/boxplots_with_classes.png")
 


In [16]:
titles = tuple(pre_df.keys()[0:-1])
rows = math.ceil(len(titles)/4)
fig = make_subplots(rows=rows, cols=4, subplot_titles = titles, vertical_spacing=0.025, horizontal_spacing=0.04)
layout = get_layout("Box plots", height=1200/5 * rows)
for index,feature in enumerate(titles):
    # print(f"row={index//4+1}, col={index%4+1}")
    fig.add_trace(get_boxplot_trace(feature, pre_df[feature]) ,row=index//4+1, col=index%4+1)
fig.update_layout(layout)
fig.show()     
if SAVE_IMAGES:
    pio.write_image(fig, f"{images_folder}/boxplots_preprocessed.png")
 

In [17]:
mean = pre_df.mean()[:-1].values
data = np.array(pre_df)[:,:-1]
cov = np.cov(data.T)
icov = np.linalg.inv(cov)
m_distances = [distance.mahalanobis(sample, mean, icov) for sample in data]

layout = get_layout('Box plot of Mahalanobis distance', height=500, width = 500)
fig= go.Figure(get_boxplot_trace("Mahalanobis distance",m_distances), layout = layout)
fig.show()
if SAVE_IMAGES:
    pio.write_image(fig, f"{images_folder}/mahalanobis.png")

In [18]:
# violin plot of mahalanobis distance
layout=get_layout("Violin plot of Mahalanobis distance",height=500, width = 500)
fig= go.Figure(go.Violin(y=m_distances,
                        name="Mahalanobis distance",
                        box_visible=True,
                        meanline_visible=True
                    ), layout = layout
                )
fig.show()
if SAVE_IMAGES:
    pio.write_image(fig, f"{images_folder}/mahalanobis_violin.png")

## Evaluating multidimensional outliers' class  label

In [19]:
mask = [True if distance >= 8.30 else False for distance in m_distances]
outliers = pre_df[mask]
print(outliers["credit_risk"])

323    0
645    0
711    0
754    1
935    1
976    1
977    1
Name: credit_risk, dtype: int64


# Feature - target split

In [20]:
df_y = pre_df["credit_risk"]
df_X = pre_df.drop(["credit_risk"], axis=1)

# Normalizing features

In [21]:
df_X_normalized  = (df_X - df_X.mean()) / (df_X.max() - df_X.min())
df_X_normalized.describe()

Unnamed: 0,status,duration,credit_history,amount,savings,employment_duration,installment_rate,present_residence,property,age,number_credits,job,people_liable,telephone,foreign_worker,other_debtors_1,other_debtors_2,other_installment_plans_1,other_installment_plans_2,other_installment_plans_3,housing_1,housing_2,housing_3,personal_status_sex_1,personal_status_sex_2,personal_status_sex_3,personal_status_sex_4,purpose_1,purpose_2,purpose_3,purpose_4
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,2.842171e-17,2.1316280000000002e-17,1.7763570000000002e-17,1.776357e-18,-2.4869000000000002e-17,8.881784e-18,-3.375078e-17,-7.01661e-17,-2.842171e-17,-3.0198070000000005e-17,-1.609823e-17,2.842171e-17,3.641532e-17,8.526513e-17,-7.81597e-17,8.881783999999999e-19,-2.1316280000000002e-17,-1.509903e-17,-4.440892e-18,2.664535e-17,-1.1546320000000001e-17,2.842171e-17,0.0,-7.105427e-18,-3.153033e-17,-4.2632560000000003e-17,2.6645350000000002e-18,1.4210850000000002e-17,-3.552714e-18,-3.907985e-17,-4.6185280000000004e-17
std,0.4192126,0.1773355,0.2707799,0.1553181,0.3950057,0.3020766,0.3729049,0.367906,0.3500697,0.2027263,0.1925515,0.2178713,0.3620858,0.490943,0.1888562,0.1983894,0.2221381,0.3461196,0.2117447,0.3893014,0.3835441,0.4521155,0.309268,0.218054,0.4627247,0.4979397,0.2891706,0.2569534,0.4933096,0.4841057,0.4980352
min,-0.5256667,-0.2485735,-0.63625,-0.1662401,-0.27625,-0.596,-0.3423333,-0.615,-0.4526667,-0.2953929,-0.1356667,-0.6346667,-0.845,-0.404,-0.963,-0.041,-0.052,-0.139,-0.047,-0.814,-0.179,-0.714,-0.107,-0.05,-0.31,-0.548,-0.092,-0.071,-0.417,-0.374,-0.453
25%,-0.5256667,-0.1309265,-0.13625,-0.1048612,-0.27625,-0.096,-0.3423333,-0.2816667,-0.4526667,-0.1525357,-0.1356667,0.032,0.155,-0.404,0.037,-0.041,-0.052,-0.139,-0.047,0.186,-0.179,-0.714,-0.107,-0.05,-0.31,-0.548,-0.092,-0.071,-0.417,-0.374,-0.453
50%,-0.1923333,-0.04269118,-0.13625,-0.05236866,-0.27625,-0.096,-0.009,0.05166667,-0.1193333,-0.04539286,-0.1356667,0.032,0.155,-0.404,0.037,-0.041,-0.052,-0.139,-0.047,0.186,-0.179,0.286,-0.107,-0.05,-0.31,0.452,-0.092,-0.071,-0.417,-0.374,-0.453
75%,0.4743333,0.04554412,0.36375,0.0385717,0.22375,0.404,0.3243333,0.385,0.214,0.1153214,0.1976667,0.032,0.155,0.596,0.037,-0.041,-0.052,-0.139,-0.047,0.186,-0.179,0.286,-0.107,-0.05,0.69,0.452,-0.092,-0.071,0.583,0.626,0.547
max,0.4743333,0.7514265,0.36375,0.8337599,0.72375,0.404,0.6576667,0.385,0.5473333,0.7046071,0.8643333,0.3653333,0.155,0.596,0.037,0.959,0.948,0.861,0.953,0.186,0.821,0.286,0.893,0.95,0.69,0.452,0.908,0.929,0.583,0.626,0.547


# t-SNE

In [22]:
n_components = 2
tsne = TSNE(n_components)
tsne_result = tsne.fit_transform(df_X_normalized)
tsne_df = pd.DataFrame({'tsne_1': tsne_result[:,0], 'tsne_2': tsne_result[:,1]})

fig= go.Figure(go.Scatter(x=tsne_df["tsne_1"], y=tsne_df["tsne_2"],
                    mode='markers',
                    name='markers',
                    marker = dict(color = df_y) 
                    )
              )
fig.show()

# Train-test split

In [23]:
X_train, X_test, y_train, y_test = train_test_split(df_X_normalized, df_y, test_size=0.20, random_state=random_state, stratify=df_y) # TODO check stratify

In [24]:
mappings = {1: "bad", 0:"good"}

y =  y_train.value_counts()
data = [go.Bar(x=[mappings[x] for x in y.index], y=y.values, marker = dict(color = colors[:len(y.index)]))]
layout = get_layout("Credit risk - training set", y_title='#samples', height=500, width=500)
fig = go.Figure(data=data, layout=layout)
fig.show()
if SAVE_IMAGES:
    pio.write_image(fig, f"{images_folder}/training_set.png")

y =  y_test.value_counts()
data = [go.Bar(x=[mappings[x] for x in y.index], y=y.values, marker = dict(color = colors[:len(y.index)]))]
layout = get_layout("Credit risk - test set", y_title='#samples', height=500, width=500)
fig = go.Figure(data=data, layout=layout)
fig.show()
if SAVE_IMAGES:
    pio.write_image(fig, f"{images_folder}/test_set.png")

# Dimensionality reduction

## Features correlation

In [25]:
correlation = df_X_normalized.corr()
layout = get_layout("Features correlation with Pearson", showlegend=True, width=1000, height=800, x_tickangle=40)
fig = go.Figure(data=go.Heatmap(
                   z=correlation,
                   x=df_X_normalized.keys(),
                   y=df_X_normalized.keys(),
                   colorscale='Viridis',
                   hoverongaps = False))
fig.update_layout(layout)
fig.show()
if SAVE_IMAGES:
    pio.write_image(fig, f"{images_folder}/correlation_pearson.png")

In [26]:
names = df_X_normalized.columns
inverse_correlation = 1 - abs(df_X_normalized.corr())
fig = ff.create_dendrogram(inverse_correlation.values, orientation='left', labels=names, colorscale=colors, linkagefun=lambda x: hc.linkage(x, 'average'))
layout = get_layout("Dendrogram of pairwise average distance (Pearson) ", width=850, height=600, x_title="distance", y_title="features")
fig.update_layout(layout)
fig.show()
if SAVE_IMAGES:
    pio.write_image(fig, f"{images_folder}/dendrogram_pearson.png")

In [27]:
correlation = df_X_normalized.corr(method="spearman")
layout = get_layout("Features correlation with Spearman", showlegend=True, width=1000, height=800, x_tickangle=40)
fig = go.Figure(data=go.Heatmap(
                   z=correlation,
                   x=df_X_normalized.keys(),
                   y=df_X_normalized.keys(),
                   colorscale='Viridis',
                   hoverongaps = False))
fig.update_layout(layout)
fig.show()
if SAVE_IMAGES:
    pio.write_image(fig, f"{images_folder}/correlation_spearman.png")

In [28]:
names = df_X_normalized.columns
inverse_correlation = 1 - abs(df_X_normalized.corr(method="spearman"))
fig = ff.create_dendrogram(inverse_correlation.values, orientation='left', labels=names, colorscale=colors, linkagefun=lambda x: hc.linkage(x, 'average'))
layout = get_layout("Dendrogram of pairwise average distance (Spearman) ", width=850, height=600, x_title="distance", y_title="features")
fig.update_layout(layout)
fig.show()
if SAVE_IMAGES:
    pio.write_image(fig, f"{images_folder}/dendrogram_spearman.png")

## mRMR feature selection algorithm

implementation @ https://pypi.org/project/pymrmr/

ref: 
Hanchuan Peng, Fuhui Long, and Chris Ding, “Feature selection based on mutual information: criteria of max-dependency, max-relevance, and min-redundancy,” IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 27, No. 8, pp.1226-1238, 2005.

In [29]:
'''
@param data : DataFrame where we want to apply feature selection on, with the class label in the first column
@param method : can be MIQ or MID
@param nfeatures : number of features we want to extract
@return the dataset with the selected features
'''
def apply_mRMR(data, method="MIQ", nfeatures=15):
    selected_features = pymrmr.mRMR(data, method, nfeatures)
    mrmr_miq_df = data[selected_features]
    return mrmr_miq_df, selected_features

### mRMR selected features

In [30]:
complete_df = X_train.copy(deep=True)
complete_df.insert(0, "class", y_train)
_, selected_features = apply_mRMR(complete_df, method="MIQ")
print(selected_features)

_, selected_features = apply_mRMR(complete_df, method="MID")
print(selected_features)

['personal_status_sex_1', 'personal_status_sex_2', 'other_installment_plans_2', 'other_installment_plans_3', 'housing_1', 'housing_2', 'housing_3', 'other_debtors_2', 'other_debtors_1', 'personal_status_sex_3', 'personal_status_sex_4', 'purpose_1', 'purpose_2', 'purpose_3', 'other_installment_plans_1']
['personal_status_sex_1', 'personal_status_sex_2', 'other_installment_plans_2', 'other_installment_plans_3', 'housing_1', 'housing_2', 'housing_3', 'other_debtors_2', 'other_debtors_1', 'personal_status_sex_3', 'personal_status_sex_4', 'purpose_1', 'purpose_2', 'purpose_3', 'other_installment_plans_1']


## PCA

In [31]:
def apply_PCA(data, n_components=15):
    pca = PCA(n_components = n_components, svd_solver = "full", random_state = random_state)
    pca_df = pca.fit_transform(data)
    # print(pca.n_components_)
    
    return pca_df, pca 

### Evaluate PCA explained variance

In [32]:
_ , result = apply_PCA(X_train, n_components=len(X_train.keys()))
total_explained = sum(result.explained_variance_) 
list_explained = [(i / total_explained) * 100 for i in sorted(result.explained_variance_, reverse=True)]
cumulative_explained = np.cumsum(list_explained) 

layout = get_layout("Scree plot", height=500, x_title="principal components", showlegend=True)
# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(
        x=list(range(1, len(list_explained) + 1)),
        y=list_explained, 
        mode='lines+markers',
        name="explained variance"
        ),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(
        x=list(range(1, len(cumulative_explained) + 1)),
        y=cumulative_explained,
        mode='lines+markers',
        name="cumulative explained variance",
        line=dict(
            shape='hv',
        )
    ),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    layout
)

# Set y-axes titles
fig.update_yaxes(title_text="explained variance", secondary_y=False, range=[-0.72,15.72], tickmode="linear", tick0=0, dtick=3)
fig.update_yaxes(title_text="cumulative explained variance [%] ", secondary_y=True, showgrid = False, range=[-5,105], zeroline=False)

fig.show()

if SAVE_IMAGES:
    pio.write_image(fig, f"{images_folder}/scree_plot.png")

## KPCA

In [33]:
def apply_KPCA(data, n_components=15, kernel="poly", degree=5, gamma=0.001, coef0=10):
    pca = KernelPCA(n_components = n_components, kernel=kernel, random_state = random_state, degree=degree, gamma=gamma, coef0=coef0)
    pca_df = pca.fit_transform(data)
    # print(pca.n_components_)
    
    return pca_df, pca 

### Cross validate KPCA hyperparameters

In [34]:
def kpca_scorer(estimator, X, y=None):
    X_reduced = estimator.transform(X)
    X_preimage = estimator.inverse_transform(X_reduced)
    return -1 * mean_squared_error(X, X_preimage)
    # reconstruction error multiplied by -1 because the objective is to minimize it

param_grid = {
            "poly":[{"degree": [3, 5, 7, 9], "gamma": [0.001, 0.01, 0.03, 0.05 ], "coef0": [0.1, 1, 5, 10, 15, 20]}],
            "rbf":[{"gamma": [0.01, 0.1, 1, 5, 10]}],
            "sigmoid":[{"gamma": [0.1, 0.2, 0.3, 0.4], "coef0": [0.001, 0.01, 0.1, 1, 5, 10]}],
            "cosine":[{}]
                }

for kernel in ["poly", "rbf", "sigmoid", "cosine"]:
    print("\t",kernel)
    kpca=KernelPCA(fit_inverse_transform=True, n_jobs=-1, n_components=len(X_train.keys()), kernel = kernel) 
    grid_search = GridSearchCV(kpca, param_grid[kernel], cv=3, scoring=kpca_scorer)
    grid_search.fit(X_train)
    print("\t\t", grid_search.best_score_)
    print("\t\t", grid_search.best_estimator_)

	 poly
		 -5.4967446398026415e-08
		 KernelPCA(coef0=10, degree=5, fit_inverse_transform=True, gamma=0.001,
          kernel='poly', n_components=31, n_jobs=-1)
	 rbf
		 -0.04284249904829988
		 KernelPCA(fit_inverse_transform=True, gamma=0.1, kernel='rbf', n_components=31,
          n_jobs=-1)
	 sigmoid
		 -0.004175250562839808
		 KernelPCA(coef0=0.001, fit_inverse_transform=True, gamma=0.4, kernel='sigmoid',
          n_components=31, n_jobs=-1)
	 cosine
		 -0.005508960409910976
		 KernelPCA(fit_inverse_transform=True, kernel='cosine', n_components=31,
          n_jobs=-1)


### Evaluate KPCA

In [35]:
kernels =["poly", "rbf", "sigmoid", "cosine"]
titles = kernels
params = {
    "poly":{"degree":5, "gamma": 0.001, "coef0":10},
    "rbf":{"degree":None, "gamma": 0.1, "coef0":None},
    "sigmoid":{"degree":None, "gamma": 0.1, "coef0": 0.001},
    "cosine":{"degree":None, "gamma": None, "coef0": None}
}
layout = get_layout("Features distribution per class", barmode='overlay', showlegend=True)
layout = get_layout(f"Scree plots of KPCA", height=800, width=1200, showlegend=True)
fig = make_subplots(rows=2, cols=2, specs=[[{"secondary_y": True}, {"secondary_y": True}], [{"secondary_y": True}, {"secondary_y": True}]], \
    subplot_titles=titles, vertical_spacing=0.1, horizontal_spacing=0.1)
for index,kernel in enumerate(titles):
    _ , result = apply_KPCA(X_train, n_components=len(X_train.keys()), kernel=kernel,\
         gamma=params[kernel]["gamma"], degree=params[kernel]["degree"], coef0=params[kernel]["coef0"])
    total_explained = sum(result.lambdas_) 
    list_explained = [(i / total_explained) * 100 for i in sorted(result.lambdas_, reverse=True)]
    cumulative_explained = np.cumsum(list_explained) 

    # Add traces
    trace = go.Scatter(
            x=list(range(1, len(list_explained) + 1)),
            y=list_explained, 
            mode='lines+markers',
            name="explained variance",
            legendgroup="explained variance",
            showlegend=(kernel=="poly"),
            marker = dict(color = colors[0])
            )
    fig.add_trace(
        trace,
        secondary_y=False,
        row=index//2+1,
        col=index%2+1
    )

    trace=go.Scatter(
            x=list(range(1, len(cumulative_explained) + 1)),
            y=cumulative_explained,
            mode='lines+markers',
            name="cumulative explained variance",
            legendgroup="cumulative explained variance",
            showlegend=(kernel=="poly"),
            marker = dict(color = colors[1]),
            line=dict(
                shape='hv',
            )
        )

    fig.add_trace(
        trace,
        secondary_y=True,
        row=index//2+1,
        col=index%2+1
    )

    # Set y-axes titles
    fig.update_yaxes(title_text="explained variance", secondary_y=False, range=[-0.72,15.72], tickmode="linear", tick0=0, dtick=3)
    fig.update_yaxes(title_text="cumulative explained variance [%] ", secondary_y=True, showgrid = False, range=[-5,105], zeroline=False)

    # Set x-axis title
    fig.update_xaxes(title_text="principal components",)
    
fig.update_layout(layout)
fig.update_layout(legend=dict(
    yanchor="bottom",
    y=0.2,
    xanchor="right",
    x=0.9
))
fig.show()      
if SAVE_IMAGES:
    pio.write_image(fig, f"{images_folder}/scree_plot_kpca.png")


# Balance dataset

In [36]:
def balance_dataset(X:pd.DataFrame, y:pd.DataFrame): # X pandas Dataframe
    n_samples_good_risk, n_samples_bad_risk = y.value_counts()
    delta = int((n_samples_good_risk- n_samples_bad_risk)/2)
    data = pd.DataFrame(X)
    print(n_samples_good_risk, n_samples_bad_risk, delta)
    data["credit_risk"] = y["credit_risk"]
    good_risk_undersampled = data.loc[data["credit_risk"] == 1].sample(frac=1)[0:n_samples_bad_risk+delta] # shuffle data and select a part of them
    bad_risk_oversampled = data.loc[data["credit_risk"] == 0].sample(n_samples_bad_risk+delta, replace=True)
    result = pd.concat([good_risk_undersampled, bad_risk_oversampled], axis=0)
    return result.drop(["credit_risk"], axis=1), result["credit_risk"]

# Setup classifiers and grid search parameters

In [80]:

classifiers = {
    "DecisionTreeClassifier":  DecisionTreeClassifier(random_state=random_state),
    "DecisionTreeClassifierWithPruning": DecisionTreeClassifier(random_state=random_state),
    "LogisticRegression": LogisticRegression(random_state=random_state, solver="liblinear")
}

classifiers_params = {
    "DecisionTreeClassifier": {
        'classifier__criterion': ['gini', 'entropy'],
        'classifier__max_depth': np.arange(1,31,5).tolist(),
        'classifier__min_samples_split': [0.001, 0.01, 0.1, 0.25, 0.45],
        'classifier__min_samples_leaf': [0.001, 0.01, 0.1, 0.25, 0.45]
    },
    "DecisionTreeClassifierWithPruning": {
        'classifier__criterion': ['gini', 'entropy'],
        'classifier__ccp_alpha': [0.0, 0.005, 0.001, 0.01, 0.015, 0.02, 0.025, 0.03, 0.05]
    },
    "LogisticRegression":{
        'classifier__penalty': ['l1', 'l2'], 
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    },
    "SupportVectorMachine": {
        'C': [0.1, 1, 10, 100], 
        "kernel":["linear","poly", "rbf", "sigmoid"],
        "degree":[3,5,7,10],
        "gamma":[0.001, 0.01, 0.1, 1],
        "coef0":[0.0, 0.001, 0.1, 1, 3, 5]        
    },
}

ensemble_classifiers={
    "BaggingClassifier": BaggingClassifier(random_state=random_state, base_estimator=DecisionTreeClassifier(random_state=random_state, max_depth=6, min_samples_leaf=0.0001, min_samples_split=0.45)),
    "BaggingClassifierWithPruning": BaggingClassifier(random_state=random_state, base_estimator=DecisionTreeClassifier(random_state=random_state, criterion="entropy", ccp_alpha=0.02)),
    "RandomForestClassifier": RandomForestClassifier(random_state=random_state, max_depth=6, min_samples_leaf=0.0001, min_samples_split=0.45),
    "RandomForestClassifierWithPruning": RandomForestClassifier(random_state=random_state, criterion="entropy", ccp_alpha=0.02),
}

ensemble_classifiers_params={
    "BaggingClassifier":{
        "classifier__max_samples":[0.25, 0.5, 0.75, 1.0],
        "classifier__n_estimators":[10, 100, 1000]
    },
    "BaggingClassifierWithPruning":{
        "classifier__max_samples":[0.25, 0.5, 0.75, 1.0],
        "classifier__n_estimators":[10, 100, 1000]
    },
    "RandomForestClassifier":{
        "classifier__n_estimators":[10, 100, 1000]
    },
    "RandomForestClassifierWithPruning":{
        "classifier__n_estimators":[10, 100, 1000]
    }
}



# Cross validation pipeline

In [81]:
def apply_kfold_cross_validation(algorithm, params, X_train, y_train):

    complete_df = X_train.copy(deep=True)
    complete_df.insert(0, "class", y_train)

    pca_df, _ = apply_PCA(X_train)
    kpca_df, _ = apply_KPCA(X_train)
    mRMR_MIQ_df, _ = apply_mRMR(complete_df, method="MIQ")
    mRMR_MID_df, _ = apply_mRMR(complete_df, method="MID")
    dim_reduction_methods = {
        "PCA":pca_df, 
        "KPCA": kpca_df,
        "mRMR_MIQ":mRMR_MIQ_df, 
        "mRMR_MID":mRMR_MID_df,
        "none": X_train
        }
    balancing_dataset_methods = {
        "undersampling": RandomUnderSampler(random_state=random_state), 
        "oversampling": RandomOverSampler(random_state=random_state)
    }

    for dim_reduction_method in dim_reduction_methods.keys():
        print("\t",dim_reduction_method)
        for balancing_method in balancing_dataset_methods.keys():
            print("\t\t",balancing_method)
            
            pipeline = Pipeline([("balancing",balancing_dataset_methods[balancing_method]),("classifier",algorithm)])
  
            kf_total = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
            gs = GridSearchCV(estimator=pipeline, param_grid=params, n_jobs=1, cv=kf_total, scoring='f1')
            gs.fit(dim_reduction_methods[dim_reduction_method], y_train)
        
            print("\t\t\t",gs.best_score_)
            print("\t\t\t",gs.best_estimator_)


In [82]:
for classifier_name, classifier  in classifiers.items():
    print("\t",classifier_name)
    apply_kfold_cross_validation(classifier, classifiers_params[classifier_name], X_train, y_train)

	 LogisticRegression
	 PCA
		 undersampling
			 0.6007785958389652
			 Pipeline(steps=[('balancing', RandomUnderSampler(random_state=42)),
                ('classifier',
                 LogisticRegression(C=1, random_state=42, solver='liblinear'))])
		 oversampling
			 0.5922377998544562
			 Pipeline(steps=[('balancing', RandomOverSampler(random_state=42)),
                ('classifier',
                 LogisticRegression(C=0.1, penalty='l1', random_state=42,
                                    solver='liblinear'))])
	 KPCA
		 undersampling
			 0.6041329048565728
			 Pipeline(steps=[('balancing', RandomUnderSampler(random_state=42)),
                ('classifier',
                 LogisticRegression(C=0.1, penalty='l1', random_state=42,
                                    solver='liblinear'))])
		 oversampling
			 0.6060321260805552
			 Pipeline(steps=[('balancing', RandomOverSampler(random_state=42)),
                ('classifier',
                 LogisticRegression(C=0.01, penalty

In [83]:
# ensemble cross validation
balancing_methods = {
        "BaggingClassifier": RandomUnderSampler(random_state=random_state), 
        "BaggingClassifierWithPruning": RandomOverSampler(random_state=random_state),
        "RandomForestClassifier" : RandomUnderSampler(random_state=random_state), 
        "RandomForestClassifierWithPruning" : RandomOverSampler(random_state=random_state)
        }
for classifier_name, classifier  in ensemble_classifiers.items():
    print("\t",classifier_name)
    
    pipeline = Pipeline([("balancing",balancing_methods[classifier_name]),("classifier",classifier)])

    kf_total = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
    gs = GridSearchCV(estimator=pipeline, param_grid=ensemble_classifiers_params[classifier_name], n_jobs=1, cv=kf_total, scoring='f1')
    gs.fit(X_train, y_train)

    print("\t\t",gs.best_score_)
    print("\t\t",gs.best_estimator_)

	 RandomForestClassifier
		 0.5968759981084393
		 Pipeline(steps=[('balancing', RandomUnderSampler(random_state=42)),
                ('classifier',
                 RandomForestClassifier(max_depth=6, min_samples_leaf=0.0001,
                                        min_samples_split=0.45,
                                        random_state=42))])
	 RandomForestClassifierWithPruning
		 0.6017959759265026
		 Pipeline(steps=[('balancing', RandomOverSampler(random_state=42)),
                ('classifier',
                 RandomForestClassifier(ccp_alpha=0.02, criterion='entropy',
                                        random_state=42))])


In [57]:
# da rifare!!!

# best_pipelines = {
#     "DecisionTreeClassifier": { "dim_reduction" : None,
#                                 "pipeline" : Pipeline(steps=[
#                                             ('balancing', SMOTE(random_state=42)),
#                                             ('classifier', DecisionTreeClassifier(
#                                                 criterion='entropy', 
#                                                 max_depth=15,
#                                                 max_features='log2', min_samples_leaf=4,
#                                                 min_samples_split=25,
#                                                 random_state=42)
#                                             )])
#                                 }
# }

Dectree1
          "\t PCA\n",
      "\t\t undersampling\n",
      "\t\t\t 0.527481022355779\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomUnderSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 DecisionTreeClassifier(criterion='entropy', max_depth=11,\n",
      "                                        min_samples_leaf=0.001,\n",
      "                                        min_samples_split=0.25,\n",
      "                                        random_state=42))])\n",
      "\t\t oversampling\n",
      "\t\t\t 0.5193381303377957\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomOverSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 DecisionTreeClassifier(max_depth=6, max_features='sqrt',\n",
      "                                        min_samples_leaf=0.001,\n",
      "                                        min_samples_split=0.1,\n",
      "                                        random_state=42))])\n",
      "\t KPCA\n",
      "\t\t undersampling\n",
      "\t\t\t 0.5301185898216286\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomUnderSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 DecisionTreeClassifier(max_depth=6, min_samples_leaf=0.001,\n",
      "                                        min_samples_split=0.001,\n",
      "                                        random_state=42))])\n",
      "\t\t oversampling\n",
      "\t\t\t 0.5291300137854493\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomOverSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 DecisionTreeClassifier(criterion='entropy', max_depth=6,\n",
      "                                        max_features='sqrt',\n",
      "                                        min_samples_leaf=0.001,\n",
      "                                        min_samples_split=0.001,\n",
      "                                        random_state=42))])\n",
      "\t mRMR_MIQ\n",
      "\t\t undersampling\n",
      "\t\t\t 0.48254430485608807\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomUnderSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 DecisionTreeClassifier(max_depth=6, max_features='sqrt',\n",
      "                                        min_samples_leaf=0.1,\n",
      "                                        min_samples_split=0.001,\n",
      "                                        random_state=42))])\n",
      "\t\t oversampling\n",
      "\t\t\t 0.48254430485608807\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomOverSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 DecisionTreeClassifier(max_depth=6, max_features='sqrt',\n",
      "                                        min_samples_leaf=0.1,\n",
      "                                        min_samples_split=0.001,\n",
      "                                        random_state=42))])\n",
      "\t mRMR_MID\n",
      "\t\t undersampling\n",
      "\t\t\t 0.48254430485608807\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomUnderSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 DecisionTreeClassifier(max_depth=6, max_features='sqrt',\n",
      "                                        min_samples_leaf=0.1,\n",
      "                                        min_samples_split=0.001,\n",
      "                                        random_state=42))])\n",
      "\t\t oversampling\n",
      "\t\t\t 0.48254430485608807\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomOverSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 DecisionTreeClassifier(max_depth=6, max_features='sqrt',\n",
      "                                        min_samples_leaf=0.1,\n",
      "                                        min_samples_split=0.001,\n",
      "                                        random_state=42))])\n",
      "\t none\n",
      "\t\t undersampling\n",
      "\t\t\t 0.5752128154080384\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomUnderSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 DecisionTreeClassifier(max_depth=1, min_samples_leaf=0.001,\n",
      "                                        min_samples_split=0.001,\n",
      "                                        random_state=42))])\n",
      "\t\t oversampling\n",
      "\t\t\t 0.5841541668420481\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomOverSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 DecisionTreeClassifier(max_depth=6, min_samples_leaf=0.001,\n",
      "                                        min_samples_split=0.45,\n",
      "                                        random_state=42))])\n",
DecTree2 
      "\t PCA\n",
      "\t\t undersampling\n",
      "\t\t\t 0.5133153332897255\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomUnderSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 DecisionTreeClassifier(ccp_alpha=0.015, random_state=42))])\n",
      "\t\t oversampling\n",
      "\t\t\t 0.5177036833779486\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomOverSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 DecisionTreeClassifier(ccp_alpha=0.015, criterion='entropy',\n",
      "                                        max_features='sqrt',\n",
      "                                        random_state=42))])\n",
      "\t KPCA\n",
      "\t\t undersampling\n",
      "\t\t\t 0.5156378623514299\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomUnderSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 DecisionTreeClassifier(ccp_alpha=0.005, random_state=42))])\n",
      "\t\t oversampling\n",
      "\t\t\t 0.5173520435943036\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomOverSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 DecisionTreeClassifier(ccp_alpha=0.01, criterion='entropy',\n",
      "                                        max_features='sqrt',\n",
      "                                        random_state=42))])\n",
      "\t mRMR_MIQ\n",
      "\t\t undersampling\n",
      "\t\t\t 0.44486129578048794\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomUnderSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 DecisionTreeClassifier(ccp_alpha=0.001, max_features='sqrt',\n",
      "                                        random_state=42))])\n",
      "\t\t oversampling\n",
      "\t\t\t 0.48262213503580753\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomOverSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 DecisionTreeClassifier(ccp_alpha=0.005, criterion='entropy',\n",
      "                                        max_features='sqrt',\n",
      "                                        random_state=42))])\n",
      "\t mRMR_MID\n",
      "\t\t undersampling\n",
      "\t\t\t 0.44486129578048794\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomUnderSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 DecisionTreeClassifier(ccp_alpha=0.001, max_features='sqrt',\n",
      "                                        random_state=42))])\n",
      "\t\t oversampling\n",
      "\t\t\t 0.48262213503580753\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomOverSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 DecisionTreeClassifier(ccp_alpha=0.005, criterion='entropy',\n",
      "                                        max_features='sqrt',\n",
      "                                        random_state=42))])\n",
      "\t none\n",
      "\t\t undersampling\n",
      "\t\t\t 0.5752128154080384\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomUnderSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 DecisionTreeClassifier(ccp_alpha=0.025, random_state=42))])\n",
      "\t\t oversampling\n",
      "\t\t\t 0.5913688690306956\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomOverSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 DecisionTreeClassifier(ccp_alpha=0.02, criterion='entropy',\n",
      "                                        random_state=42))])\n"


BaggingClassifier
      "\t\t 0.5752128154080384\n",
      "\t\t Pipeline(steps=[('balancing', RandomUnderSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=6,\n",
      "                                                                         min_samples_leaf=0.0001,\n",
      "                                                                         min_samples_split=0.45,\n",
      "                                                                         random_state=42),\n",
      "                                   max_samples=0.75, n_estimators=100,\n",
      "                                   random_state=42))])\n",
BaggingClassifierWithPruning
      "\t\t 0.6159023154039892\n",
      "\t\t Pipeline(steps=[('balancing', RandomOverSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.02,\n",
      "                                                                         criterion='entropy',\n",
      "                                                                         random_state=42),\n",
      "                                   max_samples=0.25, n_estimators=1000,\n",
      "                                   random_state=42))])\n",
RandomForestClassifier
      0.5968759981084393
      Pipeline(steps=[('balancing', RandomUnderSampler(random_state=42)),
      ('classifier',
      RandomForestClassifier(max_depth=6, min_samples_leaf=0.0001,
                              min_samples_split=0.45,
                              random_state=42))])
RandomForestClassifierWithPruning
      0.6017959759265026
      Pipeline(steps=[('balancing', RandomOverSampler(random_state=42)),
      ('classifier',
      RandomForestClassifier(ccp_alpha=0.02, criterion='entropy',
                              random_state=42))])
LogisticRegression
      "\t PCA\n",
      "\t\t undersampling\n",
      "\t\t\t 0.6007785958389652\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomUnderSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 LogisticRegression(C=1, random_state=42, solver='liblinear'))])\n",
      "\t\t oversampling\n",
      "\t\t\t 0.5922377998544562\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomOverSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 LogisticRegression(C=0.1, penalty='l1', random_state=42,\n",
      "                                    solver='liblinear'))])\n",
      "\t KPCA\n",
      "\t\t undersampling\n",
      "\t\t\t 0.6041329048565728\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomUnderSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 LogisticRegression(C=0.1, penalty='l1', random_state=42,\n",
      "                                    solver='liblinear'))])\n",
      "\t\t oversampling\n",
      "\t\t\t 0.6060321260805552\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomOverSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 LogisticRegression(C=0.01, penalty='l1', random_state=42,\n",
      "                                    solver='liblinear'))])\n",
      "\t mRMR_MIQ\n",
      "\t\t undersampling\n",
      "\t\t\t 0.48215144438719504\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomUnderSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 LogisticRegression(C=0.001, random_state=42,\n",
      "                                    solver='liblinear'))])\n",
      "\t\t oversampling\n",
      "\t\t\t 0.4744469545221041\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomOverSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 LogisticRegression(C=0.1, random_state=42,\n",
      "                                    solver='liblinear'))])\n",
      "\t mRMR_MID\n",
      "\t\t undersampling\n",
      "\t\t\t 0.48215144438719504\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomUnderSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 LogisticRegression(C=0.001, random_state=42,\n",
      "                                    solver='liblinear'))])\n",
      "\t\t oversampling\n",
      "\t\t\t 0.4744469545221041\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomOverSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 LogisticRegression(C=0.1, random_state=42,\n",
      "                                    solver='liblinear'))])\n",
      "\t none\n",
      "\t\t undersampling\n",
      "\t\t\t 0.6256364275501042\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomUnderSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 LogisticRegression(C=1, penalty='l1', random_state=42,\n",
      "                                    solver='liblinear'))])\n",
      "\t\t oversampling\n",
      "\t\t\t 0.6219178291774577\n",
      "\t\t\t Pipeline(steps=[('balancing', RandomOverSampler(random_state=42)),\n",
      "                ('classifier',\n",
      "                 LogisticRegression(C=0.1, random_state=42,\n",
      "                                    solver='liblinear'))])\n"

dim reduction prima o dopo data augmentation (downsampling, upsampling, smote) -> prima dim reduction perchè il balancing cambia la distribuzione de dati, e cambierebbe la dim reduction che si basa sulla dstribuzione dei dati

- train-test split
- train-val split
- dim reduction on training
- updsampling/downsampling 

considerando la cross-validation
- train-test split stratified
- per ogni divisione in 1 - (k-1) fold stratificata
- bilanciare (k-1) 
- applicare dimensionality reduction
- applicare l'algoritmo
- testare sull'1 fold

- per ogni classificatore:
    - per ogni tipo di dim reduction:
        - applicarla a tutto il training set,
        - poi, per ogni tipo di balancing (under, over, smote di imbalanced-learn):
            - fare la kfold cross validation con quel tipo di balancing e veere il risultato migliore della f1
    - alla fine prendere il risultato migliore globale 
    - eventualmente fare osservazioni sulle differenze (es. differenze ampie tra i metodi di balancing o i metodi di dim reduction)

# TODO USE F1 FOR METRIC IN CROSS VALIDATION