### Packages

In [None]:
#requirements !pip freeze > requirements.txt.
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# Load and Organise data

## Import data

In [None]:
# Read the German credit dataset (CSV) directly from the course repository.
df = pd.read_csv(
    "https://raw.githubusercontent.com/askoshiyama/mli-cohort/master/german_credit.csv"
)

# Preview the first rows.
df.head()

## Pre-process variables

In [None]:
# variable types: show the dtype pandas assigned to each column of df
df.dtypes

In [None]:
# Columns treated as categorical; each one is one-hot encoded further down.
cat_variables = [
    'account_check_status',
    'credit_history',
    'purpose',
    'savings',
    'present_emp_since',
    'personal_status_sex',
    'property',
    'other_installment_plans',
    'housing',
    'job',
    'telephone',
    "other_debtors",
    'foreign_worker',
]

# Target column, kept as a one-element list so it can be used directly as a
# column selector (df[output_variable]) and as a drop label.
output_variable = ["default"]

# Remaining integer-valued columns, used as they are.
int_variables = [
    'credits_this_bank',
    'present_res_since',
    'duration_in_month',
    'people_under_maintenance',
    'installment_as_income_perc',
    'age',
    'credit_amount',
]
# The same set can be derived with:
# list(set(df.columns) - set(output_variable) - set(cat_variables))

### Mapping categorical variables to one-hot encoding

In [None]:
# Toy example: three ways of turning a categorical column into numbers.
animals = ["cat", "dog", "cat", "dog"]

# one-hot encoding: one 0/1 indicator list per category
animals_cat = [int(animal == "cat") for animal in animals]          # [1, 0, 1, 0]
animals_dog = [int(animal == "dog") for animal in animals]          # [0, 1, 0, 1]
# with only two categories, "not cat" is just 1 - animals_cat
animals_not_cat = [1 - is_cat for is_cat in animals_cat]            # [0, 1, 0, 1]

# integer encoding: every category is replaced by an integer code
animals_int = [{"cat": 1, "dog": 2}[animal] for animal in animals]  # [1, 2, 1, 2]

# binary encoding: 4 categories need log_2(4) = 2 bits
# cat -> 00, dog -> 01, rat -> 10, bat -> 11
animals = ["cat", "dog", "cat", "dog", "rat", "bat"]
animals_0 = [int(animal in ("rat", "bat")) for animal in animals]   # first bit
animals_1 = [int(animal in ("dog", "bat")) for animal in animals]   # second bit

In [None]:
# one-hot encoding of categorical variables
from sklearn.preprocessing import OneHotEncoder

# An explicit loop is used for pedagogical reasons; it is not strictly necessary.
# Each categorical variable yields one small frame of indicator columns named
# "<variable>_<category>"; the frames are joined side by side at the end.
encoded_frames = []
for variable in cat_variables:
    # learn the categories of this variable
    one_hot_func = OneHotEncoder().fit(df[[variable]])

    # map the column to a dense 0/1 matrix, one column per category
    indicator_matrix = one_hot_func.transform(df[[variable]]).toarray()

    # name the columns after the variable and the category they flag
    indicator_names = [variable + "_" + level for level in one_hot_func.categories_[0]]
    encoded_frames.append(pd.DataFrame(indicator_matrix, index=df.index, columns=indicator_names))

df_cat = pd.concat(encoded_frames, axis=1)

# quick check: the mean of an indicator column is the share of rows in that category
df_cat.mean()

In [None]:
# bracketing integer variable - age
from sklearn.preprocessing import KBinsDiscretizer

# Three quantile-based bins, one-hot encoded into one column per bin.
kbin_func = KBinsDiscretizer(n_bins=3, encode='onehot', strategy='quantile')
kbin_func.fit(df[["age"]])

age_indicators = kbin_func.transform(df[["age"]]).toarray()
df_age = pd.DataFrame(age_indicators, columns=["young", "adult", "senior"])

# checking: bracket indicators next to the raw age for the first ten rows
pd.concat([df_age, df[["age"]]], axis=1).head(10)

In [None]:
# other types of transformations possible - log transformation
from sklearn.preprocessing import FunctionTransformer

# np.log is the forward mapping and np.exp its inverse.
log_mapping = FunctionTransformer(func=np.log, inverse_func=np.exp)

# Transform the credit amount once and reuse the result for the plot and the frame.
log_credit = log_mapping.transform(df[["credit_amount"]])

# Raw vs. log-transformed credit amount, side by side.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

ax1.hist(df["credit_amount"])
ax1.set_title("Pre-transformation")

ax2.hist(log_credit["credit_amount"])
ax2.set_title("After log-transformation")

# Keep the transformed values as a one-column frame for the final dataset.
df_log = pd.DataFrame(log_credit.values, columns=["log(credit)"])

## Consolidating a final dataset

In [None]:
# Final modelling table, column-wise: raw integer variables, one-hot
# categoricals, age brackets, log credit amount and, last, the target.
model_parts = [df[int_variables], df_cat, df_age, df_log, df[output_variable]]
df_model = pd.concat(model_parts, axis=1)

# shapes of the raw and the consolidated table
(df.shape, df_model.shape)

In [None]:
# Summary statistics of every column in the consolidated modelling table.
df_model.describe()

# Modelling

## Quick modelling with Tree-based Model

In [None]:
# Summary statistics of the target column ("default") before modelling.
df_model[output_variable].describe()

In [None]:
# separate data: X holds every column except the target, y the target column
X, y = df_model.drop(labels=output_variable, axis=1), df_model[output_variable]

# fit model
from sklearn.tree import DecisionTreeClassifier
# A shallower, unweighted alternative:
# class_tree = DecisionTreeClassifier(max_depth=2, criterion="gini")
# random_state is fixed (same seed as the cross-validated tree further down)
# so the fitted tree, and the class_tree.dot exported from it, is the same on
# every run.
class_tree = DecisionTreeClassifier(max_depth=5, criterion="gini", class_weight="balanced",
                                    random_state=10)
class_tree.fit(X, y)

### Getting node variables + Plotting Decision Tree

In [None]:
from sklearn.tree import export_graphviz

# Write the fitted tree to class_tree.dot, labelling the split variables with
# the column names of X and the two classes with readable names.
export_graphviz(
    class_tree,
    out_file="class_tree.dot",
    feature_names=list(X.columns),
    class_names=["No Default", "Default"],
    rounded=True,
)

#### To view the tree, go to http://webgraphviz.com/ and paste the contents of class_tree.dot

### Some exploratory analysis

In [None]:
# Rows: whether the "no checking account" indicator is 0 (<= 0.5), i.e. the
# applicant is NOT in that category. Columns: whether the target equals 1.
pd.crosstab(X["account_check_status_no checking account"] <=0.5, y[output_variable[0]]==1)

In [None]:
# Among the rows whose "no checking account" indicator is 0, compare the
# distribution of loan duration for defaulters and non-defaulters.
target_flat = y.values.ravel()
indicator_is_zero = X["account_check_status_no checking account"].values <= 0.5

nodefault_duration = X.loc[(target_flat == 0) & indicator_is_zero, "duration_in_month"]
default_duration = X.loc[(target_flat == 1) & indicator_is_zero, "duration_in_month"]

# NOTE(review): `shade`/`shade_lowest` are deprecated in recent seaborn releases
# (replaced by `fill`/`thresh`); kept as-is so the cell still runs on the older
# seaborn this notebook appears to have been written for.
ax = sns.kdeplot(nodefault_duration, shade=True, shade_lowest=False, label="No Default")
ax = sns.kdeplot(default_duration, shade=True, shade_lowest=False, label="Default")
# Recent seaborn versions do not draw a legend from `label` alone; without it
# the two curves cannot be told apart.
ax.legend()
ax.set_xlabel("duration_in_month")

## Cross-validating Tree-Based Model

### Initial Params

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
#from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 10-fold stratified cv.
# shuffle=True is required for random_state to have any effect: scikit-learn
# >= 0.24 raises a ValueError when random_state is set while shuffle is left
# at its default (False).
k_folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=10)

# performance metrics: display name -> metric function taking (y_true, y_pred)
perf_metrics = {"Accuracy": metrics.accuracy_score, 
                "Precision": metrics.precision_score, 
                "Recall": metrics.recall_score,
                "AUC": metrics.roc_auc_score, 
                "F1-Score": metrics.f1_score, 
                "Brier": metrics.brier_score_loss
               }

# model
# Unweighted alternative:
# model = DecisionTreeClassifier(max_depth=3, criterion="gini", random_state=10)
model = DecisionTreeClassifier(max_depth=2, criterion="gini", class_weight="balanced", random_state=10)

### Fit model

In [None]:
# features / target as numpy arrays so they can be indexed with the fold indices
X, y = df_model.drop(labels=output_variable, axis=1).values, df_model[output_variable].values

# metrics evaluated on the predicted probability of class 1 instead of the
# predicted class label
proba_metrics = ("AUC", "Brier")

# main loop: one dict of scores per fold, turned into a DataFrame once at the
# end (this gives float columns and follows whatever number of splits k_folds
# was configured with, instead of a hard-coded 10)
fold_scores = []
for (train, test) in k_folds.split(X, y):
    # y is a column vector (n, 1); flatten it for fitting and scoring
    y_train, y_test = y[train].ravel(), y[test].ravel()

    # fit model
    model.fit(X[train], y_train)

    # test model
    ypred_class = model.predict(X[test])
    ypred_prob = model.predict_proba(X[test])[:, 1]

    # compute performance metrics
    scores = {}
    for pf in perf_metrics.keys():
        if pf in proba_metrics:
            scores[pf] = perf_metrics[pf](y_test, ypred_prob)
        else:
            scores[pf] = perf_metrics[pf](y_test, ypred_class)
    fold_scores.append(scores)

# rows: folds (0, 1, ...); columns: metrics, in the order of perf_metrics
df_metrics = pd.DataFrame(fold_scores, columns=list(perf_metrics.keys()))

### Show results

In [None]:
# Average of each performance metric across the cross-validation folds.
df_metrics.mean()

In [None]:
# NOTE(review): identical to the previous cell (per-metric mean across folds).
df_metrics.mean()

In [None]:
# Box plot of each metric's values over the folds, to show their spread.
df_metrics.plot(kind="box")

## Hyper-parameter tuning + Cross-validation: GBT

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold

# params
# shuffle=True is required for random_state to have any effect: scikit-learn
# >= 0.24 raises a ValueError when random_state is set while shuffle is False.
k_folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=10) # k-fold
hyper_params = {"learning_rate": [10.0 ** -2, 0.5, 10.0 ** -1, 1.5, 10.0 ** 0],
                "max_depth": [1, 2, 3],
                "max_features": [0.1, 0.25, 0.5],
                "n_estimators": [50, 100]
                }

# performance metrics: display name -> metric function taking (y_true, y_pred)
perf_metrics = {"Accuracy": metrics.accuracy_score, 
                "Precision": metrics.precision_score, 
                "Recall": metrics.recall_score,
                "AUC": metrics.roc_auc_score, 
                "F1-Score": metrics.f1_score, 
                "Brier": metrics.brier_score_loss}

# AUC and Brier must be computed from the predicted probability of class 1,
# as in the manual cross-validation loop earlier in this notebook. A plain
# make_scorer(metric) would feed them the hard class predictions instead.
proba_metrics = ("AUC", "Brier")


def _proba_scorer(metric):
    """Wrap `metric` as a scorer(estimator, X, y) fed with P(class 1)."""
    def scorer(estimator, X_eval, y_eval):
        return metric(y_eval, estimator.predict_proba(X_eval)[:, 1])
    return scorer


# Replace every metric function by a scorer usable by GridSearchCV.
for pf in perf_metrics:
    if pf in proba_metrics:
        perf_metrics[pf] = _proba_scorer(perf_metrics[pf])
    else:
        perf_metrics[pf] = metrics.make_scorer(perf_metrics[pf])

# NOTE: "Brier" is reported as the raw loss (lower is better), so GridSearchCV's
# rank_test_Brier, which ranks larger scores first, should not be used.

# main method
X, y = df_model.drop(labels=output_variable, axis=1).values, df_model[output_variable].values

# Exhaustive search over hyper_params; all metrics are recorded and the final
# model is refit with the combination that has the best mean F1-Score.
model = GridSearchCV(estimator=GradientBoostingClassifier(random_state=10), 
                     param_grid=hyper_params, 
                     scoring=perf_metrics, 
                     cv=k_folds, 
                     refit="F1-Score")

# y is a column vector (n, 1); the estimator expects a 1-D target
model.fit(X, y.ravel())

### Show results

In [None]:
# get best model: the estimator GridSearchCV refit for the metric named in its
# `refit` argument ("F1-Score" in the search above)
best_model = model.best_estimator_
best_model

In [None]:
# store all results and avg results in a df
df_cv = pd.DataFrame(model.cv_results_)

# add a hyperparameter column in avg df
from sklearn.model_selection import ParameterGrid

# One row per hyper-parameter combination.
# NOTE(review): relies on ParameterGrid enumerating the combinations in the
# same order as the rows of cv_results_ -- confirm.
df_hyperparams = pd.DataFrame(list(ParameterGrid(model.param_grid)))

# Keep only the fold-averaged test score of each metric, renamed to the bare
# metric name ("mean_test_AUC" -> "AUC", ...).
mean_score_names = {"mean_test_" + pf: pf for pf in perf_metrics.keys()}
df_avgcv = df_cv[list(mean_score_names)].rename(columns=mean_score_names)

# hyper-parameters on the left, averaged metrics on the right
df_avgcv = pd.concat([df_hyperparams, df_avgcv], axis=1)

In [None]:
# F1-Score against max_depth; each point summarises all grid rows sharing that
# max_depth (seaborn's point estimate, the mean by default).
sns.pointplot(x="max_depth", y="F1-Score", data=df_avgcv)

In [None]:
# Hyper-parameters and averaged metrics of the row with the highest mean F1-Score.
df_avgcv.loc[df_avgcv["F1-Score"].idxmax()]

### Feature Importance chart

In [None]:
# Pair each feature name with its importance in best_model and sort so the
# most important feature comes first. The index keeps each feature's original
# column position.
feature_columns = df_model.drop(output_variable, axis=1).columns
df_featimp = pd.DataFrame(
    {"Variable": feature_columns, "Importance": best_model.feature_importances_}
).sort_values("Importance", ascending=False)

# chart: the five most important variables
df_featimp.head(5).plot(x="Variable", y="Importance", kind="barh")

### Partial dependency plot

In [None]:
# plot_partial_dependence was removed in scikit-learn 1.2; its replacement is
# PartialDependenceDisplay.from_estimator. Bind whichever is available to the
# old name so this and later cells run on both old and new versions.
try:
    from sklearn.inspection import PartialDependenceDisplay
    plot_partial_dependence = PartialDependenceDisplay.from_estimator
except (ImportError, AttributeError):
    from sklearn.inspection import plot_partial_dependence

# Column positions (in X) of the five most important features: df_featimp is
# sorted by importance but its index still holds the original positions.
top_features = [int(position) for position in df_featimp.index[:5]]

# feature_names must follow the column order of X, not the importance order,
# otherwise the plots can be labelled with the wrong variable names.
pdp_feature_names = df_featimp.sort_index()["Variable"].tolist()

fig, ax = plt.subplots(figsize=(10, 10))
plot_partial_dependence(best_model, X, top_features, response_method="predict_proba", 
                        method="brute", grid_resolution=200, ax=ax, 
                        feature_names=pdp_feature_names)

In [None]:
fig, ax = plt.subplots(figsize=(7, 7))

# Two-way partial dependence on the two most important features, given as a
# pair of column positions in X (df_featimp's index holds those positions).
top_pair = (int(df_featimp.index[0]), int(df_featimp.index[1]))

# feature_names must follow the column order of X; df_featimp is sorted by
# importance, so restore the positional order before passing the names.
plot_partial_dependence(best_model, X, [top_pair], 
                        feature_names=df_featimp.sort_index()["Variable"].tolist(), ax=ax, 
                        response_method="predict_proba", method="brute")

## Hyper-parameter tuning + Cross-validation: Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold

# params
# shuffle=True is required for random_state to have any effect: scikit-learn
# >= 0.24 raises a ValueError when random_state is set while shuffle is False.
k_folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=10) # k-fold
hyper_params = {"max_depth": [1, 3, 7, None],
                "max_features": [0.1, 0.25, 0.5],
                "n_estimators": [200]
                }

# performance metrics: display name -> metric function taking (y_true, y_pred)
perf_metrics = {"Accuracy": metrics.accuracy_score, 
                "Precision": metrics.precision_score, 
                "Recall": metrics.recall_score,
                "AUC": metrics.roc_auc_score, 
                "F1-Score": metrics.f1_score, 
                "Brier": metrics.brier_score_loss}

# AUC and Brier must be computed from the predicted probability of class 1,
# as in the manual cross-validation loop earlier in this notebook. A plain
# make_scorer(metric) would feed them the hard class predictions instead.
proba_metrics = ("AUC", "Brier")


def _proba_scorer(metric):
    """Wrap `metric` as a scorer(estimator, X, y) fed with P(class 1)."""
    def scorer(estimator, X_eval, y_eval):
        return metric(y_eval, estimator.predict_proba(X_eval)[:, 1])
    return scorer


# Replace every metric function by a scorer usable by GridSearchCV.
for pf in perf_metrics:
    if pf in proba_metrics:
        perf_metrics[pf] = _proba_scorer(perf_metrics[pf])
    else:
        perf_metrics[pf] = metrics.make_scorer(perf_metrics[pf])

# NOTE: "Brier" is reported as the raw loss (lower is better), so GridSearchCV's
# rank_test_Brier, which ranks larger scores first, should not be used.

# main method
X, y = df_model.drop(labels=output_variable, axis=1).values, df_model[output_variable].values

# Exhaustive search over hyper_params; all metrics are recorded and the final
# model is refit with the combination that has the best mean F1-Score.
model = GridSearchCV(estimator=RandomForestClassifier(random_state=10), 
                     param_grid=hyper_params, 
                     scoring=perf_metrics, 
                     cv=k_folds, 
                     refit="F1-Score")

# y is a column vector (n, 1); the estimator expects a 1-D target
model.fit(X, y.ravel())

### Show results

In [None]:
# get best model: the estimator GridSearchCV refit for the metric named in its
# `refit` argument ("F1-Score" in the search above)
best_model = model.best_estimator_
best_model

In [None]:
# store all results and avg results in a df
df_cv = pd.DataFrame(model.cv_results_)

# add a hyperparameter column in avg df
from sklearn.model_selection import ParameterGrid

# One row per hyper-parameter combination.
# NOTE(review): relies on ParameterGrid enumerating the combinations in the
# same order as the rows of cv_results_ -- confirm.
df_hyperparams = pd.DataFrame(list(ParameterGrid(model.param_grid)))

# Keep only the fold-averaged test score of each metric, renamed to the bare
# metric name ("mean_test_AUC" -> "AUC", ...).
mean_score_names = {"mean_test_" + pf: pf for pf in perf_metrics.keys()}
df_avgcv = df_cv[list(mean_score_names)].rename(columns=mean_score_names)

# hyper-parameters on the left, averaged metrics on the right
df_avgcv = pd.concat([df_hyperparams, df_avgcv], axis=1)

In [None]:
# F1-Score against max_depth; each point summarises all grid rows sharing that
# max_depth (seaborn's point estimate, the mean by default).
# NOTE(review): the grid includes max_depth=None; those rows have no x value
# and are presumably left out of this plot -- confirm.
sns.pointplot(x="max_depth", y="F1-Score", data=df_avgcv)

In [None]:
# Hyper-parameters and averaged metrics of the row with the highest mean F1-Score.
df_avgcv.loc[df_avgcv["F1-Score"].idxmax()]

### Feature Importance chart

In [None]:
# Pair each feature name with its importance in best_model and sort so the
# most important feature comes first. The index keeps each feature's original
# column position.
feature_columns = df_model.drop(output_variable, axis=1).columns
df_featimp = pd.DataFrame(
    {"Variable": feature_columns, "Importance": best_model.feature_importances_}
).sort_values("Importance", ascending=False)

# chart: the five most important variables
df_featimp.head(5).plot(x="Variable", y="Importance", kind="barh")

### Partial dependency plot

In [None]:
# plot_partial_dependence was removed in scikit-learn 1.2; its replacement is
# PartialDependenceDisplay.from_estimator. Bind whichever is available to the
# old name so this and later cells run on both old and new versions.
try:
    from sklearn.inspection import PartialDependenceDisplay
    plot_partial_dependence = PartialDependenceDisplay.from_estimator
except (ImportError, AttributeError):
    from sklearn.inspection import plot_partial_dependence

# Column positions (in X) of the five most important features: df_featimp is
# sorted by importance but its index still holds the original positions.
top_features = [int(position) for position in df_featimp.index[:5]]

# feature_names must follow the column order of X, not the importance order,
# otherwise the plots can be labelled with the wrong variable names.
pdp_feature_names = df_featimp.sort_index()["Variable"].tolist()

fig, ax = plt.subplots(figsize=(10, 10))
plot_partial_dependence(best_model, X, top_features, response_method="predict_proba", 
                        method="brute", grid_resolution=200, ax=ax, 
                        feature_names=pdp_feature_names)

In [None]:
fig, ax = plt.subplots(figsize=(7, 7))

# Two-way partial dependence on the two most important features, given as a
# pair of column positions in X (df_featimp's index holds those positions).
top_pair = (int(df_featimp.index[0]), int(df_featimp.index[1]))

# feature_names must follow the column order of X; df_featimp is sorted by
# importance, so restore the positional order before passing the names.
plot_partial_dependence(best_model, X, [top_pair], 
                        feature_names=df_featimp.sort_index()["Variable"].tolist(), ax=ax, 
                        response_method="predict_proba", method="brute")

## Stacking Approaches

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingClassifier.html

# Other Topics

## Scalable models

### Create dataset with different sizes

In [None]:
# Build three versions of the modelling table to compare training times.
# The bigger ones are made by stacking copies of df_model with a single concat
# each (concatenating inside a loop re-copies the growing frame every pass).

# small: the table itself
df_model_small = df_model.copy()

# medium: 10 stacked copies, re-indexed 0..n-1
df_model_medium = pd.concat([df_model] * 10, axis=0, ignore_index=True)

# large: 100 stacked copies, re-indexed 0..n-1
df_model_large = pd.concat([df_model] * 100, axis=0, ignore_index=True)

### GBT

In [None]:
from time import perf_counter, time
from sklearn.ensemble import GradientBoostingClassifier

# Time one GradientBoostingClassifier fit per dataset size and print the
# elapsed seconds next to the size label.
timed_datasets = [("small", df_model_small),
                  ("medium", df_model_medium),
                  ("large", df_model_large)]

for size_label, df_size in timed_datasets:
    # prepare the inputs outside the timed section so only fit() is measured
    features = df_size.drop(output_variable, axis=1)
    # 1-D target: a single-column frame triggers scikit-learn's column-vector warning
    target = df_size[output_variable[0]]

    start = perf_counter()
    GradientBoostingClassifier(n_estimators=100).fit(features, target)
    print(size_label, perf_counter() - start)

### LightGBM - https://lightgbm.readthedocs.io/en/latest/index.html

In [None]:
#!pip install lightgbm - install it first!

In [None]:
from lightgbm import LGBMClassifier
from time import perf_counter, time
from sklearn.ensemble import GradientBoostingClassifier

# Time one LGBMClassifier fit per dataset size and print the elapsed seconds
# next to the size label (same protocol as the GBT timing cell).
timed_datasets = [("small", df_model_small),
                  ("medium", df_model_medium),
                  ("large", df_model_large)]

for size_label, df_size in timed_datasets:
    # prepare the inputs outside the timed section so only fit() is measured
    features = df_size.drop(output_variable, axis=1).values
    # flatten the (n, 1) target to 1-D, the label shape the sklearn-style fit expects
    target = df_size[output_variable].values.ravel()

    start = perf_counter()
    LGBMClassifier(n_estimators=100).fit(features, target)
    print(size_label, perf_counter() - start)