### Packages

In [None]:
#requirements !pip freeze > requirements.txt.
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# Load and Organise data

## Import data

In [None]:
# Read the German credit dataset straight from the course repository and
# preview the first rows.
data_url = "https://raw.githubusercontent.com/askoshiyama/mli-cohort/master/german_credit.csv"
df = pd.read_csv(data_url)
df.head()

## Pre-process variables

In [None]:
# variable types: show the dtype pandas inferred for every column of the raw frame
df.dtypes

In [None]:
# categorical variables (one-hot encoded in the cells below)
cat_variables = [
    'account_check_status',
    'credit_history',
    'purpose',
    'savings',
    'present_emp_since',
    'personal_status_sex',
    'property',
    'other_installment_plans',
    'housing',
    'job',
    'telephone',
    "other_debtors",
    'foreign_worker',
]

# target variable
output_variable = ["default"]

# other integer variables -- order matters: the consolidation step slices
# int_variables[:-2], so 'age' and 'credit_amount' must stay the last two entries
int_variables = [
    'credits_this_bank',
    'present_res_since',
    'duration_in_month',
    'people_under_maintenance',
    'installment_as_income_perc',
    'age',
    'credit_amount',
]
# list(set(df.columns) - set(output_variable) - set(cat_variables))

In [None]:
# one-hot encoding fitting (all categorical variables at once)
# The import lives in this cell so it survives "Restart & Run All": the
# notebook otherwise only imports OneHotEncoder in the following cell, which
# makes this cell fail with a NameError on a fresh kernel.
from sklearn.preprocessing import OneHotEncoder

one_hot_func = OneHotEncoder().fit(df[cat_variables])

# mapping: dense 0/1 matrix with one column per (variable, category) pair
cat_mapped = one_hot_func.transform(df[cat_variables]).toarray()

In [None]:
# one-hot encoding of categorical variables
from sklearn.preprocessing import OneHotEncoder

# empty frame sharing df's index; dummy columns are appended one by one
df_cat = pd.DataFrame(index=df.index)

# Encoding one variable at a time is not required (the encoder accepts many
# columns at once); it is done this way for pedagogical reasons.
for cat in cat_variables:
    # fit on the single column, then map it to a dense indicator matrix
    one_hot_func = OneHotEncoder().fit(df[[cat]])
    cat_mapped = one_hot_func.transform(df[[cat]]).toarray()

    # store each indicator under the name "<variable>_<category label>"
    dummy_names = [cat + "_" + cat_label for cat_label in one_hot_func.categories_[0]]
    for dummy_name, dummy_column in zip(dummy_names, cat_mapped.T):
        df_cat[dummy_name] = dummy_column

# quick check: column means are the share of rows in each category
df_cat.mean()

In [None]:
# bracketing integer variable - age
from sklearn.preprocessing import KBinsDiscretizer

# three quantile-based bins, returned as one-hot indicator columns
age_labels = ["young", "adult", "senior"]
kbin_func = KBinsDiscretizer(n_bins=3, encode='onehot', strategy='quantile').fit(df[["age"]])
age_onehot = kbin_func.transform(df[["age"]]).toarray()
df_age = pd.DataFrame(age_onehot, columns=age_labels)

# checking: indicators side by side with the raw age
pd.concat([df_age, df[["age"]]], axis=1).head(10)

In [None]:
# other types of transformations possible - log transformation
from sklearn.preprocessing import FunctionTransformer

# forward map is log, inverse map is exp
log_mapping = FunctionTransformer(func=np.log, inverse_func=np.exp)
credit_log = log_mapping.transform(df["credit_amount"])

# histograms before and after the transformation, side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
ax1.hist(df["credit_amount"])
ax1.set_title("Pre-transformation")

ax2.hist(credit_log)
ax2.set_title("After log-transformation")

# keep the transformed amount as a one-column frame for the final dataset
df_log = pd.DataFrame(credit_log.values, columns=["log(credit)"])

## Consolidating a final dataset

In [None]:
# Column-wise concatenation of every model input plus the target.
# int_variables[:-2] leaves out the last two entries of the list ('age' and
# 'credit_amount'); those enter through df_age (age brackets) and df_log
# (log of the credit amount) instead.
df_model = pd.concat([df[int_variables[:-2]], df_cat, df_age, df_log, df[output_variable]], axis=1)
# shapes of the raw frame and of the modelling frame
df.shape, df_model.shape

In [None]:
# preview the first rows of the consolidated modelling frame
df_model.head()

## Visualization: PCA, t-SNE

In [None]:
from sklearn.decomposition import PCA

# inputs (every column but the target) and target as numpy arrays
X = df_model.drop(labels=output_variable, axis=1).values
y = df_model[output_variable].values

# project the inputs onto the first two principal components
pca = PCA(n_components=2)
X_r = pca.fit(X).transform(X)

# Percentage of variance explained for each components
print('explained variance ratio (first two components): %s'
      % str(pca.explained_variance_ratio_))

plt.figure()
colors = ['navy', 'turquoise']
target_names = ["No Default", "Default"]
lw = 2

# one scatter layer per class so each gets its own colour and legend entry
for color, i, target_name in zip(colors, [0, 1], target_names):
    mask = (y == i).ravel()
    plt.scatter(X_r[mask, 0], X_r[mask, 1], color=color, alpha=.8, lw=lw, label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('PCA of German Credit dataset')

In [None]:
from sklearn.manifold import TSNE

# inputs (every column but the target) and target as numpy arrays
X = df_model.drop(labels=output_variable, axis=1).values
y = df_model[output_variable].values

# two-dimensional t-SNE embedding with a fixed seed
tsne = TSNE(n_components=2, init='random', random_state=0, perplexity=5)
X_r = tsne.fit_transform(X)

plt.figure()
colors = ['navy', 'turquoise']
target_names = ["No Default", "Default"]
lw = 2

# one scatter layer per class so each gets its own colour and legend entry
for color, i, target_name in zip(colors, [0, 1], target_names):
    mask = (y == i).ravel()
    plt.scatter(X_r[mask, 0], X_r[mask, 1], color=color, alpha=.8, lw=lw, label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('t-SNE of German Credit dataset')

# Modelling

## Kernel SVM: Grid-search + Stratified K-fold-CV

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold

# params
# random_state only has an effect when shuffle=True; scikit-learn >= 0.24
# raises a ValueError if a seed is given while shuffle is left at False.
k_folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=10) # 3-fold

hyper_params = {"kernel_svc__C": [10.00 ** 2, 10.00 ** 3, 10.00 ** 4], # 3 values
                # 5 values, expressed as multiples of 1 / (number of columns in df_model)
                "kernel_svc__gamma": (np.array([10.0 ** -2, 10.0 ** -1, 10.0 ** 0, 10.0 ** 1, 10.0 ** 2]) * 1.0/df_model.shape[1]).tolist(),
                # [10.0 ** -3, 10.0 ** -2, 10.0 ** -1, 10.0 ** 0, 10.0 ** 1] * 1.0/df_model.shape[1],
                "kernel_svc__kernel": ["rbf"], # rbf (radial basis function) a.k.a. gaussian kernel
                "kernel_svc__probability": [True] # needed so the probability-based scorers can call predict_proba
                }

# = 3 x 5 = 15 candidate models, each fitted on the 3 folds (45 fits) plus one final refit

# performance metrics
perf_metrics = {"Accuracy": metrics.accuracy_score, 
                "Precision": metrics.precision_score, 
                "Recall": metrics.recall_score,
                "AUC": metrics.roc_auc_score, 
                "F1-Score": metrics.f1_score, 
                "Brier": metrics.brier_score_loss
               }
# AUC and Brier are defined on predicted probabilities; the default scorer
# would feed them hard 0/1 labels from predict(), which degrades both metrics.
proba_metrics = ("AUC", "Brier")
for pf in perf_metrics:
    perf_metrics[pf] = metrics.make_scorer(perf_metrics[pf], needs_proba=(pf in proba_metrics))

# main method
# split the data into inputs and output
X, y = df_model.drop(labels=output_variable, axis=1).values, df_model[output_variable].values

# instantiate the model - Pipeline where we scale the variables using StandardScaler = (x - mu)/sigma; kernel SVM
model_pipeline = Pipeline([("scaler", StandardScaler()),
                           ("kernel_svc", SVC())]
                         )

# run the grid-search CV procedure; the winner by mean F1-Score is refitted on all data
model = GridSearchCV(estimator=model_pipeline, 
                     param_grid=hyper_params, 
                     scoring=perf_metrics, 
                     cv=k_folds, 
                     refit="F1-Score"
                    )
# y is a column vector (n, 1); ravel() hands scikit-learn the 1-D target it expects
model.fit(X, y.ravel())

In [None]:
# - done by GridSearchCV
# for every hyperparameter
## for every (train, test) fold
### model_pipeline.fit(X[train], y[train])
### y_pred = model_pipeline.predict(X[test])
### compute performance metrics
### store results

### Show results

In [None]:
# get best model: the pipeline refitted by GridSearchCV with the winning
# hyper-parameters (selected on the metric named in refit=)
best_model = model.best_estimator_
best_model

In [None]:
# store all results and avg results in a df
df_cv = pd.DataFrame(model.cv_results_)

# add a hyperparameter column in avg df
from sklearn.model_selection import ParameterGrid

# one row per candidate, enumerated from the same grid the search was given
df_hyperparams = pd.DataFrame(list(ParameterGrid(model.param_grid)))

# keep only the fold-averaged test score of every metric, under the metric's own name
metric_names = list(perf_metrics.keys())
df_avgcv = df_cv[["mean_test_" + pf for pf in metric_names]].rename(
    columns={"mean_test_" + pf: pf for pf in metric_names}
)
df_avgcv = pd.concat([df_hyperparams, df_avgcv], axis=1)

In [None]:
# mean cross-validated F1-Score for each value of C (rows sharing a C are aggregated by pointplot)
sns.pointplot(x="kernel_svc__C", y="F1-Score", data=df_avgcv)

In [None]:
# mean cross-validated F1-Score for each value of gamma (rows sharing a gamma are aggregated by pointplot)
sns.pointplot(x="kernel_svc__gamma", y="F1-Score", data=df_avgcv)

In [None]:
# hyper-parameters and averaged metrics of the candidate with the highest mean F1-Score
df_avgcv.loc[df_avgcv["F1-Score"].idxmax()]

### Feature Importance chart

In [None]:
from sklearn.inspection import permutation_importance

# permutation importance of every input for the refitted best model,
# computed on the same X, y the grid search was fitted on
feat_imp = permutation_importance(best_model, X, y, n_repeats=5, random_state=10)

# rescale so that the most important variable has importance 1, then sort
mean_importance = feat_imp["importances_mean"]
df_featimp = pd.DataFrame(
    {"Variable": df_model.drop(output_variable, axis=1).columns,
     "Importance": mean_importance / mean_importance.max()}
).sort_values("Importance", ascending=False)

# chart: top five variables
df_featimp.iloc[:5].plot(x="Variable", y="Importance", kind="barh")

### Partial dependence plot

In [None]:
# One-way partial dependence of the predicted probability on each of the five
# most important variables (by the permutation ranking in df_featimp).
# NOTE(review): plot_partial_dependence was removed in scikit-learn 1.2; on
# newer versions use PartialDependenceDisplay.from_estimator instead.
from sklearn.inspection import plot_partial_dependence
# labelled copy of X so features can be selected by column name
z = pd.DataFrame(X, columns=df_model.drop(output_variable, axis=1).columns)
fig, ax = plt.subplots(figsize=(15, 10))
plot_partial_dependence(best_model, z, df_featimp["Variable"].iloc[:5], 
                        ax=ax, response_method="predict_proba", 
                        method="brute", grid_resolution=200
                       ) 

In [None]:
# Two-way partial dependence between the 1st and 3rd most important variables.
# The axes are passed through ax= (as in the one-way plot): the fig= keyword
# was deprecated in scikit-learn 0.22 and later removed, and the freshly
# created `ax` would otherwise go unused.
fig, ax = plt.subplots(figsize=(7, 7))
plot_partial_dependence(best_model, z, [(df_featimp["Variable"].iloc[0], df_featimp["Variable"].iloc[2])],
                        ax=ax, response_method="predict_proba", method="brute")

In [None]:
from sklearn.manifold import TSNE

# Build X first and only then predict on it. The original tuple assignment
# evaluated best_model.predict(X) with whatever X an earlier cell had left
# behind, which silently breaks if that X came from a different frame.
X = df_model.drop(labels=output_variable, axis=1).values
y = best_model.predict(X)

# two-dimensional t-SNE embedding with a fixed seed
tsne = TSNE(n_components=2, init='random', random_state=0, perplexity=10)
X_r = tsne.fit_transform(X)

plt.figure()
colors = ['navy', 'turquoise']
target_names = ["No Default", "Default"]
lw = 2

# colour the embedding by the class predicted by the best model
for color, i, target_name in zip(colors, [0, 1], target_names):
    mask = (y == i).ravel()
    plt.scatter(X_r[mask, 0], X_r[mask, 1], color=color, alpha=.8, lw=lw, label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('t-SNE of German Credit dataset - Predicted')

In [None]:
from sklearn.manifold import TSNE

# inputs (every column but the target) and the observed target
X = df_model.drop(labels=output_variable, axis=1).values
y = df_model[output_variable].values

# same embedding settings as the "Predicted" plot so the two are comparable
tsne = TSNE(n_components=2, init='random', random_state=0, perplexity=10)
X_r = tsne.fit_transform(X)

plt.figure()
colors = ['navy', 'turquoise']
target_names = ["No Default", "Default"]
lw = 2

# colour the embedding by the observed class
for color, i, target_name in zip(colors, [0, 1], target_names):
    mask = (y == i).ravel()
    plt.scatter(X_r[mask, 0], X_r[mask, 1], color=color, alpha=.8, lw=lw, label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('t-SNE of German Credit dataset - Observed')

## Reducing optimism bias in hyper-parameter optimization, and comparing strategies: Nested Cross-Validation/Resampling

### Initial Params

In [None]:
def MCC(y_true, y_pred):
    """Matthews correlation coefficient for binary labels coded as 0/1.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted labels; any shape, flattened before use.

    Returns
    -------
    float
        (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN)), or 0.0 when a
        margin of the confusion matrix is empty (e.g. every prediction is the
        same class). Without that guard the ratio is 0/0 = NaN, which breaks
        model ranking when this metric drives hyper-parameter selection.
    """
    true = np.asarray(y_true).ravel()
    pred = np.asarray(y_pred).ravel()

    # confusion matrix components
    TP = int(np.sum((true == 1) & (pred == 1))) # catch the criminal
    TN = int(np.sum((true == 0) & (pred == 0))) # there was no criminal
    FP = int(np.sum((true == 0) & (pred == 1))) # false alarm
    FN = int(np.sum((true == 1) & (pred == 0))) # broken alarm

    # numerator
    Num = (TP * TN - FP * FN)

    # denominator: square roots taken term by term, as in the textbook formula
    Den = np.sqrt(TP + FP)*np.sqrt(TP + FN)*np.sqrt(TN + FP)*np.sqrt(TN + FN)

    # degenerate confusion matrix: the coefficient is defined as 0 by convention
    if Den == 0:
        return 0.0

    return float(Num/Den)

class KPI:
    """Monetary score of a set of 0/1 predictions, weighted by loan amount."""

    def __init__(self, loan):
        # per-sample loan amounts; must broadcast against the label arrays
        # handed to KPI()
        self.loan = loan

    def KPI(self, y_true, y_pred):
        """Sum of loan amounts where true and predicted labels are both 1,
        minus the sum of loan amounts where the true label is 0 but 1 was
        predicted.

        NOTE(review): elsewhere in this notebook label 1 is plotted as
        "Default", while here true == 1 is counted as repaid -- confirm the
        intended label convention.
        """
        truth = y_true.ravel()
        guess = y_pred.ravel()
        flagged = (guess == 1)

        amount_repaid = (truth == 1) * flagged * self.loan
        amount_loss = (truth == 0) * flagged * self.loan
        return np.sum(amount_repaid - amount_loss)

In [None]:
# KPI bound to the loan amounts of every row of df_model (exp undoes the log transform)
kpi = KPI(np.exp(df_model["log(credit)"].values))

In [None]:
# smoke test with unseeded coin-flip labels and predictions; the arrays must
# match the length of the loan vector (assumes df_model has 1000 rows -- TODO confirm)
kpi.KPI((np.random.rand(1000) > 0.5) * 1, (np.random.rand(1000) > 0.5) * 1)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
#from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# k-fold params
# random_state only has an effect when shuffle=True; scikit-learn >= 0.24
# raises a ValueError if a seed is given while shuffle is left at False.
inner_folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=10) # inner k-fold (hyper-parameter tuning)
outer_folds = StratifiedKFold(n_splits=2, shuffle=True, random_state=10) # outer k-fold (model comparison)

# models -- scale-sensitive learners are wrapped in a Pipeline with a StandardScaler
model_dict = {"GBT": GradientBoostingClassifier(random_state=10),
              "Tree": DecisionTreeClassifier(random_state=10),
              
              "MLP": Pipeline([("std", StandardScaler()), 
                               ("mlp", MLPClassifier(random_state=10))]),
              
              "KNN": Pipeline([("std", StandardScaler()),
                               ("knn", KNeighborsClassifier())]),
              
              "Logistic": LogisticRegression(penalty="none", solver="lbfgs"),
              "LassoLogistic": Pipeline([("std", StandardScaler()), 
                                         ("lasso", LogisticRegression(penalty="l1", solver='liblinear'))]),
              "ElasticNet": Pipeline([("std", StandardScaler()), 
                                      ("en", LogisticRegression(penalty="elasticnet", solver="saga"))]),
              # random baselines: any useful model should beat these
              "UnifRandom": DummyClassifier(strategy="uniform"), 
              "StratRandom": DummyClassifier(strategy="stratified"),
              "SVM": Pipeline([("std", StandardScaler()), 
                               ("svc", SVC(kernel="rbf", probability=True))]),
              #"Naive Bayes": GaussianNB()
             }

# models hyperparams -- keys mirror model_dict; pipeline steps use the "<step>__<param>" syntax
hyper_params = {"GBT": {"learning_rate": [10.0 ** -1, 10.0 ** 0, 2.0],
                        "max_depth": [1, 2, 3],
                        "max_features": [0.1, 0.25, 0.5],
                        "n_estimators": [200]},
                
                "Tree": {"max_depth": [1, 3, 5, 7, 9],
                         "class_weight": [None, "balanced"]},
                
                "MLP": {"mlp__hidden_layer_sizes": [(5,), (10,), (50,)], 
                        "mlp__activation": ['relu', 'tanh']
                       },
                "KNN": {"knn__n_neighbors": [1, 3, 5, 7]
                       },
                "Logistic": {"class_weight": [None, "balanced"]},
                
                "LassoLogistic": {"lasso__C": [10.0 ** -4, 10.0 ** -3, 10.0 ** -2, 10.0 ** -1, 10.0 ** 0],
                                  "lasso__class_weight": [None, "balanced"]},
                
                "ElasticNet": {"en__C": [10.0 ** -4, 10.0 ** -3, 10.0 ** -2, 10.0 ** -1, 10.0 ** 0],
                               "en__l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9],
                               "en__class_weight": [None, "balanced"]},
                
                # nothing to tune for the random baselines
                "UnifRandom": {},
                "StratRandom": {},
                
                "SVM": {"svc__C": [10.0 ** -2, 10.0 ** -1, 10.0 ** 0, 10.0 ** 1, 10.00 ** 2], 
                        # multiples of 1 / (number of columns in df_model)
                        "svc__gamma": (np.array([10.0 ** -2, 10.0 ** -1, 10.0 ** 0, 10.0 ** 1, 10.0 ** 2]) * 1.0/df_model.shape[1]).tolist(),
                        "svc__class_weight": [None, "balanced"]}
                #"Naive Bayes": {}
                }

# performance metrics: plain (y_true, y_pred) callables, kept separately from
# the scorer objects so they can also be called directly on outer-fold predictions
perf_metrics = {"Accuracy": metrics.accuracy_score, 
                "Precision": metrics.precision_score, 
                "Recall": metrics.recall_score,
                "AUC": metrics.roc_auc_score, 
                "F1-Score": metrics.f1_score, 
                "Brier": metrics.brier_score_loss,
                "MCC": MCC,
                # NOTE(review): this KPI is bound to the loan vector of the whole
                # dataset, so it only lines up with label arrays of that same length
                "KPI": KPI(np.exp(df_model["log(credit)"].values)).KPI
               }
# AUC and Brier are defined on predicted probabilities; the default scorer
# would feed them hard 0/1 labels from predict(), which degrades both metrics.
proba_metrics = ("AUC", "Brier")
scorer_metrics = {}
for pf in perf_metrics:
    scorer_metrics[pf] = metrics.make_scorer(perf_metrics[pf], needs_proba=(pf in proba_metrics))

### Fitting models

In [None]:
# pre-allocation
X, y = df_model.drop(labels=output_variable, axis=1).values, df_model[output_variable].values
innercv_results, inner_best_model = {}, {}
outer_records = []  # one dict per (outer fold, model); turned into a frame once at the end

# Loan amounts aligned row-by-row with X, so they can be sliced with the fold indices.
loan_amounts = np.exp(df_model["log(credit)"].values)

# The KPI metric multiplies the labels by a per-sample loan vector. The instance
# stored in perf_metrics carries the loans of the *whole* dataset, so calling it
# on a fold fails with a shape-mismatch error. A plain (y_true, y_pred) scorer
# cannot know which rows it is scoring, so KPI is left out of the inner search
# and evaluated on the outer test fold with the matching loan slice instead.
inner_scorers = {name: scorer for name, scorer in scorer_metrics.items() if name != "KPI"}

# outer loop
for k, (out_train, out_test) in enumerate(outer_folds.split(X, y)):
    # separation: train, test
    Xtrain, Xtest = X[out_train], X[out_test]
    ytrain, ytest = y[out_train], y[out_test]
    fold_key = "fold_" + str(k)

    # inner loop -- all models -- hyperparameter fine-tuning
    innercv_results[fold_key] = {}
    inner_best_model[fold_key] = {}

    for class_model in model_dict.keys():
        print((k, class_model))
        search = GridSearchCV(estimator=model_dict[class_model],
                              param_grid=hyper_params[class_model],
                              scoring=inner_scorers,
                              cv=inner_folds,
                              refit="MCC")
        search.fit(Xtrain, ytrain.ravel())
        innercv_results[fold_key][class_model] = search
        inner_best_model[fold_key][class_model] = search.best_estimator_

    # prediction -- all models -- compute performance metrics at the same level playing field
    for class_model in model_dict.keys():
        # prediction: hard labels and probability of class 1
        tuned_model = inner_best_model[fold_key][class_model]
        ypred_class = tuned_model.predict(Xtest)
        ypred_prob = tuned_model.predict_proba(Xtest)[:, 1]
        record = {"Model": class_model}

        # compute performance metrics
        for pf in perf_metrics.keys():
            if pf in ["AUC", "Brier"]:
                # probability-based metrics
                record[pf] = perf_metrics[pf](ytest, ypred_prob)
            elif pf == "KPI":
                # loan amounts restricted to the rows of this outer test fold
                record[pf] = KPI(loan_amounts[out_test]).KPI(ytest, ypred_class)
            else:
                record[pf] = perf_metrics[pf](ytest, ypred_class)
        outer_records.append(record)

# final organisation
df_outer_results = pd.DataFrame(outer_records, columns=["Model"] + list(perf_metrics.keys()))
df_outer_results[list(perf_metrics.keys())] = df_outer_results[list(perf_metrics.keys())].astype(float)

In [None]:
# cast the metric columns to float (repeats the last statement of the fitting cell; safe to re-run)
df_outer_results[list(perf_metrics.keys())] = df_outer_results[list(perf_metrics.keys())].astype(float)

### Show results

In [None]:
# average of every metric over the outer folds, one row per model
df_outer_results.pivot_table(index=["Model"], aggfunc="mean")

In [None]:
# rank the models within each metric column (rank 1 = smallest averaged value)
df_outer_results.pivot_table(index=["Model"], aggfunc="mean").rank()

In [None]:
# outer-fold MCC per model (pointplot aggregates the folds of each model)
fig, ax = plt.subplots(1,1,figsize=(13,5))
sns.pointplot(x="Model", y="MCC", data=df_outer_results, ax=ax)

In [None]:
# long format: one row per (model, metric, fold) so every metric can be drawn as its own hue
df_melt = df_outer_results.melt(id_vars=["Model"], value_vars=list(perf_metrics.keys()))
fig, ax = plt.subplots(1,1,figsize=(13,5))
sns.pointplot(x="Model", y="value", hue="variable", data=df_melt, ax=ax)

In [None]:
# average of every metric over the outer folds, one row per model (same table as shown above)
df_outer_results.pivot_table(index=["Model"], aggfunc="mean")

In [None]:
# averaged metrics per model
a = df_outer_results.pivot_table(index=["Model"], aggfunc="mean")
# Brier is a loss (lower is better); 1 - Brier makes "larger is better" hold for it before ranking
a[["Brier"]] = 1 - a[["Brier"]]
# per-metric ranks across models
a.rank()

In [None]:
# average rank of each model across all metrics
a.rank().mean(axis=1)

# Other Topics

## Scalable models

### Create dataset with different sizes

In [None]:
# Three versions of the modelling frame, used to compare training times.
# Each is built with a single pd.concat call instead of growing a frame inside
# a loop, which re-copies everything accumulated so far on every iteration.

# small: the original rows
df_model_small = df_model.copy()

# medium: 10 stacked copies of the original rows
df_model_medium = pd.concat([df_model] * 10, axis=0, ignore_index=True)

# large: 100 stacked copies of the original rows
df_model_large = pd.concat([df_model] * 100, axis=0, ignore_index=True)

### GBT

In [None]:
from time import time
from sklearn.ensemble import GradientBoostingClassifier

# Time one GradientBoostingClassifier fit per dataset size, printing the
# elapsed seconds in the order small, medium, large.
for df_size in (df_model_small, df_model_medium, df_model_large):
    # data preparation stays outside the timed region; the target is flattened
    # to 1-D so scikit-learn does not warn about a column-vector y
    X_size = df_size.drop(output_variable, axis=1)
    y_size = df_size[output_variable].values.ravel()

    start = time()
    GradientBoostingClassifier(n_estimators=100).fit(X_size, y_size)
    print(time() - start)

### LightGBM - https://lightgbm.readthedocs.io/en/latest/index.html

In [None]:
from lightgbm import LGBMClassifier
from time import time
from sklearn.ensemble import GradientBoostingClassifier

# Time one LGBMClassifier fit per dataset size, printing the elapsed seconds
# in the order small, medium, large (same protocol as the GBT timing).
for df_size in (df_model_small, df_model_medium, df_model_large):
    # data preparation stays outside the timed region; the target is flattened
    # to 1-D instead of being passed as an (n, 1) column vector
    X_size = df_size.drop(output_variable, axis=1).values
    y_size = df_size[output_variable].values.ravel()

    start = time()
    LGBMClassifier(n_estimators=100).fit(X_size, y_size)
    print(time() - start)

### (SGDClassifier, Linear SVM) + Feature Mapping - almost a kernel approach

#### Nystroem

In [None]:
# Nystroem kernel approximation: map the inputs to a 2-component feature space.
# X is not defined in this cell; it is whatever the last executed cell left behind.
# No random_state is set, so the output can change from run to run.
from sklearn.kernel_approximation import Nystroem
nyst = Nystroem(gamma=0.001, n_components=2)
nyst.fit_transform(X)

In [None]:
# inputs (every column but the target) and target as numpy arrays
X, y = df_model.drop(labels=output_variable, axis=1).values, df_model[output_variable].values

# 2-component Nystroem feature map; seeded so the figure is reproducible
nyst = Nystroem(gamma=.01, n_components=2, random_state=10)
X_r = nyst.fit_transform(X)

# This cell used to print pca.explained_variance_ratio_ and title the figure
# "PCA ..."; both were copy-paste leftovers from the PCA cell and described a
# stale `pca` object, not the Nystroem map plotted here.

plt.figure()
colors = ['navy', 'turquoise']
target_names = ["No Default", "Default"]
lw = 2

# one scatter layer per class so each gets its own colour and legend entry
for color, i, target_name in zip(colors, [0, 1], target_names):
    mask = (y == i).ravel()
    plt.scatter(X_r[mask, 0], X_r[mask, 1], color=color, alpha=.8, lw=lw, label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('Nystroem feature map of German Credit dataset')

#### Initial params

In [None]:
# GridSearchCV is used by the fitting cell below; importing it here keeps this
# section runnable on its own instead of relying on an earlier section's import.
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.kernel_approximation import Nystroem
from lightgbm import LGBMClassifier

# k-fold params
# random_state only has an effect when shuffle=True; scikit-learn >= 0.24
# raises a ValueError if a seed is given while shuffle is left at False.
inner_folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=10) # inner k-fold
outer_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=10) # outer k-fold

# models -- the linear learners get a Nystroem feature map in front of them
model_dict = {"LightGBM": LGBMClassifier(random_state=10),
              "SGD": Pipeline([("std", StandardScaler()),
                               ("map", Nystroem(random_state=10)),
                               ("sgd", SGDClassifier(random_state=10))]),
              # the final step is also named "sgd" although it is a LinearSVC;
              # the hyper-parameter keys below depend on that name
              "LinearSVM": Pipeline([("std", StandardScaler()),
                                     ("map", Nystroem(random_state=10)),
                                     ("sgd", LinearSVC(dual=False, random_state=10))])
             }

# models hyperparams
hyper_params = {"LightGBM": {"learning_rate": [10.0 ** -1, 10.0 ** 0, 2.0],
                             "max_depth": [1, 2, 3],
                             #"max_features": [0.1, 0.25, 0.5],
                             "n_estimators": [200]},
                # NOTE(review): SGDClassifier's default learning_rate is "optimal",
                # which per the scikit-learn docs does not use eta0 -- confirm that
                # the eta0 dimension of this grid is intended
                "SGD": {"sgd__loss": ["hinge", "log"],
                        "sgd__alpha": [0.01, 0.1, 1.0, 10.0],
                        "sgd__class_weight": [None, "balanced"],
                        "sgd__eta0": [0.001, 0.01, 0.1],
                        "sgd__max_iter": [100],
                        "map__gamma": [0.0001, 0.001, 0.01, 0.1]},
                "LinearSVM": {"sgd__C": [0.01, 0.1, 1.0, 10.0],
                              "sgd__class_weight": [None, "balanced"],
                              "map__gamma": [0.0001, 0.001, 0.01, 0.1]}
               }

# performance metrics -- label-based only; the probability-based ones stay commented out
perf_metrics = {"Accuracy": metrics.accuracy_score, 
                "Precision": metrics.precision_score, 
                "Recall": metrics.recall_score,
               # "AUC": metrics.roc_auc_score, 
                "F1-Score": metrics.f1_score, 
               # "Brier": metrics.brier_score_loss
               }
scorer_metrics = {}
for pf in perf_metrics:
    scorer_metrics[pf] = metrics.make_scorer(perf_metrics[pf])

#### Fit models

In [None]:
# pre-allocation
# NOTE(review): df_model_medium is built from stacked copies of df_model, so
# identical rows can land on both sides of a split; read the scores below as
# part of a timing exercise rather than as generalisation estimates.
X = df_model_medium.drop(labels=output_variable, axis=1).values
y = df_model_medium[output_variable].values
k, z, innercv_results, inner_best_model = 0, 0, {}, {}
df_outer_results = pd.DataFrame(index=[0], columns=["Model", "Time"] + list(perf_metrics.keys()))

# outer loop
for (out_train, out_test) in outer_folds.split(X, y):
    # separation: train, test
    Xtrain, Xtest = X[out_train], X[out_test]
    ytrain, ytest = y[out_train], y[out_test]
    fold_key = "fold_" + str(k)

    # inner loop -- all models; wall-clock time of each grid search is recorded
    innercv_results[fold_key] = {}
    inner_best_model[fold_key] = {}
    start, end = [], []
    for class_model in model_dict.keys():
        start.append(time())
        print((k, class_model))
        search = GridSearchCV(estimator=model_dict[class_model],
                              param_grid=hyper_params[class_model],
                              scoring=scorer_metrics,
                              cv=inner_folds,
                              refit="F1-Score")
        innercv_results[fold_key][class_model] = search
        search.fit(Xtrain, ytrain.ravel())
        inner_best_model[fold_key][class_model] = search.best_estimator_
        end.append(time())

    # outer loop prediction
    for (v, class_model) in enumerate(model_dict.keys()):
        # prediction on the held-out outer fold
        ypred_class = inner_best_model[fold_key][class_model].predict(Xtest)
        df_outer_results.loc[z, "Model"] = class_model
        # seconds spent tuning this model on this outer fold
        df_outer_results.loc[z, "Time"] = end[v] - start[v]

        # compute performance metrics
        for pf in perf_metrics.keys():
            df_outer_results.loc[z, pf] = perf_metrics[pf](ytest, ypred_class)
        z += 1

    # iteration
    k += 1

# final organisation: metric and timing columns as floats
numeric_columns = list(perf_metrics.keys()) + ["Time"]
df_outer_results[numeric_columns] = df_outer_results[numeric_columns].astype(float)

#### Show results

In [None]:
# average metrics and tuning time over the outer folds, one row per model
df_outer_results.pivot_table(index=["Model"], aggfunc="mean")

### Deep Gaussian Processes

#### https://gpytorch.ai/

## Other Hyperparameter optimizers

### Bayesian Optimization

#### https://scikit-optimize.github.io/#skopt.gp_minimize

### Tree of Parzen Estimators

#### https://github.com/hyperopt/hyperopt

### Evolutionary Strategies

#### https://deap.readthedocs.io/en/master/

## Explainable Models

### Explainable Boosting Machines

#### https://github.com/microsoft/interpret

### SHAP

#### https://github.com/slundberg/shap

### LIME

#### https://github.com/marcotcr/lime

### Captum

#### https://github.com/pytorch/captum?fbclid=IwAR0P_CvwX-dkUztoUBrcfFDygGGIHRt775pwik86npEUiKuUKudtx55kRDw

## Dealing with time series

### Cross-validation schemes

#### https://github.com/askoshiyama/tsdata-mvapi

### Scikit-learn for time series

#### https://github.com/alan-turing-institute/sktime

### Amazon GluonTS

#### https://github.com/awslabs/gluon-ts