In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

#Support vector machines 
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier


In [24]:
clfs = {
    "CART": DecisionTreeClassifier(random_state=1234),
    "RNF": RandomForestClassifier(random_state=1234),
    "XGB": XGBClassifier(use_label_encoder=False),
    "CAT": CatBoostClassifier(random_state=1234),
    "ADA": AdaBoostClassifier(SVC(random_state=1234, kernel='rbf', probability=True)),
    "BAG": BaggingClassifier(SVC(random_state=1234, kernel='rbf', probability=True))
}

In [25]:
datasets = ['breast', 'campus', 'churn', 'climate',
            'compas', 'diabetes', 'german', 'heart',
            'adult', 'student', 'bank', 'credit']

In [26]:
from sklearn.metrics import recall_score, precision_score, accuracy_score,f1_score, auc, roc_curve
metrics_dict = {
    "recall": recall_score,
    'precision': precision_score,
    'accuracy': accuracy_score,
    'f1': f1_score,
    'auc': auc
}

In [27]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
n_datasets = len(datasets)
n_splits = 10
# repeats 5, splits 2
skf = StratifiedKFold(n_splits=n_splits, random_state=1234, shuffle=True)

scores = np.zeros((len(clfs), n_datasets, n_splits, len(metrics_dict)))
# loss = np.zeros((len(clfs), n_datasets, n_splits))

from sklearn.base import clone 
from sklearn import metrics
import pandas as pd

import helper
import importlib
importlib.reload(helper)
from sklearn.pipeline import make_pipeline

for data_id, dataset in enumerate(datasets):
    X=pd.read_csv(f"../datasets/cleaned/{dataset}_X.csv")
    X = X.drop("Unnamed: 0", axis=1)
    y = pd.read_csv(f"../datasets/cleaned/{dataset}_y.csv")
    y = y.drop("Unnamed: 0", axis=1)

    features_types_df = pd.read_csv(f"../datasets/cleaned/datatypes/{dataset}.csv")

    feature_inidices = list(map(int, list(features_types_df)))
    features_names = list(features_types_df.T[0])
    features_types = list(map(int, list(features_types_df.T[1])))

    preprocess = helper.select_preprocessing_for_many_feat(feature_inidices, features_types, features_names)

    for fold_id, (train, test) in enumerate(skf.split(X, y)):
        for clf_id, clf_name in enumerate(clfs):
            clf = clone(clfs[clf_name])
            clf_pipeline = make_pipeline(
                preprocess,
                clf
            )
                
            clf_pipeline.fit(X.iloc[train], y.iloc[train])
            y_preds = clf_pipeline.predict(X.iloc[test])

            for metric_id, metric in enumerate(metrics_dict):
                if metric_id == 4:
                    fpr, tpr, thresholds = metrics.roc_curve(y.iloc[test], y_preds)
                    scores[clf_id, data_id, fold_id, metric_id] = metrics.auc(fpr, tpr)
                else:
                    scores[clf_id, data_id, fold_id, metric_id] = metrics_dict[metric](y.iloc[test], y_preds)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Learning rate set to 0.007741
0:	learn: 0.6808697	total: 28ms	remaining: 27.9s
1:	learn: 0.6681045	total: 88.7ms	remaining: 44.3s
2:	learn: 0.6553108	total: 108ms	remaining: 36s
3:	learn: 0.6429219	total: 127ms	remaining: 31.6s
4:	learn: 0.6308460	total: 147ms	remaining: 29.3s
5:	learn: 0.6182754	total: 174ms	remaining: 28.8s
6:	learn: 0.6081016	total: 224ms	remaining: 31.8s
7:	learn: 0.5970539	total: 243ms	remaining: 30.1s
8:	learn: 0.5874324	total: 267ms	remaining: 29.4s
9:	learn: 0.5762222	total: 298ms	remaining: 29.5s
10:	learn: 0.5668718	total: 324ms	remaining: 29.1s
11:	learn: 0.5583171	total: 358ms	remaining: 29.4s
12:	learn: 0.5492759	total: 395ms	remaining: 30s
13:	learn: 0.5385943	total: 422ms	remaining: 29.8s
14:	learn: 0.5286511	total: 449ms	remaining: 29.5s
15:	learn: 0.5198395	total: 482ms	remaining: 29.6s
16:	learn: 0.5108524	total: 525ms	remaining: 30.3s
17:	learn: 0.5011868	total: 565ms	remaining: 30.8s
18:	learn: 0.4941985	total: 608ms	remaining: 31.4s
19:	learn: 0.48

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Learning rate set to 0.007741
0:	learn: 0.6815539	total: 53.8ms	remaining: 53.8s
1:	learn: 0.6694947	total: 92.2ms	remaining: 46s
2:	learn: 0.6584197	total: 130ms	remaining: 43.2s
3:	learn: 0.6454368	total: 145ms	remaining: 36.1s
4:	learn: 0.6328587	total: 164ms	remaining: 32.6s
5:	learn: 0.6199022	total: 192ms	remaining: 31.7s
6:	learn: 0.6098288	total: 225ms	remaining: 31.9s
7:	learn: 0.5978907	total: 248ms	remaining: 30.8s
8:	learn: 0.5873430	total: 274ms	remaining: 30.2s
9:	learn: 0.5755447	total: 299ms	remaining: 29.6s
10:	learn: 0.5658440	total: 335ms	remaining: 30.1s
11:	learn: 0.5569551	total: 358ms	remaining: 29.5s
12:	learn: 0.5482539	total: 389ms	remaining: 29.5s
13:	learn: 0.5375569	total: 413ms	remaining: 29.1s
14:	learn: 0.5287734	total: 433ms	remaining: 28.4s
15:	learn: 0.5180884	total: 449ms	remaining: 27.6s
16:	learn: 0.5087489	total: 474ms	remaining: 27.4s
17:	learn: 0.4984659	total: 493ms	remaining: 26.9s
18:	learn: 0.4897391	total: 524ms	remaining: 27.1s
19:	learn: 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Learning rate set to 0.007741
0:	learn: 0.6795514	total: 46.3ms	remaining: 46.2s
1:	learn: 0.6665424	total: 111ms	remaining: 55.2s
2:	learn: 0.6540544	total: 138ms	remaining: 45.9s
3:	learn: 0.6410936	total: 172ms	remaining: 42.8s
4:	learn: 0.6296282	total: 184ms	remaining: 36.7s
5:	learn: 0.6181661	total: 228ms	remaining: 37.8s
6:	learn: 0.6075235	total: 247ms	remaining: 35s
7:	learn: 0.5955156	total: 264ms	remaining: 32.7s
8:	learn: 0.5863630	total: 296ms	remaining: 32.5s
9:	learn: 0.5755633	total: 315ms	remaining: 31.2s
10:	learn: 0.5660503	total: 350ms	remaining: 31.4s
11:	learn: 0.5572107	total: 393ms	remaining: 32.3s
12:	learn: 0.5490390	total: 407ms	remaining: 30.9s
13:	learn: 0.5385579	total: 451ms	remaining: 31.8s
14:	learn: 0.5291444	total: 480ms	remaining: 31.5s
15:	learn: 0.5197806	total: 525ms	remaining: 32.3s
16:	learn: 0.5106571	total: 555ms	remaining: 32.1s
17:	learn: 0.5009318	total: 600ms	remaining: 32.7s
18:	learn: 0.4929708	total: 613ms	remaining: 31.6s
19:	learn: 0