In [1]:
from sklearn.tree import DecisionTreeClassifier
from interpret.glassbox import ExplainableBoostingClassifier
from interpret.glassbox import DecisionListClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB


clfs = {
    "CART": DecisionTreeClassifier(random_state=1234),
    "EBM": ExplainableBoostingClassifier(),
    "LR_l2": LogisticRegression(penalty="l2",random_state=1234),
    "GNB": GaussianNB(),
    "LR": LogisticRegression(penalty="none", random_state=1234) 
}

In [2]:
datasets = ['breast', 'campus', 'churn', 'climate',
            'compas', 'diabetes', 'german', 'heart',
            'stroke', 'student', 'water', 'credit']

In [3]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
n_datasets = len(datasets)
n_splits = 10
# repeats 5, splits 2
skf = StratifiedKFold(n_splits=n_splits, random_state=1234, shuffle=True)

auc_scores = np.zeros((len(clfs)+1, n_datasets, n_splits))
loss = np.zeros((len(clfs)+1, n_datasets, n_splits))

In [4]:
from sklearn.base import clone 
from sklearn import metrics
import pandas as pd

import helper
import importlib
importlib.reload(helper)
from sklearn.pipeline import make_pipeline

for data_id, dataset in enumerate(datasets):
    X=pd.read_csv(f"datasets/cleaned/{dataset}_X.csv")
    X = X.drop("Unnamed: 0", axis=1)
    y = pd.read_csv(f"datasets/cleaned/{dataset}_y.csv")
    y = y.drop("Unnamed: 0", axis=1)

    features_types_df = pd.read_csv(f"datasets/cleaned/datatypes/{dataset}.csv")

    feature_inidices = list(map(int, list(features_types_df)))
    features_names = list(features_types_df.T[0])
    features_types = list(map(int, list(features_types_df.T[1])))

    preprocess = helper.select_preprocessing_for_many_feat(feature_inidices, features_types, features_names)

    for fold_id, (train, test) in enumerate(skf.split(X, y)):
        for clf_id, clf_name in enumerate(clfs):
            clf = clone(clfs[clf_name])
            clf_pipeline = make_pipeline(
                preprocess,
                clf
            )
                
            clf_pipeline.fit(X.iloc[train], y.iloc[train])
            y_preds = clf_pipeline.predict(X.iloc[test])
            fpr, tpr, thresholds = metrics.roc_curve(y.iloc[test], y_preds)
            auc_scores[clf_id, data_id, fold_id] = metrics.auc(fpr, tpr)
            loss[clf_id, data_id, fold_id] = log_loss(y.iloc[test], y_preds)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-r

In [5]:
from sklearn.base import clone 
from sklearn import metrics
import pandas as pd

import helper
import importlib
importlib.reload(helper)
from sklearn.pipeline import make_pipeline

for data_id, dataset in enumerate(datasets):
    X=pd.read_csv(f"datasets/cleaned/{dataset}_X.csv")
    X = X.drop("Unnamed: 0", axis=1)
    y = pd.read_csv(f"datasets/cleaned/{dataset}_y.csv")
    y = y.drop("Unnamed: 0", axis=1)

    features_types_df = pd.read_csv(f"datasets/cleaned/datatypes/{dataset}.csv")

    feature_inidices = list(map(int, list(features_types_df)))
    features_names = list(features_types_df.T[0])
    features_types = list(map(int, list(features_types_df.T[1])))

    preprocess = helper.select_preprocessing_for_many_feat(feature_inidices, features_types, features_names)

    for fold_id, (train, test) in enumerate(skf.split(X, y)):

        clf_pipeline = make_pipeline(
                preprocess,
                DecisionListClassifier(random_state=1234)
            )
                
        clf_pipeline.fit(X.iloc[train], y.iloc[train])
        y_preds = clf_pipeline.predict(X.iloc[test])
        fpr, tpr, thresholds = metrics.roc_curve(y.iloc[test], y_preds)
        auc_scores[5, data_id, fold_id] = metrics.auc(fpr, tpr)
        loss[5, data_id, fold_id] = log_loss(y.iloc[test], y_preds)
        

In [6]:
auc_scores

array([[[0.91168831, 0.91168831, 0.90079365, 0.92063492, 0.88690476,
         0.91071429, 0.91071429, 0.91468254, 0.9484127 , 0.94285714],
        [0.54285714, 0.68571429, 0.86190476, 0.79047619, 0.75238095,
         0.96666667, 0.73333333, 0.81666667, 0.96428571, 0.78571429],
        [0.691275  , 0.67292344, 0.70296335, 0.68090452, 0.69384915,
         0.67493103, 0.70365307, 0.70448294, 0.67155775, 0.70408737],
        [0.77959184, 0.57959184, 0.66938776, 0.64897959, 0.58979592,
         0.48979592, 0.71      , 0.72      , 0.98      , 0.71      ],
        [0.56496119, 0.55490842, 0.52799405, 0.54740669, 0.54061713,
         0.54743531, 0.5655885 , 0.58026677, 0.59086329, 0.57845775],
        [0.75185185, 0.63777778, 0.72777778, 0.67777778, 0.69333333,
         0.75037037, 0.66925926, 0.61777778, 0.55153846, 0.70769231],
        [0.71904762, 0.60238095, 0.63333333, 0.6047619 , 0.65238095,
         0.66904762, 0.53571429, 0.66190476, 0.5047619 , 0.61521739],
        [0.63333333, 0.7571

In [7]:
auc_scores[0,:, 0]

array([0.91168831, 0.54285714, 0.691275  , 0.77959184, 0.56496119,
       0.75185185, 0.71904762, 0.63333333, 0.62264438, 0.56818182,
       0.53875   , 0.75      ])

In [8]:
auc_scores[1,:, 0]

array([0.95454545, 0.64761905, 0.72801508, 0.7       , 0.54445673,
       0.75481481, 0.63571429, 0.8247619 , 0.5       , 0.71464646,
       0.60046875, 0.90555556])

In [9]:
auc_scores[2,:, 0]

array([0.97727273, 0.75238095, 0.58340723, 0.9       , 0.54518048,
       0.76481481, 0.65238095, 0.81714286, 0.49893617, 0.71969697,
       0.5       , 0.89722222])

In [10]:
auc_scores[3,:, 0]

array([0.93181818, 0.71904762, 0.65656715, 0.8       , 0.60326316,
       0.7262963 , 0.6452381 , 0.78095238, 0.64812563, 0.6540404 ,
       0.55109375, 0.63611111])

In [11]:
auc_scores[4,:, 0]

array([0.94025974, 0.79047619, 0.58830919, 0.9       , 0.54518048,
       0.75481481, 0.66904762, 0.81714286, 0.49893617, 0.71969697,
       0.5       , 0.89444444])

In [12]:
np.save('./test_results/auc/auc_results', auc_scores)
np.save('./test_results/auc/auc_losses', loss)