In [57]:
import pandas as pd
import numpy as np 

datasets = ['credit', 'adult']

from sklearn.tree import DecisionTreeClassifier
from interpret.glassbox import ExplainableBoostingClassifier
from interpret.glassbox import DecisionListClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB


clfs = {
    "CART": DecisionTreeClassifier(random_state=1234),
    "EBM": ExplainableBoostingClassifier(),
    "LR_l2": LogisticRegression(penalty="l2",random_state=1234),
    "GNB": GaussianNB(),
    "LR": LogisticRegression(penalty="none", random_state=1234),
    "DL": DecisionListClassifier(random_state=1234) 
}

In [58]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import log_loss
n_datasets = len(datasets)
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, random_state=1234, shuffle=True)
subsample = StratifiedShuffleSplit(n_splits = 8, train_size= 0.5, test_size=0.5, random_state=1234)

variances = np.zeros((n_datasets, n_splits))
biases = np.zeros((n_datasets, n_splits))


In [59]:
from sklearn.base import clone 
from sklearn import metrics
import pandas as pd

import helper
import importlib
importlib.reload(helper)
from sklearn.pipeline import make_pipeline

CLF_NAME = "CART"

for data_id, dataset in enumerate(datasets):
    X=pd.read_csv(f"datasets/cleaned/{dataset}_X.csv")
    X = X.drop("Unnamed: 0", axis=1)
    y = pd.read_csv(f"datasets/cleaned/{dataset}_y.csv")
    y = y.drop("Unnamed: 0", axis=1)

    features_types_df = pd.read_csv(f"datasets/cleaned/datatypes/{dataset}.csv")

    feature_inidices = list(map(int, list(features_types_df)))
    features_names = list(features_types_df.T[0])
    features_types = list(map(int, list(features_types_df.T[1])))

    preprocess = helper.select_preprocessing_for_many_feat(feature_inidices, features_types, features_names)

    subset_results=[]
    subset_acc=[]
    ground_true_labels=[]

    for fold_id, (train, test) in enumerate(skf.split(X, y)):
        subset_results=[]
        subset_acc=[]
        ground_true_labels=[]

        X_train = X.iloc[train]
        X_test = X.iloc[test]

        y_train = y.iloc[train]
        y_test = y.iloc[test]

        for sub_id, (train_idx, test_idx) in enumerate(subsample.split(X_train, y_train)):
            X_sub_train, y_sub_train = X_train.iloc[train_idx], y_train.iloc[train_idx]
            
            clf = clone(clfs[CLF_NAME])
            clf_pipeline = make_pipeline(
                preprocess,
                clf
            )

            clf_pipeline.fit(X_sub_train, y_sub_train)
            clf_preds = clf_pipeline.predict(X_test)
            subset_results.append(clf_preds)
            ground_true_labels.append(y_test)

        variance = np.mean(np.var(np.array(subset_results), axis=0))
        avg_test_y_pred = np.mean(np.array(subset_results), axis=0)
        bias = np.mean((avg_test_y_pred - ground_true_labels) ** 2)

        biases[data_id, fold_id] = bias
        variances[data_id, fold_id] = variance
                

In [60]:
columns_names = ["dataset_name","fold_id", "bias", "variance"]

In [61]:
biases

array([[0.40024535, 0.44331095, 0.41754907, 0.41057692, 0.4185466 ,
        0.41159393, 0.41909393, 0.41214127, 0.39938979, 0.40073595],
       [0.29126763, 0.29027597, 0.28666747, 0.29105858, 0.28512932,
        0.29115119, 0.28860685, 0.29316885, 0.29068337, 0.29600401]])

In [62]:
variances

array([[0.09303977, 0.05255682, 0.07883523, 0.08173077, 0.07716346,
        0.08293269, 0.08774038, 0.078125  , 0.09543269, 0.08870192],
       [0.07824977, 0.07766911, 0.07898879, 0.07662776, 0.08072597,
        0.07846572, 0.07639262, 0.07839853, 0.07704046, 0.08021793]])

In [63]:
results_df = pd.DataFrame(columns=columns_names)
# results_df["fold_id"] = np.arange(10)

In [71]:
pd.DataFrame(variances).T

Unnamed: 0,0,1
0,0.09304,0.07825
1,0.052557,0.077669
2,0.078835,0.078989
3,0.081731,0.076628
4,0.077163,0.080726
5,0.082933,0.078466
6,0.08774,0.076393
7,0.078125,0.078399
8,0.095433,0.07704
9,0.088702,0.080218


In [89]:
temp_dfs = []

In [90]:
for i in range(len(datasets)):
    temp_df = pd.DataFrame(columns=columns_names)
    temp_df["fold_id"] = np.arange(10)
    temp_df["dataset_name"] = datasets[i]
    temp_df["bias"]=biases[i]
    temp_df["variance"]=variances[i]
    temp_dfs.append(temp_df)
    

In [91]:
results = pd.concat(temp_dfs)

In [93]:
results.to_csv(f"./test_results/bias_and_variance/{CLF_NAME}",index=False)

In [92]:
results

Unnamed: 0,dataset_name,fold_id,bias,variance
0,credit,0,0.400245,0.09304
1,credit,1,0.443311,0.052557
2,credit,2,0.417549,0.078835
3,credit,3,0.410577,0.081731
4,credit,4,0.418547,0.077163
5,credit,5,0.411594,0.082933
6,credit,6,0.419094,0.08774
7,credit,7,0.412141,0.078125
8,credit,8,0.39939,0.095433
9,credit,9,0.400736,0.088702
