In [1]:
%load_ext autoreload
%autoreload 2
import os
import pickle as pkl
from functools import partial
from os.path import join as oj

import numpy as np
import pandas as pd

# Raise pandas' display limits so frames with up to 500 rows / 50 columns are shown untruncated.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics, model_selection

import imodels
from imodels.util import data_util
from imodels.discretization import discretizer, simple

import matplotlib as mpl
import matplotlib.pyplot as plt

# Render all matplotlib figures at 250 dpi.
mpl.rcParams['figure.dpi'] = 250

# Change the working directory to the project root by walking up the directory
# tree until the current folder is named 'imodels-experiments'.
while os.path.basename(os.getcwd()) != 'imodels-experiments':
    current_dir = os.getcwd()
    parent_dir = os.path.dirname(current_dir)
    if parent_dir == current_dir:
        # Reached the filesystem root: going up again would be a no-op, so the
        # loop would otherwise spin forever.
        raise FileNotFoundError(
            "Could not find an 'imodels-experiments' directory above the notebook's working directory")
    os.chdir(parent_dir)

import viz
# from local_models.stable import StableLinearClassifier as stbl_local
# from experiments.util import get_comparison_result

# Seed NumPy's global RNG so a fresh top-to-bottom run draws the same random numbers.
np.random.seed(0)

In [2]:
class TransferTree:
    """Collection of per-subgroup trees trained on the same data with different sample weights.

    Every tree is fit on the full training set; ``sample_weights[i]`` decides how
    much each training row counts for ``trees[i]``. At prediction time the caller
    supplies one row selector per tree, and each tree only scores the rows
    selected for it.

    Parameters
    ----------
    trees : sequence of estimators
        Objects exposing ``fit(X, y, sample_weight=...)``, ``predict(X)`` and
        ``predict_proba(X)``.
    sample_weights : sequence of array-like
        One per-row weight vector per tree, paired with ``trees`` by position.
    """

    def __init__(self, trees, sample_weights):
        self.trees = trees
        self.sample_weights = sample_weights

    def fit(self, X, y):
        """Fit each tree on ``(X, y)`` using its own sample weights; returns ``self``."""
        for tree, weight in zip(self.trees, self.sample_weights):
            # Pass the weights by keyword so they cannot land in a different
            # positional parameter of the estimator's ``fit``.
            tree.fit(X, y, sample_weight=weight)
        return self

    def _nonempty_groups(self, X, subgroups):
        """Yield ``(tree, rows, X_rows)`` for every subgroup that selects at least one row."""
        for tree, subgroup in zip(self.trees, subgroups):
            rows = np.asarray(subgroup)
            if rows.size == 0:
                continue
            X_rows = X[rows]
            if len(X_rows) == 0:
                # An all-False mask: nothing to score, and estimators typically
                # reject zero-row input.
                continue
            yield tree, rows, X_rows

    def predict(self, X, subgroups):
        """Predict labels, routing each row to the tree of the subgroup that selects it.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
        subgroups : sequence of row selectors (boolean masks or index arrays)
            ``subgroups[i]`` picks the rows of ``X`` scored by ``trees[i]``.

        Returns
        -------
        ndarray of shape (n_samples,)
            Rows not selected by any subgroup are left at 0.
        """
        preds = np.zeros(X.shape[0])
        for tree, rows, X_rows in self._nonempty_groups(X, subgroups):
            preds[rows] = tree.predict(X_rows)
        return preds

    def predict_proba(self, X, subgroups):
        """Predict class probabilities, routing each row to its subgroup's tree.

        The number of columns is taken from the first tree that scores any rows,
        so problems with more than two classes are supported. If no subgroup
        selects any row, an all-zero ``(n_samples, 2)`` array is returned.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
        subgroups : sequence of row selectors (boolean masks or index arrays)
            ``subgroups[i]`` picks the rows of ``X`` scored by ``trees[i]``.

        Returns
        -------
        ndarray of shape (n_samples, n_classes)
            Rows not selected by any subgroup are left at 0.
        """
        preds_proba = None
        for tree, rows, X_rows in self._nonempty_groups(X, subgroups):
            proba = np.asarray(tree.predict_proba(X_rows))
            if preds_proba is None:
                preds_proba = np.zeros((X.shape[0], proba.shape[1]))
            preds_proba[rows] = proba
        if preds_proba is None:
            preds_proba = np.zeros((X.shape[0], 2))
        return preds_proba


def print_results(model, X_test, y_test, test_subgroups):
    """Print average precision (APC) and ROC AUC overall and for each subgroup.

    Parameters
    ----------
    model : object
        Must expose ``predict_proba(X, subgroups)`` and a ``trees`` sequence whose
        i-th element scores the rows selected by ``test_subgroups[i]``.
    X_test : ndarray of shape (n_samples, n_features)
    y_test : ndarray of shape (n_samples,)
    test_subgroups : sequence of row selectors
        One boolean mask (or index array) per tree, in the same order as
        ``model.trees``.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # Score the whole test set once and reuse it for both overall metrics.
    overall_scores = model.predict_proba(X_test, test_subgroups)[:, 1]
    print('APC: ', metrics.average_precision_score(y_test, overall_scores))
    print('AUC: ', metrics.roc_auc_score(y_test, overall_scores))

    # Each group is evaluated with its own selector and its own tree, for
    # however many groups the model has.
    for group_idx, (tree, subgroup) in enumerate(zip(model.trees, test_subgroups)):
        y_group = y_test[subgroup]
        group_scores = tree.predict_proba(X_test[subgroup])[:, 1]
        print(f'group {group_idx} APC: ', metrics.average_precision_score(y_group, group_scores))
        print(f'group {group_idx} AUC: ', metrics.roc_auc_score(y_group, group_scores))


In [3]:
# Load 'csi_with_meta_keys.csv' from the 'imodels' data source; the helper returns
# the feature matrix, the labels and the feature (column) names.
X, y, feature_names = data_util.get_clean_dataset('csi_with_meta_keys.csv', data_source='imodels')

In [4]:
# Wrap the feature matrix in a DataFrame so columns can be addressed by name.
X_df = pd.DataFrame(X, columns=feature_names)

In [5]:
# Age threshold (AgeInYears) used to separate group 0 (<= cutoff) from group 1 (> cutoff).
cutoff = 4
# NOTE(review): this rebinds the builtin `max` for the rest of the notebook, so any
# later `max(...)` call would fail. Prefer a name such as `max_age`, renamed together
# with every cell that reads this variable.
max = X_df['AgeInYears'].max()

In [6]:
# Boolean mask: True for rows whose age is strictly above the cutoff (group 1).
is_group_1 = X_df['AgeInYears'] > cutoff

In [7]:
# Piecewise-linear group-1 weight as a function of age:
#   age in [0, cutoff]        -> rises linearly from 0 to 0.5
#   age in (cutoff, max age]  -> rises linearly from 0.5 to 1
age = X_df['AgeInYears']
# Computed here instead of reading the notebook-level `max` variable, which
# shadows the builtin and makes this cell depend on hidden state.
oldest_age = age.max()
p_group_1 = 0.5 / cutoff * age
p_group_1[is_group_1] = 0.5 / (oldest_age - cutoff) * (X_df.loc[is_group_1, 'AgeInYears'] - cutoff) + 0.5

In [8]:
# Drop the 'SITE' and 'AgeInYears' columns from the model features; age has already
# been used above to define the groups and their weights.
X_df_clean = X_df.drop(columns=['SITE', 'AgeInYears'])

In [9]:
# From here on X / feature_names refer to the reduced feature set.
X = X_df_clean.values
feature_names = X_df_clean.columns.values

# A single call splits features, labels, group mask and group weights together,
# so the four train (and test) pieces stay row-aligned.
(
    X_train, X_test,
    y_train, y_test,
    is_group_1_train, is_group_1_test,
    p_group_1_train, p_group_1_test,
) = model_selection.train_test_split(X, y, is_group_1, p_group_1, random_state=0)

### plain cart

In [10]:
# Baseline: one tree with at most 8 leaves, trained on all rows with equal weight.
cart = DecisionTreeClassifier(max_leaf_nodes=8)
cart.fit(X_train, y_train)

# Overall metrics on the positive-class score.
cart_scores = cart.predict_proba(X_test)[:, 1]
print('APC: ', metrics.average_precision_score(y_test, cart_scores))
print('AUC: ', metrics.roc_auc_score(y_test, cart_scores))

# Same metrics restricted to each age group (0: age <= cutoff, 1: age > cutoff).
for group_idx, group_mask in enumerate([~is_group_1_test, is_group_1_test]):
    y_group = y_test[group_mask]
    group_scores = cart.predict_proba(X_test[group_mask])[:, 1]
    print(f'group {group_idx} APC: ', metrics.average_precision_score(y_group, group_scores))
    print(f'group {group_idx} AUC: ', metrics.roc_auc_score(y_group, group_scores))

APC:  0.3426590940640236
AUC:  0.7394838447470027
group 0 APC:  0.4949816154749741
group 0 AUC:  0.8115828092243187
group 1 APC:  0.33472264208711
group 1 AUC:  0.7267743036168497


### two trees, no transfer

In [11]:
# Hard 0/1 weights: rows outside a tree's own age group get weight 0, so no
# information is shared between the two trees.
weights_group_0 = (~is_group_1_train).astype(int)
weights_group_1 = is_group_1_train.astype(int)

subcart_no_transfer = TransferTree(
    [DecisionTreeClassifier(max_leaf_nodes=8) for _ in range(2)],
    [weights_group_0, weights_group_1],
)
subcart_no_transfer.fit(X_train, y_train)

# Test-time routing: rows at or below the cutoff go to tree 0, the rest to tree 1.
test_subgroups = [~is_group_1_test, is_group_1_test]
print_results(subcart_no_transfer, X_test, y_test, test_subgroups)

APC:  0.3592480198628731
AUC:  0.7604597078281289
group 0 APC:  0.4500684676206498
group 0 AUC:  0.7971698113207547
group 1 APC:  0.3452409552595699
group 1 AUC:  0.7580572616167256


In [None]:
# Draw the tree trained with the group-0 weights (age <= cutoff).
plot_tree(subcart_no_transfer.trees[0], feature_names=feature_names)

In [None]:
# Draw the tree trained with the group-1 weights (age > cutoff).
plot_tree(subcart_no_transfer.trees[1], feature_names=feature_names)

### linear transfer

In [13]:
# Two-way transfer: both trees train on every row. Tree 1 weights a row by the
# piecewise-linear age weight p_group_1_train, tree 0 by its complement.
linear_weights = [1 - p_group_1_train, p_group_1_train]
linear_trees = [DecisionTreeClassifier(max_leaf_nodes=8) for _ in range(2)]

transfer = TransferTree(linear_trees, linear_weights)
transfer.fit(X_train, y_train)

print_results(transfer, X_test, y_test, test_subgroups)

APC:  0.3593844150296934
AUC:  0.7524498182392919
group 0 APC:  0.5026392716146038
group 0 AUC:  0.8930817610062893
group 1 APC:  0.33135741102922966
group 1 AUC:  0.720570444816676


### linear one-way transfer (higher -> lower)

In [14]:
# One-way transfer: tree 0 still borrows from older rows through the soft linear
# weights, while tree 1 is trained with hard 0/1 weights on its own group only.
tree_0_weights = 1 - p_group_1_train
tree_1_weights = is_group_1_train.astype(int)

transfer = TransferTree(
    [DecisionTreeClassifier(max_leaf_nodes=8) for _ in range(2)],
    [tree_0_weights, tree_1_weights],
)
transfer.fit(X_train, y_train)

print_results(transfer, X_test, y_test, test_subgroups)

APC:  0.3709575706959853
AUC:  0.7814412156517421
group 0 APC:  0.5026392716146038
group 0 AUC:  0.8930817610062893
group 1 APC:  0.3452409552595699
group 1 AUC:  0.7580572616167256


### sigmoidal transfer

In [15]:
# Logistic group-1 weight centred on the age cutoff: exactly 0.5 at `cutoff`,
# tending to 0 for younger rows and to 1 for older rows. Using `cutoff` (rather
# than repeating its value) keeps this curve in sync with the group definition.
p_group_1_sig = 1 / (1 + np.exp(-(X_df['AgeInYears'] - cutoff)))
# NOTE(review): this relies on random_state=0 reproducing the same row shuffle as
# the main train/test split, so the weights line up with X_train — confirm if the
# main split's arguments ever change.
p_group_1_sig_train, p_group_1_sig_test = model_selection.train_test_split(p_group_1_sig, random_state=0)

In [16]:
# Two-way transfer with the logistic age weights: tree 1 uses p_group_1_sig_train,
# tree 0 uses the complement.
sigmoid_weights = [1 - p_group_1_sig_train, p_group_1_sig_train]
sigmoid_trees = [DecisionTreeClassifier(max_leaf_nodes=8) for _ in range(2)]

transfer = TransferTree(sigmoid_trees, sigmoid_weights)
transfer.fit(X_train, y_train)

print_results(transfer, X_test, y_test, test_subgroups)

APC:  0.36890429583001605
AUC:  0.7509200930253563
group 0 APC:  0.49171896165234175
group 0 AUC:  0.8639937106918238
group 1 APC:  0.33135741102922966
group 1 AUC:  0.720570444816676


### sigmoidal one-way transfer (higher -> lower)

In [17]:
# One-way transfer with logistic weights: tree 0 gets the soft (1 - sigmoid)
# weights, tree 1 is restricted to its own group via hard 0/1 weights.
tree_0_weights = 1 - p_group_1_sig_train
tree_1_weights = is_group_1_train.astype(int)

transfer = TransferTree(
    [DecisionTreeClassifier(max_leaf_nodes=8) for _ in range(2)],
    [tree_0_weights, tree_1_weights],
)
transfer.fit(X_train, y_train)

print_results(transfer, X_test, y_test, test_subgroups)

APC:  0.38108448086858565
AUC:  0.7799002009528325
group 0 APC:  0.49171896165234175
group 0 AUC:  0.8639937106918238
group 1 APC:  0.3452409552595699
group 1 AUC:  0.7580572616167256


### step transfer

In [24]:
# Step weights: 0.8 for rows in group 1 and 0.2 for rows in group 0.
# NOTE(review): this overwrites the sigmoidal weights stored under the same name,
# so re-running the sigmoidal cells after this one would silently use step weights.
in_group_1 = is_group_1_train.astype(bool)
p_group_1_sig_train = (in_group_1 * 0.8) + ((~in_group_1) * 0.2)

In [25]:
# Two-way transfer with the step weights: at this point p_group_1_sig_train holds
# 0.8 / 0.2 per row, so each tree weights its own group 0.8 and the other group 0.2.
step_weights = [1 - p_group_1_sig_train, p_group_1_sig_train]
step_trees = [DecisionTreeClassifier(max_leaf_nodes=8) for _ in range(2)]

transfer = TransferTree(step_trees, step_weights)
transfer.fit(X_train, y_train)

print_results(transfer, X_test, y_test, test_subgroups)

APC:  0.3775666062642785
AUC:  0.7815484657589922
group 0 APC:  0.5635977482119165
group 0 AUC:  0.894916142557652
group 1 APC:  0.3452409552595699
group 1 AUC:  0.7580572616167256


In [233]:
# fpr, tpr, thres = metrics.roc_curve(y_test, subcart_no_transfer.predict_proba(X_test, test_subgroups)[:, 1])

# plt.plot(fpr, tpr)

In [None]:
# remove meta keys, split into train and test and fit. 

# try regular cart on the whole thing, subgroup cart w/o transfer, and subgroup cart with transfer

# try linear vs sigmoidal transfer function

# site split