Fit interpretable models to the training set and test on validation sets.

In [39]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os
import pickle as pkl
from os.path import join as oj
    
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier, plot_tree

import imodels
from rulevetting.api import validation, util as api_util
from rulevetting.projects.csi_pecarn.dataset import Dataset
from rulevetting import DATA_PATH

# Directory holding one pickle per fitted model (written by predict_and_save,
# read back by plot_metrics). Created up front so later writes do not fail.
MODELS_DIR = './models'
os.makedirs(MODELS_DIR, exist_ok=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [40]:
# Load the train / tune / test splits; a single Dataset instance serves both
# the data and the meta-key lookup.
dset = Dataset()
df_train, df_tune, df_test = dset.get_data()
outcome_def = 'outcome'  # output
meta_keys = api_util.get_feat_names_from_base_feats(df_train.columns, dset.get_meta_keys())

# Keep age and site for transfer-trees project
meta_keys.remove('AgeInYears')
meta_keys.remove('SITE')

# X_* still contain the outcome column at this point (it is exported with the features below).
X_train = df_train.drop(columns=meta_keys)
X_tune = df_tune.drop(columns=meta_keys)
X_test = df_test.drop(columns=meta_keys)

# Export all three splits, stacked, as one float32 CSV.
csv_path = oj(DATA_PATH, 'imodels_data/csi_pred.csv')
# to_csv does not create missing parent directories, so make sure the target folder exists
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
pd.concat((X_train, X_tune, X_test)).astype(np.float32).to_csv(csv_path, index=False)

kwargs {'clean_data': {'include_intervention': False, 'fillna': True}, 'preprocess_data': {'unclear_feat_default': 0, 'only_site_data': 2, 'augmented_features': True, 'use_control_type': 'all', 'fillna': True}, 'extract_features': {'drop_negative_columns': False}}


In [None]:
# Split every frame into labels (y_*) and features (X_*).
# Both are derived from the untouched df_* frames rather than by dropping the
# outcome from X_* in place, so re-running this cell does not raise a KeyError
# on the already-removed outcome column.
y_train = df_train[outcome_def].values
X_train = df_train.drop(columns=meta_keys).drop(columns=[outcome_def])
y_tune = df_tune[outcome_def].values
X_tune = df_tune.drop(columns=meta_keys).drop(columns=[outcome_def])
y_test = df_test[outcome_def].values
X_test = df_test.drop(columns=meta_keys).drop(columns=[outcome_def])

# Column names of the feature matrix, passed to the models as feature_names
processed_feats = X_train.keys().values.tolist()
feature_names = processed_feats

def predict_and_save(model, model_name='decision_tree'):
    '''Evaluate a fitted model on the train and tune splits and pickle the results.

    Reads the module-level X_train / y_train / X_tune / y_tune and MODELS_DIR.

    Params
    ------
    model
        Fitted classifier exposing predict_proba; column 1 of its output is used as the score
    model_name: str
        Basename (without extension) of the pickle written into MODELS_DIR

    Returns
    -------
    stats, threshes
        The values returned by validation.all_stats_curve for the last split
        evaluated, i.e. the tune split
    '''
    results = {'model': model}
    for x, y, suffix in zip([X_train, X_tune],
                            [y_train, y_tune],
                            ['_train', '_tune']):
        # only the tune split is plotted
        stats, threshes = validation.all_stats_curve(y, model.predict_proba(x.values)[:, 1],
                                                     plot=suffix == '_tune')
        # store every stat under '<stat><suffix>' so both splits share one dict
        for stat in stats.keys():
            results[stat + suffix] = stats[stat]
        results['threshes' + suffix] = threshes
    # context manager guarantees the file is flushed and closed, even if dump raises
    with open(oj(MODELS_DIR, model_name + '.pkl'), 'wb') as f:
        pkl.dump(results, f)
    return stats, threshes

def simple_report(y_true, y_pred):
    '''Print sklearn's classification report followed by specificity and sensitivity.

    Params
    ------
    y_true
        Ground-truth binary labels (0 / 1)
    y_pred
        Predicted binary labels (0 / 1)
    '''
    print(classification_report(y_true, y_pred))
    # Pin the label order so the matrix is always 2x2; without it a split in which
    # only one class occurs yields a 1x1 matrix and the 4-way unpacking fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    # A class with no true samples has an undefined rate: report nan instead of dividing by zero
    specificity = tn / (tn + fp) if (tn + fp) > 0 else float('nan')
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else float('nan')
    print("Specificity: ", specificity)
    print("Sensitivity: ", sensitivity)

# fit simple models

**decision tree**

In [None]:
# fit decision tree
# random_state makes the fitted tree reproducible across re-runs
dt = DecisionTreeClassifier(max_depth=5, class_weight={0: 1, 1: 1000}, random_state=0)
# Fit on the raw array: predict_and_save scores with x.values, and sklearn warns
# when an estimator fitted on a DataFrame is asked to predict on a plain array.
dt.fit(X_train.values, y_train)
stats, threshes = predict_and_save(dt, model_name='decision_tree')
# plt.xlim((0.8, 1.0))
# plt.ylim((0.5, 1.0))
plt.show()

# Draw the fitted tree; without the plot_tree call this figure would be shown empty.
fig = plt.figure(figsize=(50, 40))
plot_tree(dt, feature_names=feature_names, filled=True)
plt.show()

## Greedy rule list (GRL)

In [None]:
# Greedy rule list; class 1 is weighted 1000x relative to class 0.
class_weight = {0: 1, 1: 1000}
grl = imodels.GreedyRuleListClassifier(
    max_depth=10,
    class_weight=class_weight,
    criterion='neg_corr',
)
grl.fit(X_train, y_train, feature_names=feature_names)
stats, threshes = predict_and_save(grl, model_name='grl')

In [None]:
# Show the rules_ attribute of the fitted greedy rule list
grl.rules_

**rulefit**

In [None]:
# fit a rulefit model
np.random.seed(13)
rulefit = imodels.RuleFitClassifier(alpha=10, max_rules=None, random_state=0, tree_size=4, n_estimators=5, include_linear=True)
rulefit.fit(X_train, y_train, feature_names=feature_names)

# preds = rulefit.predict(X_test)
stats, threshes = predict_and_save(rulefit, model_name='rulefit')

# Disabled helper, kept as comments rather than as a bare string literal: a string
# that is the last expression of a cell gets echoed by Jupyter as the cell's output.
# (`sens` / `spec` are not defined in this notebook's namespace.)
# def print_best(sens, spec):
#     idxs = np.array(sens) > 0.9
#     print(np.array(sens)[idxs], np.array(spec)[idxs])
# print_best(sens, spec)

In [None]:
# pd.reset_option('display.max_colwidth')
# Display the fitted RuleFit model through its visualize() method
rulefit.visualize()

In [None]:
# Show the complexity_ attribute of the fitted RuleFit model
rulefit.complexity_

### original CDR

In [None]:
def baseline_cdr_predict(X, use_2=False):
    '''Predict with the baseline rule: positive if any listed risk-factor column is set.

    Params
    ------
    X: pd.DataFrame
        Must contain the eight risk-factor columns listed below
    use_2: bool
        Accepted for interface compatibility; not used by the body

    Returns
    -------
    preds: np.ndarray
        Integer array with 1 where the row-wise sum of the risk-factor columns is > 0, else 0
    '''
    risk_factor_cols = [
        'AlteredMentalStatus2',
        'FocalNeuroFindings2',
        'PainNeck2',
        'Torticollis2',
        'subinj_TorsoTrunk2',
        'Predisposed',
        'HighriskDiving',
        'HighriskMVC',
    ]
    # accumulate left to right with `+`, exactly like a chained sum of the columns
    num_conditions = X[risk_factor_cols[0]]
    for col in risk_factor_cols[1:]:
        num_conditions = num_conditions + X[col]
    flagged = num_conditions > 0
    return flagged.astype(int).values

In [None]:
# Stack the train and tune splits (features and labels, in the same order)
X_all_train = pd.concat([X_train, X_tune])
y_all_train = np.concatenate([y_train, y_tune])

In [None]:
# Baseline CDR on train + tune combined
simple_report(y_all_train, baseline_cdr_predict(X_all_train))

In [None]:
# Baseline CDR on the train split
simple_report(y_train, baseline_cdr_predict(X_train))

In [None]:
# Baseline CDR on the tune split
simple_report(y_tune, baseline_cdr_predict(X_tune))

In [None]:
# Baseline CDR on the test split
simple_report(y_test, baseline_cdr_predict(X_test))

## skope

In [None]:
# Fit SkopeRules on the train split and store its train / tune stats
skope = imodels.SkopeRulesClassifier(
    precision_min=0.01,
    recall_min=0.1,
    n_estimators=10,
    max_samples=0.8,
    bootstrap=True,
    max_depth=3,
    random_state=0,
)
skope.fit(X_train, y_train)
stats, threshes = predict_and_save(skope, model_name='skope')

In [None]:
# Number of entries in the fitted SkopeRules model's rules_
len(skope.rules_)

In [None]:
# Fit boosted rules on the train split and store its train / tune stats
boost = imodels.BoostedRulesClassifier(n_estimators=10)
boost.fit(X_train, y_train)
stats, threshes = predict_and_save(boost, model_name='boostedrules')

In [None]:
# List every entry of boost.rules_ as a 2-tuple
[(rule, weight) for rule, weight in boost.rules_]

In [None]:
# Fit the sapling-sum model; it is fitted on the raw array with explicit feature names
saps = imodels.SaplingSumClassifier(max_rules=20)
saps.fit(
    X_train.values,
    y_train,
    feature_names=feature_names,
)
stats, threshes = predict_and_save(saps, model_name='saps')

In [None]:
# Print the string representation of the fitted sapling-sum model
print(saps)

In [None]:
# (specificity, sensitivity) at index 1 of the most recent `stats`; predict_and_save
# returns the tune-split stats.
# NOTE(review): which threshold index 1 corresponds to depends on
# validation.all_stats_curve -- confirm before relying on it.
stats['spec'][1], stats['sens'][1]

## neural net

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# random_state makes the stochastic MLP fit reproducible across re-runs
nn = MLPClassifier(hidden_layer_sizes=(100, ), random_state=0)
# Fit on the raw array: predict_and_save scores with x.values, and sklearn warns
# when an estimator fitted on a DataFrame is asked to predict on a plain array.
nn.fit(X_train.values, y_train)
stats, threshes = predict_and_save(nn, model_name='nn')

**CORELS**

In [None]:
# corels = imodels.OptimalRuleListClassifier(
#     c=0.0000001, n_iter=100000, map_type="prefix", policy="dfs", verbosity=[], ablation=0, max_card=2, min_support=0.01, random_state=5)
# corels.fit(X_train, y_train, feature_names=feature_names)
# stats, threshes = predict_and_save(corels, model_name='corels')
# print(corels)

### Stablerules

In [None]:
from imodels.experimental import stablelinear

In [None]:
# Already-fitted rule models handed to StableLinearClassifier as its weak learners
weak_learners = [rulefit, skope, boost]

In [None]:
# Fit the stable linear model on top of the weak learners' rules
stbl = stablelinear.StableLinearClassifier(
    weak_learners=weak_learners,
    max_complexity=-1,
    alpha=0.1,
    max_rules=None,
    penalty='l1',
)
stbl.fit(X_train, y_train)

In [None]:
# Evaluate the stable linear model on train / tune and pickle it as 'stbl.pkl'
stats, threshes = predict_and_save(stbl, model_name='stbl')

In [None]:
# Show the rules_ attribute of the fitted stable linear model
stbl.rules_

In [None]:
# (sensitivity, specificity) at index 1 of the most recent `stats`; predict_and_save
# returns the tune-split stats.
# NOTE(review): which threshold index 1 corresponds to depends on
# validation.all_stats_curve -- confirm before relying on it.
stats['sens'][1], stats['spec'][1]

# look at all the results

In [None]:
def plot_metrics(suffix, title=None, fs=15, data_sizes=None):
    '''Plot sensitivity vs. specificity curves for every model pickled in MODELS_DIR.

    Draws onto the current matplotlib axes. The model named 'rf' is skipped.

    Params
    ------
    suffix: str
        Split suffix of the pickled result keys, e.g. '_train' or '_tune'
    title: str, optional
        Plot title; when None a title is built from the split name
    fs: int
        Font size for the axis labels and an explicit title
    data_sizes: dict, optional
        Maps the split name (suffix without its leading underscore) to a pair of
        values shown in the default title. Only consulted when title is None;
        when both are None the title is just the split name.
    '''
    split = suffix[1:]
    for fname in sorted(os.listdir(MODELS_DIR)):
        model_name, ext = os.path.splitext(fname)
        # only result pickles; match the extension rather than any name containing 'pkl'
        if ext != '.pkl' or model_name == 'rf':
            continue
        # NOTE: unpickling can execute arbitrary code -- MODELS_DIR must only hold
        # files written by predict_and_save.
        with open(oj(MODELS_DIR, fname), 'rb') as f:
            r = pkl.load(f)

        sens = np.array(r['sens' + suffix])
        spec = np.array(r['spec' + suffix])
        plt.plot(100 * sens, 100 * spec, 'o-', label=model_name, alpha=0.6, markersize=3)

        # print best results
        if suffix == '_test2':
            idxs = (sens > 0.95) & (spec > 0.43)
            if np.sum(idxs) > 0:
                idx_max = np.argmax(spec[idxs])
                print(fname, f'{100 * sens[idxs][idx_max]:0.2f} {100 * spec[idxs][idx_max]:0.2f}')

    # axis labels and title do not depend on the model, so set them once
    plt.xlabel('Sensitivity (%)', fontsize=fs)
    plt.ylabel('Specificity (%)', fontsize=fs)
    if title is not None:
        plt.title(title, fontsize=fs)
    elif data_sizes is not None:
        plt.title(f'{split}\n{data_sizes[split][0]} IAI-I / {data_sizes[split][1]}')
    else:
        plt.title(split)

    # Hard-coded reference points for the original CDR on each split.
    # NOTE(review): presumably taken from the simple_report output for the baseline rule -- confirm.
    if suffix == '_train':
        plt.plot(97.79, 31.05, 'o', color='black', label='Original CDR', ms=4)
    if suffix == '_tune':
        plt.plot(96.19, 31.78, 'o', color='black', label='Original CDR', ms=4)
    plt.grid()


# One sensitivity/specificity panel per evaluation split, laid out side by side.
suffixes = ['_train', '_tune']  # _train, _test1, _test2, _cv
titles = ['Train (CSI PECARN)', 'Tune (CSI PECARN)']
R, C = 1, len(suffixes)
fs = 10
plt.figure(dpi=200, figsize=(C * 2.5, R * 3), facecolor='w')
for i, (suffix, panel_title) in enumerate(zip(suffixes, titles)):
    ax = plt.subplot(R, C, i + 1)
    plot_metrics(suffix, title=panel_title, fs=fs)
    if i != 0:
        # only the left-most panel keeps its y-axis label and tick labels
        plt.ylabel('')
        plt.yticks([0, 25, 50, 75, 100], labels=[''] * 5)
    plt.xlim((50, 101))
    plt.ylim((0, 101))
plt.tight_layout()
# the legend is attached to the last panel and anchored outside its right edge
plt.legend(bbox_to_anchor=(1.1, 1), fontsize=fs, frameon=False)
#plt.savefig('figs/metrics_3_splits')
plt.show()