In [3]:
%load_ext autoreload
%autoreload 2
import os
import pickle as pkl
from functools import partial
from os.path import join as oj

import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
import sklearn as sk

import imodels
from imodels.util import data_util
from imodels.discretization import discretizer, simple

import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rcParams['figure.dpi'] = 250

# change working directory to project root
# The target directory is resolved first and chdir is called once, so a failed
# search leaves the current working directory untouched.
_start_dir = os.getcwd()
_project_root = _start_dir
while os.path.basename(_project_root) != 'imodels-experiments':
    _parent_dir = os.path.dirname(_project_root)
    if _parent_dir == _project_root:
        # Reached the filesystem root without a match. The previous
        # `os.chdir('..')` loop would spin forever in this situation.
        raise FileNotFoundError(
            f"Could not find an 'imodels-experiments' directory at or above {_start_dir!r}"
        )
    _project_root = _parent_dir
os.chdir(_project_root)

import viz
from local_models.stable import StableLinearClassifier as stbl_local
# from experiments.util import get_comparison_result

np.random.seed(0)

In [4]:
# Load the cleaned dataset: feature matrix, target vector and the feature names.
dataset_file = 'csi_all.csv'
X, y, feature_names = data_util.get_clean_dataset(
    dataset_file,
    data_source='imodels',
)

In [27]:
# Discretizer over the first seven feature columns, two uniform-width bins each.
# It is only referenced by the commented-out transform in the split cell.
cols_to_discretize = feature_names[:7]
eb_disc = discretizer.ExtraBasicDiscretizer(
    dcols=cols_to_discretize,
    n_bins=2,
    strategy='uniform',
)

In [6]:
# Optional discretization step (disabled): uncomment to replace X / feature_names
# with the binned version produced by `eb_disc` before splitting.
# X_disc_df = eb_disc.fit_transform(pd.DataFrame(X, columns=feature_names))
# X, feature_names = X_disc_df.values.astype(int), X_disc_df.columns.values

# Hold out a test split; the fixed seed keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(
    X,
    y,
    random_state=0,
)

## try out corels

In [8]:
# Fit an optimal rule list (CORELS) on the training split.
# Feature names are handed over as a plain Python list.
feature_name_list = feature_names.tolist()
corels = imodels.OptimalRuleListClassifier(
    c=0.01,
    n_iter=100000,
)
corels.fit(X_train, y_train, feature_names=feature_name_list)

In [9]:
# Score CORELS on the held-out split. Printed in order:
#   1. accuracy of the hard predictions
#   2. ROC AUC of column 1 of predict_proba
#   3. average precision of the same scores
print(sk.metrics.accuracy_score(y_test, corels.predict(X_test)))
# predict_proba is evaluated once and reused for both ranking metrics
# (it used to be recomputed for each print).
corels_test_scores = corels.predict_proba(X_test)[:, 1]
print(sk.metrics.roc_auc_score(y_test, corels_test_scores))
print(sk.metrics.average_precision_score(y_test, corels_test_scores))

0.5910563836681789
0.5877727823802789
0.5295720756947329


In [10]:
# Display the fitted rule list.
# NOTE(review): the recorded output lists features such as `priors_count_...` and
# `race:...`, while the RuleFit cells below show rules over `HighriskDiving`,
# `Torticollis2`, etc. — this output may come from a different dataset/kernel
# session; re-run to confirm.
corels.rl()

RULELIST:
if [age_cat:25_-_45 && not race:African-American]:
  prediction = False
else if [sex:Male && not age_cat:Greater_than_45]:
  prediction = True
else 
  prediction = False
All features: (['age_57.0_to_96.0', 'priors_count_19.0_to_38.0', 'days_b_screening_arrest_0.0_to_30.0', 'c_jail_time_399.0_to_799.0', 'juv_fel_count_10.0_to_20.0', 'juv_other_count_4.5_to_9.0', 'juv_misd_count_6.5_to_13.0', 'c_charge_degree:F', 'c_charge_degree:M', 'race:African-American', 'race:Asian', 'race:Caucasian', 'race:Hispanic', 'race:Native_American', 'race:Other', 'age_cat:25_-_45', 'age_cat:Greater_than_45', 'age_cat:Less_than_25', 'sex:Female', 'sex:Male'])

## try out gosdt

In [11]:
# X, y, feature_names = data_util.get_clean_dataset('credit_card_clean.csv', data_source='imodels')
# dataframe = pd.DataFrame(np.concatenate((X, np.expand_dims(y, axis=1)), axis=1), columns=list(feature_names) + ['target'])

# X = dataframe.loc[:1000, dataframe.columns[:5]]
# y = dataframe.loc[:1000, dataframe.columns[-1:]]
# X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, random_state=0)

In [12]:
# gosdt = imodels.OptimalTreeClassifier(
#     regularization=0.002,
#     time_limit=30)

# gosdt.fit(X_train, y_train)
# print("Execution Time: {}".format(gosdt.time_))

In [13]:
# prediction = gosdt.predict(X_test)
# # accuracy = sk.metrics.accuracy_score(y_test, prediction)
# # print("Test Accuracy: {}".format(accuracy))
# print(sk.metrics.accuracy_score(y_test, gosdt.predict(X_test)))
# print(sk.metrics.roc_auc_score(y_test, gosdt.predict_proba(X_test)[:, 1]))
# print(sk.metrics.average_precision_score(y_test, gosdt.predict_proba(X_test)[:, 1]))
# print(gosdt.tree_)

## rulefit benchmark

In [5]:
# RuleFit baseline. Requires X_train / y_train from the train/test split cell:
# the recorded NameError below comes from running this cell (In [5]) before the
# split cell (In [6]).
rulefit_params = dict(
    max_rules=25,
    n_estimators=1000,
    include_linear=True,
    random_state=0,
    cv=True,
)
rulefit = imodels.RuleFitClassifier(**rulefit_params)
rulefit.fit(X_train, y_train, feature_names=feature_names)

NameError: name 'X_train' is not defined

In [55]:
# Score RuleFit on the held-out split: ROC AUC, then average precision, both on
# column 1 of predict_proba. The accuracy line is intentionally left disabled.
# print(sk.metrics.accuracy_score(y_test, rulefit.predict(X_test)))
# predict_proba is evaluated once and reused (it used to be computed twice).
rulefit_test_scores = rulefit.predict_proba(X_test)[:, 1]
print(sk.metrics.roc_auc_score(y_test, rulefit_test_scores))
print(sk.metrics.average_precision_score(y_test, rulefit_test_scores))

0.8064079116710696
0.46382688968672225


In [58]:
# Show the fitted RuleFit terms as a table (displayed columns: `rule`, `coef`).
rulefit.visualize()

Unnamed: 0,rule,coef
49,Predisposed <= 0.5 and HighriskDiving <= 0.5 and MedsRecd2 <= 0.5,-0.16
41,FocalNeuroFindings2 <= 0.5 and Torticollis2 <= 0.5 and HighriskDiving <= 0.5,-0.15
48,FocalNeuroFindings2 <= 0.5 and Torticollis2 <= 0.5 and HighriskDiving <= 0.5,-0.42
52,subinj_TorsoTrunk2 <= 0.5 and MedsRecd2 <= 0.5,-0.11
51,subinj_TorsoTrunk2 <= 0.5 and MedsRecd2 <= 0.5,-0.09
53,axialloadtop <= 0.5 and Torticollis2 <= 0.5 and MedsRecd2 <= 0.5,-0.03
46,FocalNeuroFindings2 <= 0.5 and Torticollis2 <= 0.5 and subinj_TorsoTrunk2 <= 0.5,-0.07
40,axialloadtop <= 0.5 and FocalNeuroFindings2 <= 0.5 and HighriskDiving <= 0.5 and HighriskMVC <= 0.5,-0.39
38,FocalNeuroFindings2 <= 0.5 and subinj_TorsoTrunk2 <= 0.5 and MedsRecd2 <= 0.5,-0.1
37,Torticollis2 <= 0.5 and MedsRecd2 <= 0.5 and HighriskMVC <= 0.5,-0.03


In [33]:
# Display the fitted model's `complexity_` attribute.
rulefit.complexity_

18

## BRS benchmark

In [248]:
# Boosted rule set with a single estimator; each base learner is a depth-1
# decision tree built through a partially-applied constructor.
stump_factory = partial(sk.tree.DecisionTreeClassifier, max_depth=1)
brs = imodels.BoostedRulesClassifier(
    n_estimators=1,
    estimator=stump_factory,
)
brs.fit(X_train, y_train, feature_names=feature_names)

BoostedRulesClassifier(n_estimators=1)

In [249]:
# Display the rules extracted by the boosted rule set.
# NOTE(review): the recorded output splits on `race:African-American`, a feature
# that does not appear in the RuleFit / stable-rules outputs of this notebook —
# the output may be stale; re-run to confirm.
brs.rules_

[race:African-American <= 0.5, race:African-American > 0.5]

## saps

In [29]:
# Sapling-sum classifier capped at five rules, fitted on the training split.
saps_max_rules = 5
saps = imodels.SaplingSumClassifier(max_rules=saps_max_rules)
saps.fit(
    X_train,
    y_train,
    feature_names=feature_names,
)

SaplingSumClassifier(max_rules=5)

## stablerules

In [202]:
from sklearn.linear_model._coordinate_descent import _alpha_grid

In [13]:
def _load_comparison_df(submodel):
    """Return the 'df' entry of the cached comparison pickle for `submodel`.

    Reads `results/rulevetting/csi/cv/<submodel>_comparisons.pkl`. The file is
    opened in a `with` block so the handle is closed right after loading (the
    previous bare `open()` inside the comprehension never closed it).
    """
    results_path = oj('results/rulevetting/csi/cv', f'{submodel}_comparisons.pkl')
    # NOTE: unpickling can execute arbitrary code; only load result files that
    # were produced locally by the experiment scripts.
    with open(results_path, 'rb') as results_file:
        return pkl.load(results_file)['df']


# One comparison DataFrame per rule-based submodel, in the order listed here.
submodel_dfs = [
    _load_comparison_df(submodel)
    for submodel in ['rulefit', 'skope_rules', 'brs']]

In [16]:
# Stable linear classifier: configure it, attach the candidate rules from the
# cached submodel comparisons (fold-0 columns), then fit on the training split.
stbl_params = dict(
    max_complexity_brs=10,
    max_complexity_rulefit=10,
    max_complexity_skope_rules=10,
    metric='avg_precision',
    include_linear=True,
    max_rules=15,
    cv=False,
    p_filtering=None,
    random_state=0,
)
stbl = stbl_local(**stbl_params)
stbl.set_rules(submodel_dfs, '_fold_0')
stbl.fit(X_train, y_train, feature_names=feature_names)

StableLinearClassifier(cv=False, include_linear=True, max_complexity_brs=10,
                       max_complexity_rulefit=10, max_complexity_skope_rules=10,
                       max_rules=15, metric='avg_precision', random_state=0)

In [17]:
# Score the stable classifier on the held-out split: ROC AUC, then average
# precision, both on column 1 of predict_proba.
# predict_proba is evaluated once and reused (it used to be computed twice).
stbl_test_scores = stbl.predict_proba(X_test)[:, 1]
print(sk.metrics.roc_auc_score(y_test, stbl_test_scores))
print(sk.metrics.average_precision_score(y_test, stbl_test_scores))

0.8056007134954503
0.4587328404609205


In [18]:
# Show the fitted stable-classifier terms as a table (displayed columns: `rule`, `coef`).
stbl.visualize()

Unnamed: 0,rule,coef
7,HighriskMVC,0.47
12,AlteredMentalStatus2,0.49
13,FocalNeuroFindings2,0.57
14,PainNeck2,0.83
18,subinj_Head2,0.06
21,subinj_TorsoTrunk2,0.39
22,is_ems,-0.74
27,Position_S,-0.07
29,Immobilization2,0.55
30,MedsRecd2,0.37


## Running experiments 

In [19]:
!python 01_run_comparisons.py --config rulevetting --dataset csi --splitting_strategy cv --model brs --ignore_cache
!python 01_run_comparisons.py --config rulevetting --dataset csi --splitting_strategy cv --model grl --ignore_cache
!python 01_run_comparisons.py --config rulevetting --dataset csi --splitting_strategy cv --model random_forest --ignore_cache
!python 01_run_comparisons.py --config rulevetting --dataset csi --splitting_strategy cv --model gradient_boosting --ignore_cache
!python 01_run_comparisons.py --config rulevetting --dataset csi --splitting_strategy cv --model saps --ignore_cache
!python 01_run_comparisons.py --config rulevetting --dataset csi --splitting_strategy cv --model cart --ignore_cache

dset csi ['csi']
running datasets ['csi'] estimators [[brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs]]
saving to /Users/keyan/bair/imodels-experiments/results
  0%|                                                     | 0/1 [00:00<?, ?it/s]	dataset csi ests [brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs]

  0%|                                                    | 0/36 [00:00<?, ?it/s][A
  3%|█▏                                          | 1/36 [00:00<00:05,  5.85it/s][A
  6%|██▍                                         | 2/36 [00:00<00:06,  5.22it/s][A
  8%|███▋                                        | 3/36 [00:00<00:07,  4.54it/s][A
 11%|████▉                                       | 4/36 [00:00<00:07,  4.57it/s][A
 14%|███

In [3]:
# Load the cached CART comparison results and inspect one row.
# The frame keeps the name `saps_df` because later cells reference it, even
# though the file read here is `cart_comparisons.pkl`.
fname = oj('results', 'rulevetting', 'csi', 'cv/cart_comparisons.pkl')
# NOTE: unpickling can execute arbitrary code; only load locally produced results.
# The `with` block closes the file handle (the previous bare open() leaked it).
with open(fname, 'rb') as results_file:
    saps_df = pkl.load(results_file)['df']
saps_df.iloc[15]
#.loc[:, ['spec' in col or 'id' in col or 'n_est' in col for col in saps_df.columns]]
# df = saps_df.loc[:, ['spec' in col or 'vars_train' in col for col in saps_df.columns]]
# df[df['best_spec_0.9_sens_cv_mean'] > 0.3]['csi_vars_train']
# df

max_leaf_nodes                                                                 4
other_kwargs                   {'class_weight': {0: 1, 1: 100}, 'criterion': ...
curve_id                                          class_weight_100_criterion_ent
rocauc_fold_0                                                           0.748979
vars_fold_0                    {'criterion': 'entropy', 'splitter': 'best', '...
complexity_fold_0                                                              3
time_fold_0                                                             0.002704
accuracy_fold_0                                                          0.14786
f1_fold_0                                                               0.257627
recall_fold_0                                                                1.0
precision_fold_0                                                         0.14786
avg_precision_fold_0                                                    0.345737
best_accuracy_fold_0        

In [16]:
# Row position with the second-largest 'best_spec_0.95_sens_cv_mean'
# (ascending argsort, second entry from the end).
# `.iloc` makes the lookup explicitly positional: plain `[-2]` on a Series is
# label-based and only falls back to positions for some index types.
saps_df['best_spec_0.95_sens_cv_mean'].argsort().iloc[-2]

92

In [64]:
# Keep only the columns whose name contains 'mean' or 'vars_train' ...
keep_column = [('mean' in col) or ('vars_train' in col) for col in saps_df.columns]
df = saps_df.loc[:, keep_column]

# ... then pick the 'csi_vars_train' entries of rows whose mean
# 'best_spec_0.98_sens_cv_mean' exceeds 0.3.
meets_spec_threshold = df['best_spec_0.98_sens_cv_mean'] > 0.3
good_cart_hps = df.loc[meets_spec_threshold, 'csi_vars_train']

In [17]:
from util import get_best_model_rules_under_complexity

In [21]:
# Query the cached RuleFit comparison results; the remaining positional
# arguments are the metric name, the fold suffix and the value 5 (presumably
# the complexity cap, given the helper's name — confirm against `util`).
rulefit_results_path = oj('results', 'rulevetting', 'csi', 'cv/rulefit_comparisons.pkl')
get_best_model_rules_under_complexity(rulefit_results_path, 'best_spec_0.98_sens', 'fold_0', 5)

[FocalNeuroFindings2 <= 0.5 and Torticollis2 <= 0.5,
 FocalNeuroFindings2 <= 0.5 and Torticollis2 > 0.5]