In [1]:
%load_ext autoreload
%autoreload 2
import os
import pickle as pkl
from functools import partial
from os.path import join as oj

import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
import sklearn as sk

import imodels
from imodels.util import data_util
from imodels.discretization import discretizer, simple

import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rcParams['figure.dpi'] = 250

# change working directory to project root
# Walk up one directory at a time. Stop with an error at the filesystem root
# instead of looping forever when the notebook is started outside the repo;
# os.path.basename also handles platform-specific path separators.
while os.path.basename(os.getcwd()) != 'imodels-experiments':
    parent_dir = os.path.dirname(os.getcwd())
    if parent_dir == os.getcwd():
        raise FileNotFoundError(
            "could not find an 'imodels-experiments' directory above the starting working directory")
    os.chdir(parent_dir)

import viz
from local_models.stable import StableLinearClassifier as stbl_local
# from experiments.util import get_comparison_result

np.random.seed(0)

In [2]:
X, y, feature_names = data_util.get_clean_dataset('juvenile_clean.csv', data_source='imodels')

In [113]:
eb_disc = discretizer.ExtraBasicDiscretizer(dcols=feature_names[:12], n_bins=8, strategy='uniform')
X_disc_df = eb_disc.fit_transform(pd.DataFrame(X, columns=feature_names))
X, feature_names = X_disc_df.values.astype(int), X_disc_df.columns.values
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, random_state=0)

In [4]:
# X_disc_df = eb_disc.fit_transform(pd.DataFrame(X, columns=feature_names))
# feature_names

## Try out CORELS (`OptimalRuleListClassifier`)

In [115]:
# Fit an optimal rule list on the training split.
corels = imodels.OptimalRuleListClassifier(
    c=1e-6,
    n_iter=100_000,
)
corels.fit(
    X_train,
    y_train,
    feature_names=feature_names.tolist(),
)

CorelsClassifier ({'c': 1e-06, 'n_iter': 100000, 'map_type': 'prefix', 'policy': 'lower_bound', 'verbosity': [], 'ablation': 0, 'max_card': 2, 'min_support': 0.01})
RULELIST:
if [fr_forced_sex:3 && not drugs_in_comm:1]:
  prediction = False
else if [friends_broken_in_steal:1 && not fr_suggest_agnts_law:2]:
  prediction = True
else 
  prediction = False
All features: (['age_12.0_to_12.625', 'age_12.625_to_13.25', 'age_13.875_to_14.5', 'age_14.5_to_15.125', 'age_15.75_to_16.375', 'age_16.375_to_17.0', '#_in_household_1.0_to_6.25', '#_in_household_6.25_to_11.5', '#_in_household_11.5_to_16.75', '#_in_household_16.75_to_22.0', '#_in_household_22.0_to_27.25', '#_in_household_32.5_to_37.75', '#_in_household_37.75_to_43.0', 'weighting_95_0.19689999520778656_to_0.8735375124961138', 'weighting_95_0.8735375124961138_to_1.550175029784441', 'weighting_95_1.550175029784441_to_2.226812547072768', 'weighting_95_2.226812547072768_to_2.9034500643610954', 'weighting_95_2.9034500643610954_to_3.58008758164

In [116]:
# Score on the test split. Column 1 of predict_proba is computed once and
# reused as the score for both ranking metrics.
corels_test_scores = corels.predict_proba(X_test)[:, 1]
print(sk.metrics.accuracy_score(y_test, corels.predict(X_test)))
print(sk.metrics.roc_auc_score(y_test, corels_test_scores))
print(sk.metrics.average_precision_score(y_test, corels_test_scores))

0.8769230769230769
0.6534243898202032
0.2682044966527725


In [117]:
# Show the rule list stored on the fitted classifier.
corels.rl_

RULELIST:
if [fr_forced_sex:3 && not drugs_in_comm:1]:
  prediction = False
else if [friends_broken_in_steal:1 && not fr_suggest_agnts_law:2]:
  prediction = True
else 
  prediction = False
All features: (['age_12.0_to_12.625', 'age_12.625_to_13.25', 'age_13.875_to_14.5', 'age_14.5_to_15.125', 'age_15.75_to_16.375', 'age_16.375_to_17.0', '#_in_household_1.0_to_6.25', '#_in_household_6.25_to_11.5', '#_in_household_11.5_to_16.75', '#_in_household_16.75_to_22.0', '#_in_household_22.0_to_27.25', '#_in_household_32.5_to_37.75', '#_in_household_37.75_to_43.0', 'weighting_95_0.19689999520778656_to_0.8735375124961138', 'weighting_95_0.8735375124961138_to_1.550175029784441', 'weighting_95_1.550175029784441_to_2.226812547072768', 'weighting_95_2.226812547072768_to_2.9034500643610954', 'weighting_95_2.9034500643610954_to_3.5800875816494226', 'weighting_95_3.5800875816494226_to_4.25672509893775', 'weighting_95_4.25672509893775_to_4.933362616226077', 'weighting_95_4.933362616226077_to_5.61000013351

In [118]:
# Complexity value reported by the fitted CORELS model.
corels.complexity_

5

## try out gosdt

In [11]:
# X, y, feature_names = data_util.get_clean_dataset('credit_card_clean.csv', data_source='imodels')
# dataframe = pd.DataFrame(np.concatenate((X, np.expand_dims(y, axis=1)), axis=1), columns=list(feature_names) + ['target'])

# X = dataframe.loc[:1000, dataframe.columns[:5]]
# y = dataframe.loc[:1000, dataframe.columns[-1:]]
# X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, random_state=0)

In [12]:
# gosdt = imodels.OptimalTreeClassifier(
#     regularization=0.002,
#     time_limit=30)

# gosdt.fit(X_train, y_train)
# print("Execution Time: {}".format(gosdt.time_))

In [13]:
# prediction = gosdt.predict(X_test)
# # accuracy = sk.metrics.accuracy_score(y_test, prediction)
# # print("Test Accuracy: {}".format(accuracy))
# print(sk.metrics.accuracy_score(y_test, gosdt.predict(X_test)))
# print(sk.metrics.roc_auc_score(y_test, gosdt.predict_proba(X_test)[:, 1]))
# print(sk.metrics.average_precision_score(y_test, gosdt.predict_proba(X_test)[:, 1]))
# print(gosdt.tree_)

## rulefit benchmark

In [75]:
# RuleFit baseline, fit on the training split.
rulefit = imodels.RuleFitClassifier(
    max_rules=25,
    n_estimators=100,
    include_linear=True,
    random_state=0,
    cv=False,
)
rulefit.fit(X_train, y_train, feature_names=feature_names)

RuleFitClassifier(cv=False, max_rules=25, random_state=0)

In [76]:
# Score on the test split. Column 1 of predict_proba is computed once and
# reused as the score for both ranking metrics.
rulefit_test_scores = rulefit.predict_proba(X_test)[:, 1]
# print(sk.metrics.accuracy_score(y_test, rulefit.predict(X_test)))
print(sk.metrics.roc_auc_score(y_test, rulefit_test_scores))
print(sk.metrics.average_precision_score(y_test, rulefit_test_scores))

0.7651096801194914
0.5280743178182636


In [None]:
# Display the fitted RuleFit model via its visualize() helper.
# NOTE(review): this cell has no execution count or saved output — presumably never run in this session.
rulefit.visualize()

In [33]:
# Complexity value reported by the fitted RuleFit model.
rulefit.complexity_

18

## BRS benchmark

In [81]:
# Boosted rule set: 3 boosting rounds over decision trees capped at 3 leaves.
small_tree_factory = partial(sk.tree.DecisionTreeClassifier, max_leaf_nodes=3)
brs = imodels.BoostedRulesClassifier(
    n_estimators=3,
    estimator=small_tree_factory,
)
brs.fit(X_train, y_train, feature_names=feature_names)

BoostedRulesClassifier(estimator=functools.partial(<class 'sklearn.tree._classes.DecisionTreeClassifier'>, max_leaf_nodes=3),
                       n_estimators=3)

In [82]:
# List the rules held by the fitted boosted-rules model.
brs.rules_

[pay_0_1.75_to_3.0 <= 0.5 and pay_2_1.75_to_3.0 <= 0.5,
 pay_0_1.75_to_3.0 <= 0.5 and pay_2_1.75_to_3.0 > 0.5,
 pay_0_1.75_to_3.0 > 0.5,
 pay_0_0.5_to_1.75 <= 0.5 and pay_0_3.0_to_4.25 <= 0.5,
 pay_0_0.5_to_1.75 <= 0.5 and pay_0_3.0_to_4.25 > 0.5,
 pay_0_0.5_to_1.75 > 0.5,
 limit_bal_10000.0_to_133750.0 <= 0.5 and pay_3_1.75_to_3.0 <= 0.5,
 limit_bal_10000.0_to_133750.0 > 0.5 and pay_3_1.75_to_3.0 <= 0.5,
 pay_3_1.75_to_3.0 > 0.5]

## SAPS (`SaplingSumClassifier`)

In [40]:
# Sapling-sum model limited to 5 rules, fit on the training split.
saps = imodels.SaplingSumClassifier(
    max_rules=5,
)
saps.fit(
    X_train,
    y_train,
    feature_names=feature_names,
)

SaplingSumClassifier(max_rules=5)

In [44]:
# saps.trees_

In [41]:
# Complexity value reported by the fitted sapling-sum model.
saps.complexity_

5

### GRL (`GreedyRuleListClassifier`)

In [89]:
# Greedy rule list with class 1 weighted 3x relative to class 0.
grl = imodels.GreedyRuleListClassifier(
    max_depth=7,
    class_weight={0: 1, 1: 3},
    criterion='neg_corr',
)
# The return value of fit() is kept in `par_node`; assigning it also keeps the cell from echoing it.
par_node = grl.fit(
    X_train,
    y_train,
    feature_names=feature_names,
)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [90]:
# Score on the test split. Column 1 of predict_proba is computed once and
# reused as the score for both ranking metrics.
grl_test_scores = grl.predict_proba(X_test)[:, 1]
print(sk.metrics.roc_auc_score(y_test, grl_test_scores))
print(sk.metrics.average_precision_score(y_test, grl_test_scores))

0.7468861522113959
0.47854772522945055


In [92]:
# Number of entries in the fitted greedy rule list's rules_.
len(grl.rules_)

7

### skope

In [37]:
# SkopeRules baseline: depth-2 rules with precision_min set to 0.7.
skope = imodels.SkopeRulesClassifier(
    precision_min=0.7,
    max_depth=2,
)
skope.fit(
    X_train,
    y_train,
    feature_names=feature_names,
)

SkopeRulesClassifier(max_depth=2, precision_min=0.7)

In [38]:
# List the rules held by the fitted SkopeRules model.
skope.rules_

[education:5 <= 0.5 and pay_0_3.0_to_8.0_0.875_to_1.0 > 0.5,
 education:1 <= 0.5 and pay_0_3.0_to_8.0_0.875_to_1.0 > 0.5,
 pay_0_3.0_to_8.0_0.875_to_1.0 > 0.5 and pay_2_3.0_to_8.0_0.875_to_1.0 <= 0.5,
 pay_2_3.0_to_8.0_0.875_to_1.0 <= 0.5 and pay_5_3.0_to_8.0_0.875_to_1.0 > 0.5,
 pay_0_3.0_to_8.0_0.875_to_1.0 <= 0.5 and pay_5_3.0_to_8.0_0.875_to_1.0 > 0.5,
 pay_5_3.0_to_8.0_0.875_to_1.0 <= 0.5 and pay_6_3.0_to_8.0_0.875_to_1.0 > 0.5]

In [39]:
# Complexity value reported by the fitted SkopeRules model.
skope.complexity_

12

## stablerules

In [60]:
# Inspect the current (post-discretization) feature names.
# NOTE(review): the saved output lists credit-card style columns (e.g. `pay_0_...`) even though
# the data cell loads 'juvenile_clean.csv' — the output looks stale; re-run from a fresh kernel to confirm.
feature_names

array(['limit_bal_505000.0_to_1000000.0_0.875_to_1.0',
       'age_50.0_to_79.0_0.875_to_1.0', 'pay_0_3.0_to_8.0_0.875_to_1.0',
       'pay_2_3.0_to_8.0_0.875_to_1.0', 'pay_3_3.0_to_8.0_0.875_to_1.0',
       'pay_4_3.0_to_8.0_0.875_to_1.0', 'pay_5_3.0_to_8.0_0.875_to_1.0',
       'pay_6_3.0_to_8.0_0.875_to_1.0',
       'bill_amt1_399465.5_to_964511.0_0.875_to_1.0',
       'bill_amt2_457077.0_to_983931.0_0.875_to_1.0',
       'bill_amt3_753412.5_to_1664089.0_0.875_to_1.0',
       'bill_amt4_360793.0_to_891586.0_0.875_to_1.0',
       'bill_amt5_422918.5_to_927171.0_0.875_to_1.0',
       'bill_amt6_311030.5_to_961664.0_0.875_to_1.0',
       'pay_amt1_436776.0_to_873552.0_0.875_to_1.0',
       'pay_amt2_842129.5_to_1684259.0_0.875_to_1.0',
       'pay_amt3_448020.0_to_896040.0_0.875_to_1.0',
       'pay_amt4_310500.0_to_621000.0_0.875_to_1.0',
       'pay_amt5_213264.5_to_426529.0_0.875_to_1.0',
       'pay_amt6_264333.0_to_528666.0_0.875_to_1.0', 'sex:1', 'sex:2',
       'education:0', 'e

In [83]:
# Load the per-submodel comparison frames (the 'df' entry of each results pickle).
# Each file is opened in a `with` block so its handle is closed right after loading.
# pickle can execute arbitrary code on load; only open result files produced by this project.
submodel_dfs = []
for submodel in ['rulefit', 'skope_rules', 'brs']:
    with open(oj('results/stablerules/credit/cv', f'{submodel}_comparisons.pkl'), 'rb') as results_file:
        submodel_dfs.append(pkl.load(results_file)['df'])

In [85]:
# Stable linear classifier built from the rules of the submodels loaded into `submodel_dfs`.
# NOTE(review): `submodel_dfs` is read from the 'credit' results directory while the data cell
# loads 'juvenile_clean.csv' — confirm the rules and the training data refer to the same dataset.
stbl = stbl_local(
    max_complexity=20,
    metric='avg_precision',
    include_linear=True,
    max_rules=5,
    cv=False,
    p_filtering=None,
    random_state=0,
)
stbl.set_rules(submodel_dfs, '_train')
stbl.fit(X_train, y_train, feature_names=feature_names)

StableLinearClassifier(cv=False, include_linear=True, max_complexity=20,
                       max_rules=5, metric='avg_precision', random_state=0)

In [86]:
# Score on the test split. Column 1 of predict_proba is computed once and
# reused as the score for both ranking metrics.
stbl_test_scores = stbl.predict_proba(X_test)[:, 1]
print(sk.metrics.roc_auc_score(y_test, stbl_test_scores))
print(sk.metrics.average_precision_score(y_test, stbl_test_scores))

0.7403195045912025
0.46412212457352703


In [87]:
# Display the fitted stable model; the saved output is a table with `rule` and `coef` columns.
stbl.visualize()

Unnamed: 0,rule,coef
0,limit_bal_10000.0_to_133750.0,0.09
17,pay_0_-0.75_to_0.5,-0.17
162,pay_0_1.75_to_3.0 <= 0.5 and pay_2_1.75_to_3.0 <= 0.5,-0.84
163,pay_0_1.75_to_3.0 <= 0.5 and pay_3_1.75_to_3.0 <= 0.5,-0.75


## Running experiments 

In [19]:
!python 01_run_comparisons.py --config rulevetting --dataset csi --splitting_strategy cv --model brs --ignore_cache
!python 01_run_comparisons.py --config rulevetting --dataset csi --splitting_strategy cv --model grl --ignore_cache
!python 01_run_comparisons.py --config rulevetting --dataset csi --splitting_strategy cv --model random_forest --ignore_cache
!python 01_run_comparisons.py --config rulevetting --dataset csi --splitting_strategy cv --model gradient_boosting --ignore_cache
!python 01_run_comparisons.py --config rulevetting --dataset csi --splitting_strategy cv --model saps --ignore_cache
!python 01_run_comparisons.py --config rulevetting --dataset csi --splitting_strategy cv --model cart --ignore_cache

dset csi ['csi']
running datasets ['csi'] estimators [[brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs]]
saving to /Users/keyan/bair/imodels-experiments/results
  0%|                                                     | 0/1 [00:00<?, ?it/s]	dataset csi ests [brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs, brs]

  0%|                                                    | 0/36 [00:00<?, ?it/s][A
  3%|█▏                                          | 1/36 [00:00<00:05,  5.85it/s][A
  6%|██▍                                         | 2/36 [00:00<00:06,  5.22it/s][A
  8%|███▋                                        | 3/36 [00:00<00:07,  4.54it/s][A
 11%|████▉                                       | 4/36 [00:00<00:07,  4.57it/s][A
 14%|███

In [56]:
# NOTE(review): the variable is called `saps_df` but the file loaded is the BRS comparison
# pickle — confirm which model was intended. The name is kept because later cells reference it.
fname = oj('results', 'stablerules', 'credit', 'cv/brs_comparisons.pkl')
# Open in a `with` block so the file handle is closed after loading.
# pickle can execute arbitrary code on load; only open result files produced by this project.
with open(fname, 'rb') as results_file:
    saps_df = pkl.load(results_file)['df']
# saps_df.loc[:, ['avg_prec' in col or 'id' in col or 'n_est' in col for col in saps_df.columns]]
# df = saps_df.loc[:, ['spec' in col or 'vars_train' in col for col in saps_df.columns]]
# df[df['best_spec_0.9_sens_cv_mean'] > 0.3]['csi_vars_train']
# df
# Rules stored for the row at position 10.
saps_df['vars_train'].iloc[10]['rules_']

[pay_0_1.75_to_3.0 <= 0.5 and pay_2_1.75_to_3.0 <= 0.5,
 pay_0_1.75_to_3.0 <= 0.5 and pay_2_1.75_to_3.0 > 0.5,
 pay_0_1.75_to_3.0 > 0.5,
 pay_0_0.5_to_1.75 <= 0.5 and pay_0_3.0_to_4.25 <= 0.5,
 pay_0_0.5_to_1.75 <= 0.5 and pay_0_3.0_to_4.25 > 0.5,
 pay_0_0.5_to_1.75 > 0.5,
 limit_bal_10000.0_to_133750.0 <= 0.5 and pay_3_1.75_to_3.0 <= 0.5,
 limit_bal_10000.0_to_133750.0 > 0.5 and pay_3_1.75_to_3.0 <= 0.5,
 pay_3_1.75_to_3.0 > 0.5,
 pay_0_1.75_to_3.0 <= 0.5 and pay_6_-0.75_to_0.5 <= 0.5,
 pay_0_1.75_to_3.0 > 0.5 and pay_6_-0.75_to_0.5 <= 0.5,
 pay_6_-0.75_to_0.5 > 0.5]

In [16]:
# Second-to-last entry of the ascending argsort, i.e. the position of the second-largest value.
# `.iloc` makes the lookup explicitly positional; plain `[-2]` on a Series is a label lookup
# that only falls back to position (deprecated) when the index is not integer-typed.
saps_df['best_spec_0.95_sens_cv_mean'].argsort().iloc[-2]

92

In [64]:
# Keep only the columns whose name contains 'mean' or 'vars_train'.
mean_or_vars_cols = [('mean' in col) or ('vars_train' in col) for col in saps_df.columns]
df = saps_df.loc[:, mean_or_vars_cols]
# From those, take 'csi_vars_train' for rows whose 0.98-sensitivity specificity mean exceeds 0.3.
passes_spec_threshold = df['best_spec_0.98_sens_cv_mean'] > 0.3
good_cart_hps = df.loc[passes_spec_threshold, 'csi_vars_train']

In [17]:
from util import get_best_model_rules_under_complexity

In [21]:
# Look up the best RuleFit rules from the rulevetting/csi CV results.
# NOTE(review): the remaining arguments are presumably (metric, fold, max complexity) — confirm against util.
rulefit_cv_results_path = oj('results', 'rulevetting', 'csi', 'cv/rulefit_comparisons.pkl')
get_best_model_rules_under_complexity(rulefit_cv_results_path, 'best_spec_0.98_sens', 'fold_0', 5)

[FocalNeuroFindings2 <= 0.5 and Torticollis2 <= 0.5,
 FocalNeuroFindings2 <= 0.5 and Torticollis2 > 0.5]