In [2]:
%load_ext autoreload
%autoreload 2
import os
import pickle as pkl
from functools import partial
from os.path import join as oj

import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
import sklearn as sk

import imodels
from imodels.util import data_util
from imodels.discretization import discretizer, simple

import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rcParams['figure.dpi'] = 250

# change working directory to project root
# Walk upwards until the current directory is named 'imodels-experiments'.
# os.path.basename is used instead of splitting on '/' so the check also works
# with Windows path separators, and the walk stops at the filesystem root
# (where a directory is its own parent) instead of looping forever when the
# notebook is started outside the project tree.
_start_dir = os.getcwd()
while os.path.basename(os.getcwd()) != 'imodels-experiments':
    _parent_dir = os.path.dirname(os.getcwd())
    if _parent_dir == os.getcwd():
        # Undo the partial walk so the kernel is left where it started.
        os.chdir(_start_dir)
        raise FileNotFoundError(
            "could not find an 'imodels-experiments' directory above " + _start_dir)
    os.chdir(_parent_dir)

import viz
# from experiments.util import get_comparison_result

np.random.seed(0)

In [89]:
# Load the cleaned COMPAS two-year dataset from the 'imodels' data source as
# a feature matrix, target vector and feature names.
X, y, feature_names = data_util.get_clean_dataset('compas_two_year_clean.csv', data_source='imodels')

In [90]:
# Discretizer applied to the first seven features only, two bins each with the
# 'uniform' strategy; the remaining columns are not listed in dcols.
eb_disc = discretizer.ExtraBasicDiscretizer(dcols=feature_names[:7], n_bins=2, strategy='uniform')

In [91]:
# Discretize the selected columns, then replace X / feature_names with the
# discretized integer matrix and its column names.
# NOTE(review): X and feature_names are reassigned here, so this cell is not
# idempotent — re-run the data-loading cell before executing it a second time.
X_disc_df = eb_disc.fit_transform(pd.DataFrame(X, columns=feature_names))
X, feature_names = X_disc_df.values.astype(int), X_disc_df.columns.values
# A fixed random_state keeps the train/test partition the same across runs.
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, random_state=0)

## try out corels

In [92]:
# Fit an optimal rule list on the discretized training split.
# feature_names is converted from a numpy array to a plain list before being
# passed to fit(); the rule list printed further down uses these names.
corels = imodels.OptimalRuleListClassifier(c=0.01, n_iter=100000)
corels.fit(X_train, y_train, feature_names=feature_names.tolist())

In [93]:
# Held-out evaluation of the rule list: accuracy from hard predictions, then
# ROC AUC and average precision from column 1 of predict_proba (presumably the
# positive class — confirm against the estimator's class order).
# The probabilities are computed once and shared by both ranking metrics
# instead of running inference twice on the same data.
corels_proba_test = corels.predict_proba(X_test)[:, 1]
print(sk.metrics.accuracy_score(y_test, corels.predict(X_test)))
print(sk.metrics.roc_auc_score(y_test, corels_proba_test))
print(sk.metrics.average_precision_score(y_test, corels_proba_test))

0.5910563836681789
0.5877727823802789
0.5295720756947329


In [94]:
corels.rl()

RULELIST:
if [age_cat:25_-_45 && not race:African-American]:
  prediction = False
else if [sex:Male && not age_cat:Greater_than_45]:
  prediction = True
else 
  prediction = False
All features: (['age_57.0_to_96.0', 'priors_count_19.0_to_38.0', 'days_b_screening_arrest_0.0_to_30.0', 'c_jail_time_399.0_to_799.0', 'juv_fel_count_10.0_to_20.0', 'juv_other_count_4.5_to_9.0', 'juv_misd_count_6.5_to_13.0', 'c_charge_degree:F', 'c_charge_degree:M', 'race:African-American', 'race:Asian', 'race:Caucasian', 'race:Hispanic', 'race:Native_American', 'race:Other', 'age_cat:25_-_45', 'age_cat:Greater_than_45', 'age_cat:Less_than_25', 'sex:Female', 'sex:Male'])

## try out gosdt

In [65]:
# X, y, feature_names = data_util.get_clean_dataset('credit_card_clean.csv', data_source='imodels')
# dataframe = pd.DataFrame(np.concatenate((X, np.expand_dims(y, axis=1)), axis=1), columns=list(feature_names) + ['target'])

# X = dataframe.loc[:1000, dataframe.columns[:5]]
# y = dataframe.loc[:1000, dataframe.columns[-1:]]
# X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, random_state=0)

In [66]:
# Fit an optimal decision tree on the training split.
# NOTE(review): fit() receives no feature_names here, and the tree printed
# further down refers to columns by index (X_0, X_39, ...). That output uses
# X_39 although the COMPAS feature list printed above has 20 entries, and this
# cell's execution count (66) is lower than the data-loading cells (89-91), so
# the saved outputs look stale — re-run the notebook top to bottom.
gosdt = imodels.OptimalTreeClassifier(
    regularization=0.002,
    time_limit=30)

gosdt.fit(X_train, y_train)
# The saved output reports ~37.8 here despite time_limit=30 (units are not
# visible in this notebook — TODO confirm how time_limit is enforced).
print("Execution Time: {}".format(gosdt.time_))

Execution Time: 37.8120002746582


In [79]:
# Held-out evaluation of the fitted tree, followed by the tree itself.
# Hard predictions and column-1 probabilities (presumably the positive class —
# confirm against the estimator's class order) are each computed once and
# reused, rather than re-running inference for every metric.
prediction = gosdt.predict(X_test)
gosdt_proba_test = gosdt.predict_proba(X_test)[:, 1]
print(sk.metrics.accuracy_score(y_test, prediction))
print(sk.metrics.roc_auc_score(y_test, gosdt_proba_test))
print(sk.metrics.average_precision_score(y_test, gosdt_proba_test))
print(gosdt.tree_)

0.5962410887880751
0.44391964706674747
0.4797396596929082
if X_0 = 1 and X_39 = 1 then:
    predicted target: 0
    misclassification penalty: 0.051
    complexity penalty: 0.002

else if X_0 = 1 and X_39 != 1 then:
    predicted target: 1
    misclassification penalty: 0.229
    complexity penalty: 0.002

else if X_0 != 1 and X_4 = 1 then:
    predicted target: 0
    misclassification penalty: 0.086
    complexity penalty: 0.002

else if X_0 != 1 and X_4 != 1 then:
    predicted target: 1
    misclassification penalty: 0.011
    complexity penalty: 0.002


## rulefit benchmark

In [95]:
# RuleFit baseline with linear terms disabled (include_linear=False) and
# n_estimators=5, seeded for reproducibility.
# alpha is passed explicitly; the saved output shows the library warning that
# the max_rules parameter is ignored in that case.
rulefit = imodels.RuleFitClassifier(alpha=1, n_estimators=5, include_linear=False, random_state=0)
rulefit.fit(X_train, y_train, feature_names=feature_names)

  warn("Ignoring max_rules parameter since alpha passed explicitly")


RuleFitClassifier(alpha=1, include_linear=False, n_estimators=5, random_state=0)

In [96]:
# Held-out evaluation of RuleFit: accuracy from hard predictions, then ROC AUC
# and average precision from column 1 of predict_proba (presumably the positive
# class — confirm against the estimator's class order).
# The probabilities are computed once and shared by both ranking metrics
# instead of running inference twice on the same data.
rulefit_proba_test = rulefit.predict_proba(X_test)[:, 1]
print(sk.metrics.accuracy_score(y_test, rulefit.predict(X_test)))
print(sk.metrics.roc_auc_score(y_test, rulefit_proba_test))
print(sk.metrics.average_precision_score(y_test, rulefit_proba_test))

0.5236552171095269
0.6032952785074427
0.5471815616200513


In [88]:
# Show the rules extracted by RuleFit.
# NOTE(review): the saved output below comes from an earlier execution
# (In [88], before the fit in In [95]) and lists features such as
# age_18.0_to_37.5 that are absent from the feature list printed for the
# current discretization — re-run this cell after fitting.
rulefit.get_rules()

Unnamed: 0,rule,type,coef,support,importance
0,age_18.0_to_37.5 <= 0.5 and priors_count_0.0_t...,rule,-0.215274,0.080363,0.058523
1,age_18.0_to_37.5 <= 0.5,rule,-0.394429,0.31994,0.183983
2,age_18.0_to_37.5 > 0.5 and sex:Male <= 0.5,rule,-0.169085,0.127673,0.056428
3,age_cat:Less_than_25 <= 0.5 and priors_count_0...,rule,-0.363739,0.691078,0.168065
4,age_18.0_to_37.5 > 0.5 and priors_count_0.0_to...,rule,-0.028657,0.623461,0.013885
5,age_18.0_to_37.5 > 0.5 and sex:Male > 0.5,rule,0.491793,0.552387,0.244543
6,priors_count_0.0_to_9.5 <= 0.5,rule,1.288002,0.093757,0.37544


## BRS benchmark

In [248]:
# Boosted-rules baseline with a single estimator built from a depth-1 decision
# tree; partial() pre-binds max_depth=1 so a constructor (not an instance) is
# passed as `estimator`.
brs = imodels.BoostedRulesClassifier(n_estimators=1, estimator=partial(sk.tree.DecisionTreeClassifier, max_depth=1))
brs.fit(X_train, y_train, feature_names=feature_names)

BoostedRulesClassifier(n_estimators=1)

In [249]:
brs.rules_

[race:African-American <= 0.5, race:African-American > 0.5]

In [104]:
# NOTE(review): scratch experiment with dict unpacking, unrelated to the model
# benchmarks in this notebook — this and the following few cells that only
# inspect `a` / `b` are candidates for removal.
a = {'ye': 1}
# ** unpacking appends the entries of `a` after the literal 'yes' key.
b = {'yes': 2, **a}

In [105]:
b

{'yes': 2, 'ye': 1}

In [107]:
for c, d in b.items():
    print(c, d)

yes 2
ye 1


In [187]:
a.copy().pop('ye')

1

In [188]:
a

{'ye': 1}

In [175]:
from collections import OrderedDict
import itertools

In [183]:
# Hyperparameter search space, keyed by parameter name; each value is the
# sequence of candidates to try for that parameter. Insertion order is kept.
random_forest_grid = OrderedDict([
    ('n_estimators', np.arange(1, 10)),
    ('max_samples', [0.8, 0.9, 1.0]),
    ('max_depth', [2, 3]),
])

In [184]:
random_forest_grid

AttributeError: 'collections.OrderedDict' object has no attribute 'remove'

In [177]:
def grid_to_kwargs(grid: OrderedDict):
    """Expand a parameter grid into one keyword-argument dict per combination.

    Params
    ------
    grid
        Mapping from parameter name to an iterable of candidate values.

    Returns
    -------
    list of dict
        One dict per element of the Cartesian product of the value iterables,
        in ``itertools.product`` order (the last parameter varies fastest).
        Keys follow the iteration order of ``grid``. An empty grid yields
        ``[{}]``; a grid containing an empty value iterable yields ``[]``.
    """
    # Snapshot the names once rather than rebuilding the key list for every
    # entry of every combination.
    param_names = list(grid.keys())
    return [dict(zip(param_names, args_combo))
            for args_combo in itertools.product(*grid.values())]

In [None]:
grid_to_kwargs(random_forest_grid)

In [8]:
!python 01_run_comparisons.py --config rulevetting --dataset csi --splitting_strategy cv --model saps

dset csi ['csi']
running datasets ['csi'] estimators [[saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps]]
saving to /accounts/projects/binyu/keyan3/imodels-experiments/results
  0%|                                                     | 0/1 [00:00<?, ?it/s]	dataset csi ests [saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps, saps]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start,

In [10]:
# Load the SAPS comparison results for the csi dataset written by the
# 01_run_comparisons.py run above (cv splitting strategy).
# Each path component is passed separately so join() supplies the separator.
fname = oj('results', 'rulevetting', 'csi', 'cv', 'saps_comparisons.pkl')
# NOTE: unpickling can execute arbitrary code — only load result files that
# were produced by a trusted run.
# The context manager closes the file handle, which a bare open() call left
# to the garbage collector.
with open(fname, 'rb') as results_file:
    saps_df = pkl.load(results_file)['df']

In [16]:
# Display only the columns whose name contains 'train'.
is_train_column = ['train' in column_name for column_name in saps_df.columns]
saps_df.loc[:, is_train_column]

Unnamed: 0,csi_rocauc_train,csi_vars_train,csi_complexity_train,csi_time_train,csi_accuracy_train,csi_f1_train,csi_recall_train,csi_precision_train,csi_avg_precision_train,csi_best_accuracy_train,csi_best_spec_0.96_sens_train,csi_best_spec_0.98_sens_train
saps,0.660132,"{'max_rules': 1, 'posthoc_ridge': False, 'incl...",1,0.024599,0.835734,0.0,0.0,0.0,0.284576,0.835734,0.0,0.0
saps,0.700489,"{'max_rules': 2, 'posthoc_ridge': False, 'incl...",2,0.06916,0.851304,0.17316,0.094787,1.0,0.372467,0.851304,0.0,0.0
saps,0.742442,"{'max_rules': 3, 'posthoc_ridge': False, 'incl...",3,0.128966,0.864149,0.377897,0.251185,0.76259,0.445181,0.864149,0.0,0.0
saps,0.75093,"{'max_rules': 4, 'posthoc_ridge': False, 'incl...",4,0.211567,0.866874,0.414384,0.28673,0.746914,0.469153,0.866874,0.0,0.0
saps,0.775302,"{'max_rules': 5, 'posthoc_ridge': False, 'incl...",5,0.312551,0.866096,0.424749,0.300948,0.721591,0.499819,0.866096,0.0,0.0
saps,0.783636,"{'max_rules': 6, 'posthoc_ridge': False, 'incl...",6,0.412265,0.866874,0.426174,0.300948,0.729885,0.512772,0.866874,0.013973,0.013973
saps,0.786402,"{'max_rules': 7, 'posthoc_ridge': False, 'incl...",7,0.516771,0.873881,0.472313,0.343602,0.755208,0.541563,0.873881,0.013973,0.013973
saps,0.793423,"{'max_rules': 8, 'posthoc_ridge': False, 'incl...",8,0.613601,0.877773,0.480132,0.343602,0.796703,0.558828,0.877773,0.022823,0.022823
saps,0.794501,"{'max_rules': 9, 'posthoc_ridge': False, 'incl...",9,0.730142,0.880498,0.497545,0.36019,0.804233,0.571103,0.880498,0.022823,0.022823
saps,0.820393,"{'max_rules': 10, 'posthoc_ridge': False, 'inc...",10,0.862041,0.876995,0.437722,0.291469,0.878571,0.602321,0.880498,0.019562,0.019562


In [None]:
# fname = oj('results', 'saps', 'recidivism', 'train-test/Rulefit_comparisons.pkl')
# pkl.load(open(fname, 'rb'))['df']