In [1]:
%cd ../../../..
%pwd

/Users/jbbutler129/Google Drive (butlerj@berkeley.edu)/Classes/215A/final_project/rule-vetting


'/Users/jbbutler129/Google Drive (butlerj@berkeley.edu)/Classes/215A/final_project/rule-vetting'

In [2]:
# importing necessary packages

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import rulevetting.api.viz as viz
from rulevetting.projects.tbi_pecarn.dataset import Dataset

import os
import pickle as pkl
from os.path import join as oj

from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier, plot_tree

import imodels
from rulevetting.api import validation


MODELS_DIR = './models'
os.makedirs(MODELS_DIR, exist_ok=True)

outcome_def = 'outcome'  # output

## Loading Data with Default Judgement Calls

In [3]:
df_train_1, df_tune_1, df_test_1 = Dataset().get_data()

kwargs {'clean_data': {}, 'preprocess_data': {'step1_injMech': False, 'step5_missSubGCS': True, 'step5_fake15GCS': True, 'step5_fake14GCS': True, 'step8_missingOSI': True, 'step9_HEMA': 3, 'step10_cautiousUncl': True, 'step14_vomitDtls': False, 'step15_HA': 2, 'step15_HAStart': False, 'step16_Seiz': 2, 'step16_SeizOccur': False, 'step17_cautiousUncl': True, 'step19_Drugs': False, 'step20_ActNormal': True}, 'extract_features': {'HEMA_umbrella': False, 'SFxPalp_umbrella': False, 'SFxBas_umbrella': False, 'AMS_umbrella': False, 'Clav_umbrella': False, 'NeuroD_umbrella': False, 'Vomit_umbrella': False, 'HA_umbrella': False, 'Seiz_umbrella': False, 'LOC_umbrella': False, 'remove_constVal': True}}


## Classifier: CORELS (Certifiably Optimal Rule Lists Classifer)

(as opposed to CART, gives a certificate of optimality)

First, I will run the classifier on just the larger umbrella variables, as opposed to the subvariables because that seemed to yield complicated results for Xin

In [4]:
keys_to_keep = ['LOCSeparate',
 'Seiz',
 'ActNorm',
 'Vomit',
 'GCSEye',
 'GCSVerbal',
 'GCSMotor',
 'AMS',
 'SFxPalp',
 'FontBulg',
 'SFxBas',
 'Hema',
 'Clav',
 'NeuroD',
 'OSI',
 'High_impact_InjSev_1',
 'High_impact_InjSev_2',
 'High_impact_InjSev_3',
 'Amnesia_verb_0',
 'Amnesia_verb_1',
 'Amnesia_verb_91',
 'HA_verb_0',
 'HA_verb_1',
 'HA_verb_91']

### Should we implement this?

NOTE: Binary recoding of the GCS variables (either full score or not)

In [44]:
def recode_GCS(df):
    
    keys = list(df.keys())
    
    for key in keys:
        if 'GCS' in key:
            unique_vals = tuple(df[key].unique())
            replace = (1, 0)
            df[key].replace(unique_vals, replace, inplace = True)

In [45]:
recode_GCS(df_train_1)
recode_GCS(df_tune_1)
recode_GCS(df_test_1)

In [46]:
umbrella_subset_tr = df_train_1[keys_to_keep]
umbrella_subset_tu = df_tune_1[keys_to_keep]
umbrella_subset_te = df_test_1[keys_to_keep]

In [92]:
# penalization parameter: penalizes longer rulelists
c = 0.001
# maximum number of rulelists to search for before exiting
n_iter = 100000
# policy: search policy for traversing the tree
policy = 'dfs'

corel = imodels.OptimalRuleListClassifier(c, n_iter, policy = policy)

In [93]:
corel.fit(umbrella_subset_tr, df_train_1['outcome'], prediction_name = 'outcome')

Interesting, so the rule list that it is generating is simply that the outcome is always False. Will probably have to tune this so that it doesn't misclassify..

In [94]:
corel.rl()

RULELIST:
outcome = False
All features: (['LOCSeparate', 'Seiz', 'ActNorm', 'Vomit', 'GCSEye', 'GCSVerbal', 'GCSMotor', 'AMS', 'SFxPalp', 'FontBulg', 'SFxBas', 'Hema', 'Clav', 'NeuroD', 'OSI', 'High_impact_InjSev_1', 'High_impact_InjSev_2', 'High_impact_InjSev_3', 'Amnesia_verb_0', 'Amnesia_verb_1', 'Amnesia_verb_91', 'HA_verb_0', 'HA_verb_1', 'HA_verb_91'])

## What if I made the outcomes a bit more balanced, what would it give me?

In [99]:
trues = df_train_1[df_train_1['outcome'] == 1]
falses = df_train_1[df_train_1['outcome'] == 0][1:150]

In [100]:
balanced_set = pd.concat([trues, falses])
outcome = balanced_set['outcome']
balanced_set = balanced_set[keys_to_keep]

In [105]:
recode_GCS(balanced_set)

In [113]:
c = 0.0001
# maximum number of rulelists to search for before exiting
n_iter = 1000000
policy = 'dfs'
corel = imodels.OptimalRuleListClassifier(c, n_iter, class_weight)
corel.fit(balanced_set, outcome, prediction_name = 'outcome')

In [114]:
corel.rl()

RULELIST:
if [SFxPalp && not Hema]:
  outcome = True
else if [SFxPalp && High_impact_InjSev_3]:
  outcome = True
else if [not AMS && not SFxBas]:
  outcome = False
else 
  outcome = True
All features: (['LOCSeparate', 'Seiz', 'ActNorm', 'Vomit', 'GCSEye', 'GCSVerbal', 'GCSMotor', 'AMS', 'SFxPalp', 'FontBulg', 'SFxBas', 'Hema', 'Clav', 'NeuroD', 'OSI', 'High_impact_InjSev_1', 'High_impact_InjSev_2', 'High_impact_InjSev_3', 'Amnesia_verb_0', 'Amnesia_verb_1', 'Amnesia_verb_91', 'HA_verb_0', 'HA_verb_1', 'HA_verb_91'])

In [98]:
corel.predict(umbrella_subset_tu)

array([0, 0, 0, ..., 0, 0, 0])

Recode the outcome as 1 if you don't have a TBI, 0 otherwise?

## Classifier: GOSDT

In [117]:
gosdt = imodels.OptimalTreeClassifier(rule_list = True)
gosdt.fit(umbrella_subset_tr, df_train_1['outcome'])

  "Should install gosdt C++ extenstion. On x86_64 linux or macOS: "


<imodels.tree.gosdt.pygosdt.OptimalTreeClassifier at 0x7f87509ba050>

## Classifier: Bayesian Rule Classifier

In [43]:
bayes = imodels.BayesianRuleSetClassifier()
bayes.fit(umbrella_subset_tr, df_train_1['outcome'], verbose = True)

KeyboardInterrupt: 

### Making sure Xin's stuff works for mine as well

In [16]:
rulefit = imodels.RuleFitRegressor(max_rules=5)
rulefit.fit(umbrella_subset_tr, df_train_1['outcome'])

RuleFitRegressor(max_rules=5)

In [22]:
rules = rulefit.get_rules() 
rules = rules[rules['type']!='linear']
rules = rules[rules['coef'] != 0] 
rules = rules.sort_values('importance', ascending=False) 
rules

Unnamed: 0,rule,type,coef,support,importance
24,SFxBas <= 0.5 and GCSEye > 0.5 and GCSVerbal >...,rule,-0.033034,0.955683,0.006798
26,SFxBas <= 0.5 and NeuroD <= 0.5 and High_impac...,rule,-0.006262,0.845541,0.002263
27,ActNorm > 0.5,rule,-0.001079,0.872913,0.000359
25,LOCSeparate <= 0.5,rule,-0.000915,0.899734,0.000275
