In [3]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import sys 
import os

# Walk four directory levels up from the notebook's folder so that the
# rule-vetting project root becomes the working directory.
# The starting directory is remembered the first time this cell runs, so
# re-running the cell always lands in the same place instead of climbing
# four more levels on every execution.
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
_project_root = _NOTEBOOK_START_DIR
for _level in range(4):
    _project_root = os.path.dirname(_project_root)
os.chdir(_project_root)
print(os.getcwd())

import rulevetting.api.viz as viz
from rulevetting.projects.csi_pecarn.av_dataset import Dataset

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/


In [4]:
# Load the data once with the default control types and once per single
# control type ('ran', 'moi', 'ems').
# NOTE(review): the saved output of this cell ends with `KeyError: 1`, so the
# positional indexing below presumably did not succeed on the last run --
# confirm that get_data() returns something indexable by 0, 1, 2.
avdf_all = Dataset().get_data(use_robust_av = False)
avdf_ran = Dataset().get_data(use_robust_av = False, control_types=['ran'])
avdf_moi = Dataset().get_data(use_robust_av = False, control_types=['moi'])
avdf_ems = Dataset().get_data(use_robust_av = False, control_types=['ems'])

# Positions 0 / 1 / 2 of each result are bound to *_train / *_tuning / *_testing.
avdf_ran_train = avdf_ran[0]
avdf_ran_tuning = avdf_ran[1]
avdf_ran_testing = avdf_ran[2]

avdf_all_train = avdf_all[0]
avdf_all_tuning = avdf_all[1]
avdf_all_testing = avdf_all[2]

avdf_ems_train = avdf_ems[0]
avdf_ems_tuning = avdf_ems[1]
avdf_ems_testing = avdf_ems[2]

avdf_moi_train = avdf_moi[0]
avdf_moi_tuning = avdf_moi[1]
avdf_moi_testing = avdf_moi[2]


preprocess_data kwargs {'frac_missing_allowed': 0.05}
split_data kwargs {'control_types': ['ran', 'moi', 'ems']}
['ran', 'moi', 'ems']


KeyError: 1

### Leonard (2011)

In [None]:
# Fixed predictor lists used as reference rules in the evaluation cells.
# `Leonard` is the list passed to evaluate_vlist() for comparison.
Leonard = [
    'AlteredMentalStatus',
    'FocalNeuroFindings',
    'PainNeck',
    'Torticollis',
    'SubInj_TorsoTrunk',
    'Predisposed',
    'HighriskDiving',
    'HighriskMVC',
]
# NOTE(review): `RF` is not referenced by any other visible cell; presumably a
# second reference factor list -- confirm its source.
RF = [
    'AlteredMentalStatus',
    'FocalNeuroFindings',
    'PainNeck',
    'Torticollis',
    'LOC',
    'NonAmbulatory',
    'HighriskDiving',
    'HighriskMVC',
]

In [None]:
# Candidate predictors: every column of the full training split except the label.
v_list = list(avdf_all_train.columns)
v_list.remove('outcome')

In [None]:
# avdf_all_train['indicator'] = avdf_all_train[v_list].sum(axis = 1)
# avdf_all_train['indicator'].hist(by = avdf_all_train['outcome'])

In [None]:
def find_best(data, v_list, method = "gini"):
    '''
    find the best variable to split the data on, from a variable list

    Parameters:
    data: DataFrame with an 'outcome' column and one column per candidate
          variable; rows are counted where a value equals 1 or equals 0
          (same structure as what we get from Dataset().get_data()[0])
    v_list: names of the variables we are considering; it is NOT modified
    method: 'gini' scores a variable by the size-weighted p*(1-p) of its two
            branches; 'semi_gini' scores it by 1 - P(outcome == 1 | variable == 1).
            The lowest score wins; ties go to the earliest variable in v_list.

    Returns:
    [variable_best, remaining, data_update], where remaining is a new list
    equal to v_list without variable_best, and data_update holds the rows of
    data where variable_best == 0.

    Raises:
    ValueError: if v_list is empty or method is not 'gini' / 'semi_gini'.
    '''
    if method not in ('gini', 'semi_gini'):
        raise ValueError("method must be 'gini' or 'semi_gini', got %r" % (method,))

    # Work on a copy so the caller's list is left untouched.
    remaining = list(v_list)
    if not remaining:
        raise ValueError("v_list must contain at least one variable")

    n = data.shape[0]
    is_pos = data['outcome'] == 1
    is_neg = data['outcome'] == 0

    score = []
    for variable in remaining:
        on = data[variable] == 1
        off = data[variable] == 0

        # Count rows with boolean masks instead of materialising filtered frames.
        v1c1 = int((on & is_pos).sum())
        v1c0 = int((on & is_neg).sum())
        v0c1 = int((off & is_pos).sum())
        v0c0 = int((off & is_neg).sum())

        # An empty branch carries no information: fall back to p = 1/2.
        p1 = v1c1/(v1c1+v1c0) if (v1c1+v1c0) else 1/2
        p2 = v0c1/(v0c1+v0c0) if (v0c1+v0c0) else 1/2

        if method == 'gini':
            if n == 0:
                # No rows left: both branch weights are zero.
                score.append(0.0)
            else:
                score.append((v1c1+v1c0)/n * p1 * (1-p1) + (v0c1+v0c0)/n * p2 * (1-p2))
        else:
            score.append(1-p1)

    variable_best = remaining[score.index(min(score))]
    remaining.remove(variable_best)
    data_update = data[data[variable_best] == 0]

    return [variable_best, remaining, data_update]

def _outcome_rates(frame, variable):
    '''Return (n_on, p_on, n_off, p_off): row counts and outcome==1 rates for variable==1 / variable==0.'''
    is_pos = frame['outcome'] == 1
    is_neg = frame['outcome'] == 0
    on = frame[variable] == 1
    off = frame[variable] == 0

    on_pos = int((on & is_pos).sum())
    off_pos = int((off & is_pos).sum())
    n_on = on_pos + int((on & is_neg).sum())
    n_off = off_pos + int((off & is_neg).sum())

    # An empty branch carries no information: fall back to a rate of 1/2.
    p_on = on_pos/n_on if n_on else 1/2
    p_off = off_pos/n_off if n_off else 1/2
    return n_on, p_on, n_off, p_off


def find_best_two(data, v_list, method = "gini", threshold = 0.1):
    '''
    find the best variable to split the data on, plus an optional second
    variable that must also equal 1 for the rule to fire

    Parameters:
    data: DataFrame with an 'outcome' column and one column per candidate
          variable (same structure as what we get from Dataset().get_data()[0])
    v_list: names of the variables we are considering; it is NOT modified
    method: first-stage score, 'gini' or 'semi_gini' (lowest score wins,
            ties go to the earliest variable in v_list)
    threshold: a second variable is attached only when, among rows with
               first variable == 1, the outcome rate of the rows with
               second variable == 0 is at most this value (default 0.1)

    Returns:
    [[variable_best, second], remaining, data_update], where second is a
    variable name, "no need" (no second variable qualifies) or
    "no observations" (no row has variable_best == 1); remaining is a new
    list equal to v_list without variable_best; data_update holds the rows
    of data where variable_best == 0.

    Raises:
    ValueError: if v_list is empty or method is not 'gini' / 'semi_gini'.
    '''
    if method not in ('gini', 'semi_gini'):
        raise ValueError("method must be 'gini' or 'semi_gini', got %r" % (method,))

    # Work on a copy so the caller's list is left untouched.
    remaining = list(v_list)
    if not remaining:
        raise ValueError("v_list must contain at least one variable")

    n = data.shape[0]

    # find the first rule
    score = []
    for variable in remaining:
        n_on, p_on, n_off, p_off = _outcome_rates(data, variable)
        if method == 'gini':
            if n == 0:
                # No rows left: both branch weights are zero.
                score.append(0.0)
            else:
                score.append(n_on/n * p_on * (1-p_on) + n_off/n * p_off * (1-p_off))
        else:
            score.append(1-p_on)

    variable_best = remaining[score.index(min(score))]
    remaining.remove(variable_best)
    data_update = data[data[variable_best] == 0]

    # find the second rule
    data_selected = data[data[variable_best] == 1]
    if data_selected.shape[0] == 0:
        return [[variable_best, "no observations"], remaining, data_update]

    # With no candidate left there is nothing to refine the rule with
    # (previously this case crashed on min() of an empty list).
    if not remaining:
        return [[variable_best, "no need"], remaining, data_update]

    # Score = outcome rate among selected rows where the candidate equals 0.
    # (The gini index here would improve specificity but greatly hurt sensitivity.)
    score = [_outcome_rates(data_selected, variable)[3] for variable in remaining]

    min_score = min(score)
    if min_score > threshold:
        variable_best_two = [variable_best, "no need"]
    else:
        variable_best_two = [variable_best, remaining[score.index(min_score)]]

    return [variable_best_two, remaining, data_update]


def make_decision_ob(observation, v_list):
    '''
    decide 1/0 for a single observation from a list of two-column rules

    Parameters:
    observation: object whose observation[name].item() gives the value of a
                 variable (e.g. a one-row DataFrame)
    v_list: sequence of [first, second] pairs; a rule fires when first == 1
            and second is "no need" / 'no observations' or second == 1

    Returns 1 as soon as any rule fires, otherwise 0.
    '''
    unconditional = ["no need", 'no observations']
    for rule in v_list:
        primary, secondary = rule[0], rule[1]

        # The first variable must be set before the second is even looked at.
        if not (observation[primary].item() == 1):
            continue

        if secondary in unconditional or observation[secondary].item() == 1:
            return 1

    return 0
        

def make_decision_data(data, v_list):
    '''
    apply a list of two-column rules to every row of data

    Parameters:
    data: DataFrame containing every variable named in v_list
    v_list: sequence of [first, second] pairs; a rule fires on a row when
            first == 1 and second is "no need" / 'no observations' or
            second == 1

    Returns a list with one int per row (in row order): 1 if any rule fires,
    otherwise 0.
    '''
    # Boolean masks over whole columns replace the per-row iloc loop; the
    # positional numpy arrays keep the result independent of the index labels.
    flagged = np.zeros(data.shape[0], dtype=bool)
    for rule in v_list:
        v0, v1 = rule[0], rule[1]
        hit = (data[v0] == 1).to_numpy()
        if v1 not in ("no need", 'no observations'):
            hit = hit & (data[v1] == 1).to_numpy()
        flagged |= hit
    return [int(flag) for flag in flagged]

def evaluate_vlist(data, v_list, method = 'one'):
    '''
    sensitivity and specificity of a rule list against data['outcome']

    Parameters:
    data: DataFrame with an 'outcome' column and the variables used by v_list
    v_list: for method 'one', a list of column names (predict 1 when their
            row sum is > 0); for method 'two', a list of two-column rules as
            accepted by make_decision_data
    method: 'one' or 'two'

    Returns:
    [sensitivity, specificity]; a value is NaN when it is undefined because
    data has no row with outcome == 1 (sensitivity) or outcome == 0
    (specificity).

    Raises:
    ValueError: if method is neither 'one' nor 'two'.
    '''
    if method == "one":
        pred = (data[v_list].sum(axis = 1) > 0).to_numpy()
    elif method == "two":
        pred = np.asarray(make_decision_data(data, v_list)) == 1
    else:
        raise ValueError("method must be 'one' or 'two', got %r" % (method,))

    is_pos = (data['outcome'] == 1).to_numpy()
    is_neg = (data['outcome'] == 0).to_numpy()

    TP = int((pred & is_pos).sum())
    FN = int((~pred & is_pos).sum())
    TN = int((~pred & is_neg).sum())
    FP = int((pred & is_neg).sum())

    # Guard the ratios so a split lacking one class does not raise.
    sensitivity = TP/(FN+TP) if (FN+TP) else float('nan')
    specificity = TN/(FP+TN) if (FP+TN) else float('nan')

    return [sensitivity, specificity]
    

In [None]:
data = avdf_ems_train
v_list = list(data.columns)
# The label must not be offered as a candidate: it splits itself perfectly
# (score 0) and would always be selected as the "best" predictor.
v_list.remove('outcome')
result = find_best_two(data,v_list)
result[0]

In [None]:
# Rank every predictor on the full training split with the two-stage search:
# each pass picks one rule, then continues on the rows that rule did not flag.
# NOTE(review): the ranking is stored as `variable_rank_ems` although the input
# is `avdf_all_train`, and a later cell rebinds the same name from
# `avdf_ems_train` -- confirm the intended name.
data = avdf_all_train
v_list = list(data.columns)
v_list.remove('outcome')
variable_rank_ems = []
while v_list:
    result = find_best_two(data, v_list)
    variable_rank_ems.append(result[0])
    v_list, data = result[1], result[2]

In [None]:
variable_rank_ems[0:8]

In [None]:
data = avdf_all_train
evaluate_vlist(data,variable_rank_ems[0:8], method = 'two')

In [None]:
# Sensitivity / specificity trade-off as rules are added in ranked order.
v_list = variable_rank_ems
data = avdf_all_train
l = len(v_list)
# 0 .. l rules inclusive, so the complete rule list is evaluated as well
# (range(l) stopped one rule short of it).
ind = range(l + 1)
sens = []
spec = []
for i in ind:
    r = evaluate_vlist(data,v_list[0:i], method = 'two')
    sens.append(r[0])
    spec.append(r[1])

d = {'num': list(ind), 'sensitivity': sens, 'specificity': spec}
evaluation = pd.DataFrame(data = d)
fig, ax = plt.subplots(figsize=(6,4))
# estimator=None draws every point; the default would average the
# specificities of rule sets that share the same sensitivity.
sns.lineplot(x='sensitivity', y= 'specificity', data = evaluation, marker='d',
             estimator=None, ax=ax)
ax.set(xlabel='sensitivity', ylabel='specificity',
       title='Rules added in ranked order (training split)');

In [None]:
evaluate_vlist(avdf_all_train, Leonard)

In [None]:
variable_rank_ems

In [None]:
# Greedy single-variable ranking on the 'ems' training split: pick the best
# variable, drop it from the candidates, continue on the rows where it is 0.
data = avdf_ems_train
v_list = list(data.columns)
v_list.remove('outcome')
variable_rank_ems = []
while v_list:
    result = find_best(data, v_list)
    variable_rank_ems.append(result[0])
    v_list, data = result[1], result[2]

In [None]:
# Greedy single-variable ranking on the 'moi' training split: pick the best
# variable, drop it from the candidates, continue on the rows where it is 0.
data = avdf_moi_train
v_list = list(data.columns)
v_list.remove('outcome')
variable_rank_moi = []
while v_list:
    result = find_best(data, v_list)
    variable_rank_moi.append(result[0])
    v_list, data = result[1], result[2]

In [None]:
variable_rank_ems[0:8] 

In [None]:
variable_rank_moi[0:8] 

In [None]:
Leonard

In [None]:
evaluate_vlist(avdf_ems_train, Leonard)

In [None]:
evaluate_vlist(avdf_ran_train, variable_rank_moi[0:8])

In [None]:
evaluate_vlist(avdf_ran_tuning, Leonard)

In [None]:
evaluate_vlist(avdf_ran_tuning, variable_rank_moi[0:8])

In [None]:
# Greedy ranking on the 'ran' training split using the 'semi_gini' score
# (1 - outcome rate among rows where the variable equals 1).
data = avdf_ran_train
v_list = list(data.columns)
v_list.remove('outcome')
variable_rank = []
while v_list:
    result = find_best(data, v_list, method='semi_gini')
    variable_rank.append(result[0])
    v_list, data = result[1], result[2]

In [None]:
print(evaluate_vlist(avdf_ran_train, Leonard))
print(evaluate_vlist(avdf_ran_train, variable_rank[0:11]))

In [None]:
print(evaluate_vlist(avdf_ran_tuning, Leonard))
print(evaluate_vlist(avdf_ran_tuning, variable_rank[0:11]))

In [None]:
variable_rank[0:11]

In [None]:
Leonard