In [21]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sys 
import os

# this line walks up file directory so rule-vetting is cwd
# Currently this breaks if chunk is run multiple times
os.chdir(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))))
print(os.getcwd())

import rulevetting.api.viz as viz
from rulevetting.projects.csi_pecarn.dataset import Dataset
from rulevetting.projects.csi_pecarn.tree_functions import *
from rulevetting.projects.csi_pecarn.baseline import Baseline
from rulevetting.projects.csi_pecarn.model_best import SpecialTree

from sklearn import tree
from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import RocCurveDisplay

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/


In [22]:
def classify_experiment_results(experiment_df):
    # compute confusion matrix entries
    TN = experiment_df[ (experiment_df['pred'] == 0) & (experiment_df['csi_injury'] == 0)].shape[0]
    TP = experiment_df[ (experiment_df['pred'] == 1) & (experiment_df['csi_injury'] == 1)].shape[0]
    FN = experiment_df[ (experiment_df['pred'] == 0) & (experiment_df['csi_injury'] == 1)].shape[0]
    FP = experiment_df[ (experiment_df['pred'] == 1) & (experiment_df['csi_injury'] == 0)].shape[0]
    sensitivity = TP/(FN+TP)
    specificity = TN/(FP+TN)    
        
    return (sensitivity, specificity)

In [25]:
def run_experiment(data_train,data_test, description=''):

    results_dict = {}

    #####
    # SpecialTree on training+tuning

    pred_train = SpecialTree(data_train).predict()
    evaluate = pd.DataFrame(data = {'csi_injury': data_train['csi_injury'], 
                                    'pred': pred_train})
    st_results = classify_experiment_results(evaluate)
    results_dict['ST_train'] = (st_results[0],st_results[1])
    
    #####
    # Baseline on training+tuning

    pred_train = Baseline(data_train).predict()
    evaluate = pd.DataFrame(data = {'csi_injury': data_train['csi_injury'], 
                                    'pred': pred_train})
    bt_results = classify_experiment_results(evaluate)
    results_dict['B_train'] = (bt_results[0],bt_results[1])
    #####
    # SpecialTree on on unseen testing

    pred_test = SpecialTree(data_test).predict()
    evaluate = pd.DataFrame(data = {'csi_injury': data_test['csi_injury'], 
                                    'pred': pred_test})
    stt_results = classify_experiment_results(evaluate)
    results_dict['ST_test'] = (stt_results[0],stt_results[1])
    #####
    # Baseline on unseen testing

    pred_test = Baseline(data_test).predict()
    evaluate = pd.DataFrame(data = {'csi_injury': data_test['csi_injury'], 
                                    'pred': pred_test})
    btt_results = classify_experiment_results(evaluate)
    results_dict['B_test'] = (btt_results[0],btt_results[1])
    
    return results_dict

In [26]:
kappa_dictionary = Dataset().get_data(control_types=['ran'],run_perturbations=True)

100%|██████████| 12/12 [00:00<00:00, 65.67it/s]

read all the csvs...
 ['analysisvariables.csv', 'clinicalpresentationfield.csv', 'clinicalpresentationoutside.csv', 'clinicalpresentationsite.csv', 'demographics.csv', 'injuryclassification.csv', 'injurymechanism.csv', 'kappa.csv', 'medicalhistory.csv', 'radiologyoutside.csv', 'radiologyreview.csv', 'radiologysite.csv']



100%|██████████| 12/12 [00:00<00:00, 61.59it/s]

119 Raw Covariates Selected
read all the csvs...
 ['analysisvariables.csv', 'clinicalpresentationfield.csv', 'clinicalpresentationoutside.csv', 'clinicalpresentationsite.csv', 'demographics.csv', 'injuryclassification.csv', 'injurymechanism.csv', 'kappa.csv', 'medicalhistory.csv', 'radiologyoutside.csv', 'radiologyreview.csv', 'radiologysite.csv']





Index(['PtAmbulatoryPriorArrival', 'GCSEye', 'VerbalGCS', 'MotorGCS',
       'TotalGCS', 'AVPUDetails', 'PtCompPainHead', 'PtCompPainFace',
       'PtCompPainNeck', 'PtCompPainNeckMove', 'PtCompPainChest',
       'PtTenderHead', 'PtTenderFace', 'PtTenderNeck', 'PassRestraint',
       'Assault', 'ChildAbuse', 'FallDownStairs'],
      dtype='object')
119 Raw Covariates Selected


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  else:


split_data kwargs {'control_types': ['ran']}
['ran']
split_data kwargs {'control_types': ['ran']}
['ran']


In [4]:
kappa_units = [110002, 110015, 110017, 120042, 120043, 120049, 130056, 130059, 130064, 130065, 130078, 140109, 140112, 140131, 140142, 140145, 210160, 210165, 220175, 220176, 220179, 220186, 220193, 230206, 230218, 230222, 230225, 240241, 240246, 240250, 240252, 310274, 310280, 310282, 310290, 310295, 320330, 320334, 320338, 320339, 330342, 330350, 330351, 330371, 330379, 330386, 340398, 340399, 340400, 340408, 340414, 340420, 340422, 340437, 340459, 410464, 410474, 410483, 410488, 420495, 420500, 420503, 420511, 420516, 430546, 430553, 430566, 430567, 430587, 440590, 440602, 440612, 440620, 440636, 510649, 510650, 520656, 520663, 530675, 530678, 540686, 540696, 540701, 610707, 610716, 610724, 620739, 620741, 620756, 620757, 630766, 630771, 630787, 630803, 630806, 640807, 640809, 640839, 640847, 710853, 710862, 710871, 710876, 710879, 720903, 720908, 720934, 720939, 720940, 720944, 730966, 730974, 730977, 730981, 730984, 730988, 730990, 731027, 731028, 741040, 741044, 741054, 741061, 741064, 741066, 741068, 741086, 741088, 811121, 811122, 811137, 811140, 811154, 821178, 821181, 821192, 821194, 821207, 821216, 821229, 831251, 831252, 831253, 831256, 831261, 831263, 831265, 831278, 831295, 831312, 831345, 831353, 841370, 841375, 841398, 841442, 841443, 841444, 841445, 841446, 841459, 841460, 841463, 911477, 911484, 911489, 921498, 921500, 921505, 931522, 931546, 931548, 931552, 931560, 941572, 941576, 941580, 941586, 941588, 1011610, 1011613, 1011623, 1011624, 1021649, 1021650, 1021658, 1021669, 1021676, 1021681, 1031695, 1031719, 1031720, 1031734, 1031739, 1031745, 1031752, 1031756, 1041765, 1041767, 1041782, 1041803, 1041807, 1041809, 1041823, 1041831, 1111852, 1111853, 1111854, 1111860, 1121876, 1121885, 1121888, 1121896, 1121916, 1121929, 1121932, 1131955, 1131958, 1131960, 1131971, 1131980, 1131984, 1131985, 1142008, 1142018, 1142041, 1142045, 1142048, 1142054, 1142062, 1142063, 1212083, 1212085, 1222104, 1222105, 1232122, 1232133, 1232134, 1242151, 1242163, 1242165, 1312171, 1312175, 1312185, 1312201, 1322210, 1322221, 1322229, 1322232, 1322242, 1332262, 1332267, 1332272, 1332274, 1332283, 1332291, 1342318, 1342319, 1342345, 1342350, 1342355, 1342363, 1412364, 1412372, 1412383, 1422391, 1422399, 1432404, 1432420, 1432426, 1432429, 1432435, 1442449, 1442457, 1442458, 1442471, 1442476, 1512486, 1512489, 1512508, 1512524, 1512528, 1522547, 1522557, 1522560, 1522570, 1522573, 1522582, 1532597, 1532606, 1532607, 1532615, 1532618, 1532621, 1532623, 1532644, 1532676, 1532678, 1542690, 1542692, 1542700, 1542702, 1542736, 1542749, 1542751, 1542755, 1542756, 1612781, 1612794, 1612799, 1612803, 1612809, 1612820, 1622826, 1622852, 1622856, 1622858, 1622870, 1622880, 1622881, 1632894, 1632913, 1632914, 1632923, 1632924, 1632933, 1632952, 1632954, 1632963, 1632968, 1632981, 1643006, 1643013, 1643020, 1643024, 1643046, 1643055, 1643076, 1643091, 1643098, 1713099, 1713101, 1713119, 1713126, 1723139, 1723148, 1723151, 1723161, 1723162, 1723166, 1733188, 1733198, 1733208, 1733213, 1733221, 1733230, 1733238, 1743255, 1743266, 1743268, 1743281, 1743290, 1743292, 1743294, 1743308]

In [5]:
# rename keys by kappa status
for i, key in enumerate(kappa_dictionary.keys()):
    kappa_dictionary['Kappa_'+str(i)] = kappa_dictionary.pop(key)

In [6]:
kappa_dictionary.keys()

dict_keys(['Kappa_0', 'Kappa_1', 'Kappa_2'])

In [7]:
no_kappa = kappa_dictionary['Kappa_1']
kappa = kappa_dictionary['Kappa_1']

In [20]:
kappa_units_test = len(set(no_kappa[2].reset_index()['id']).intersection(set(kappa_units)))
kappa_units_train = len(kappa_units) - kappa_units_test
# fraction in test/train reabstracted
print(kappa_units_train,kappa_units_test)
print((kappa_units_train)/(no_kappa[0].shape[0]+no_kappa[1].shape[0]),kappa_units_test/no_kappa[2].shape[0])

333 32
0.30272727272727273 0.1103448275862069


In [10]:
no_kappa_train = pd.concat([no_kappa[0],no_kappa[1]])
no_kappa_test = pd.concat([no_kappa[0],no_kappa[1]])

kappa_train = pd.concat([kappa[0],kappa[1]])
kappa_test = pd.concat([kappa[2]])

In [17]:
no_kappa_results = run_experiment(no_kappa_train,no_kappa_test)
kappa_result = run_experiment(kappa_train,kappa_test)

In [18]:
no_kappa_results

{'ST_train': (0.9683377308707124, 0.36893203883495146),
 'B_train': (0.920844327176781, 0.40221914008321774),
 'ST_test': (0.9683377308707124, 0.36893203883495146),
 'B_test': (0.920844327176781, 0.40221914008321774)}

In [19]:
kappa_result

{'ST_train': (0.9683377308707124, 0.36893203883495146),
 'B_train': (0.920844327176781, 0.40221914008321774),
 'ST_test': (0.912621359223301, 0.3422459893048128),
 'B_test': (0.9611650485436893, 0.42245989304812837)}