In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to project source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# IMPORTANT: point this at YOUR checkout of the rule-vetting GitHub repo.
# Either export the RULE_VETTING_PATH environment variable before starting the
# kernel, or edit the fallback path below (the original author's location).
import os

repo_path = os.environ.get(
    'RULE_VETTING_PATH',
    '/Users/zhouxin/study/stat215/lab5_git/group/rule-vetting/',
)

# Make the checkout importable so the `rulevetting` imports below resolve.
sys.path.insert(1, repo_path)

import rulevetting.api.viz as viz
from rulevetting.projects.tbi_pecarn.dataset import Dataset

outcome_def = 'outcome'  # output
%matplotlib inline
%load_ext autoreload
%autoreload 2



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load cleaned data

In [57]:
# Read the cleaned dataset CSV that lives in the project's notebooks folder;
# the first CSV column is consumed as the initial index.
tbi_df = pd.read_csv(
    repo_path + '/rulevetting/projects/tbi_pecarn/notebooks/clean_dataset_11_24.csv',
    index_col=0,
)
# Re-key the rows by PatNum (the PatNum column itself stays in the frame).
tbi_df.index = tbi_df['PatNum'].copy()

In [101]:
# Partition the cohort at two years of age. Each subset is selected by its own
# comparison, so a row whose AgeinYears is missing satisfies neither one.
tbi_df_young = tbi_df.loc[tbi_df['AgeinYears'].lt(2)]
tbi_df_old = tbi_df.loc[tbi_df['AgeinYears'].ge(2)]

# Baseline model for age < 2

In [73]:
o = 'PosIntFinal'  # name of the outcome column
df_features = tbi_df_young.copy()
# Working copy: the step cells below remove rows from it as rules match.
df = df_features.copy()
# One slot per row of the cohort (indexed like df_features); NaN until a rule assigns a value.
predicted_probabilities = pd.Series(index=df_features.index, dtype='float64')

In [74]:
# Outcome rate over the whole cohort before any rule is applied, in percent.
100 * (df[o].sum() / len(df))

0.7000000000000001

In [75]:
# step 1: split off the rows where AMS == 1
query = 'AMS == 1'
df_rhs = df.query(query)
idxs_satisfying_rule = df_rhs.index
# sum of the outcome column over the matched rows, as a percentage of their count
prob = 100 * (df_rhs[o].sum() / len(df_rhs))
predicted_probabilities.loc[idxs_satisfying_rule] = prob
# matched rows leave the pool, so the next step only sees the remainder
df.drop(idxs_satisfying_rule, axis='index', inplace=True)
print(prob)

3.634361233480176


In [86]:
# step 2: split off the rows whose HemaLoc is 2 or 3
query = 'HemaLoc == [2, 3]'
df_rhs = df.query(query)
idxs_satisfying_rule = df_rhs.index
# sum of the outcome column over the matched rows, as a percentage of their count
prob = 100 * (df_rhs[o].sum() / len(df_rhs))
predicted_probabilities.loc[idxs_satisfying_rule] = prob
# matched rows leave the pool, so the next step only sees the remainder
df.drop(idxs_satisfying_rule, axis='index', inplace=True)
print(prob)

1.5202702702702704


In [88]:
# step 3: split off the rows whose LocLen is 2, 3 or 4
query = 'LocLen == [2, 3, 4]'
df_rhs = df.query(query)
idxs_satisfying_rule = df_rhs.index
# sum of the outcome column over the matched rows, as a percentage of their count
prob = 100 * (df_rhs[o].sum() / len(df_rhs))
predicted_probabilities.loc[idxs_satisfying_rule] = prob
# matched rows leave the pool, so the next step only sees the remainder
df.drop(idxs_satisfying_rule, axis='index', inplace=True)
print(prob)

1.4084507042253522


In [90]:
# step 4: split off the rows where High_impact_InjSev == 3
query = 'High_impact_InjSev == 3'
df_rhs = df.query(query)
idxs_satisfying_rule = df_rhs.index
# sum of the outcome column over the matched rows, as a percentage of their count
prob = 100 * (df_rhs[o].sum() / len(df_rhs))
predicted_probabilities.loc[idxs_satisfying_rule] = prob
# matched rows leave the pool, so the next step only sees the remainder
df.drop(idxs_satisfying_rule, axis='index', inplace=True)
print(prob)

0.3105590062111801


In [92]:
# step 5: split off the rows where SFxPalp == 1
query = 'SFxPalp == 1'
df_rhs = df.query(query)
idxs_satisfying_rule = df_rhs.index
# sum of the outcome column over the matched rows, as a percentage of their count
prob = 100 * (df_rhs[o].sum() / len(df_rhs))
predicted_probabilities.loc[idxs_satisfying_rule] = prob
# matched rows leave the pool, so the next step only sees the remainder
df.drop(idxs_satisfying_rule, axis='index', inplace=True)
print(prob)

3.3707865168539324


In [95]:
# step 6: split off the rows where ActNorm == 0
query = 'ActNorm == 0'
df_rhs = df.query(query)
idxs_satisfying_rule = df_rhs.index
# sum of the outcome column over the matched rows, as a percentage of their count
prob = 100 * (df_rhs[o].sum() / len(df_rhs))
predicted_probabilities.loc[idxs_satisfying_rule] = prob
# matched rows leave the pool, so the next step only sees the remainder
df.drop(idxs_satisfying_rule, axis='index', inplace=True)
print(prob)

0.2762430939226519


In [96]:
# step 7: catch-all for whatever is left in the pool
query = 'GCSTotal >= 0' # always true
df_rhs = df.query(query)
idxs_satisfying_rule = df_rhs.index
# sum of the outcome column over the matched rows, as a percentage of their count
prob = 100 * (df_rhs[o].sum() / len(df_rhs))
predicted_probabilities.loc[idxs_satisfying_rule] = prob
# matched rows leave the pool as in the earlier steps
df.drop(idxs_satisfying_rule, axis='index', inplace=True)
print(prob)

0.020177562550443905


In [98]:
# Ordered rule list for the age < 2 cohort. Each entry pairs a pandas query
# string with a hard-coded rate in percent (the rounded values printed by the
# step cells above).
self_rules = [
    ('AMS == 1', 3.6),
    ('HemaLoc == [2, 3]', 1.5),
    ('LocLen == [2, 3, 4]', 1.4),
    ('High_impact_InjSev == 3', 0.31),
    ('SFxPalp == 1', 3.4),
    ('ActNorm == 0', 0.28),
    # catch-all: the last condition is simply one that is always true
    ('GCSTotal >= 0', 0.02),
]

In [100]:
# Replay the rule list in `self_rules` on the age < 2 cohort and print, for each
# rule except the final catch-all, how many rows it captured and how many
# remain afterwards.
df_features = tbi_df_young.copy()
o = 'PosIntFinal'  # outcome column
df = df_features.copy()  # pool of rows not yet captured by a rule; shrinks every iteration
predicted_probabilities = pd.Series(index=df_features.index, dtype=float)

# The outcome sums are floats, so with a plain ':>3' they rendered as e.g.
# '33.0' vs '3.0' and the columns drifted apart. ':.0f' prints them as whole
# numbers and keeps the columns aligned.
str_print = f'{df[o].sum():.0f} / {df.shape[0]} (positive class / total)\n\t\u2193 \n'
for j, rule in enumerate(self_rules):
    query, prob = rule
    df_rhs = df.query(query)
    idxs_satisfying_rule = df_rhs.index
    # Store the hard-coded rate (percent) from self_rules, not the recomputed one.
    predicted_probabilities.loc[idxs_satisfying_rule] = prob

    df.drop(index=idxs_satisfying_rule, inplace=True)
    n_matched = df_rhs.shape[0]
    n_positive = df_rhs[o].sum()
    # A rule that matches no rows would otherwise evaluate 0 / 0.
    computed_prob = 100 * n_positive / n_matched if n_matched else float('nan')
    # Display shorthand: drop the ' == 1' suffix so such rules read as 'If AMS'.
    query_print = query.replace(' == 1', '')
    if j < len(self_rules) - 1:  # the final catch-all rule is applied but not printed
        str_print += (
            f'\033[96mIf {query_print:<35}\033[00m \u2192 '
            f'{n_positive:>3.0f} / {n_matched:>4} ({computed_prob:0.1f}%)\n'
            '\t\u2193 \n'
            f'   {df[o].sum():>3.0f} / {df.shape[0]:>5}\t \n'
        )
# Keep only the raw values (a NumPy array) from here on.
predicted_probabilities = predicted_probabilities.values
self_str_print = str_print
print(self_str_print)

63.0 / 9000 (positive class / total)
	↓ 
[96mIf AMS                                [00m → 33.0 /  908 (3.6%)
	↓ 
   30.0 /  8092	 
[96mIf HemaLoc == [2, 3]                  [00m → 18.0 / 1184 (1.5%)
	↓ 
   12.0 /  6908	 
[96mIf LocLen == [2, 3, 4]                [00m → 3.0 /  213 (1.4%)
	↓ 
   9.0 /  6695	 
[96mIf High_impact_InjSev == 3            [00m → 4.0 / 1288 (0.3%)
	↓ 
   5.0 /  5407	 
[96mIf SFxPalp                            [00m → 3.0 /   89 (3.4%)
	↓ 
   2.0 /  5318	 
[96mIf ActNorm == 0                       [00m → 1.0 /  362 (0.3%)
	↓ 
   1.0 /  4956	 



# Baseline model for age >= 2

In [116]:
o = 'PosIntFinal'  # name of the outcome column
df_features = tbi_df_old.copy()
# Working copy: the step cells below remove rows from it as rules match.
df = df_features.copy()
# One slot per row of the cohort (indexed like df_features); NaN until a rule assigns a value.
predicted_probabilities = pd.Series(index=df_features.index, dtype='float64')

In [117]:
# Outcome rate over the whole cohort before any rule is applied, in percent.
100 * (df[o].sum() / len(df))

0.4744224422442244

In [118]:
# step 1: split off the rows where AMS == 1
query = 'AMS == 1'
df_rhs = df.query(query)
idxs_satisfying_rule = df_rhs.index
# sum of the outcome column over the matched rows, as a percentage of their count
prob = 100 * (df_rhs[o].sum() / len(df_rhs))
predicted_probabilities.loc[idxs_satisfying_rule] = prob
# matched rows leave the pool, so the next step only sees the remainder
df.drop(idxs_satisfying_rule, axis='index', inplace=True)
print(prob)

2.570480928689884


In [119]:
# step 2: split off the rows whose LOCSeparate is 1 or 2
query = 'LOCSeparate == [1, 2]'
df_rhs = df.query(query)
idxs_satisfying_rule = df_rhs.index
# sum of the outcome column over the matched rows, as a percentage of their count
prob = 100 * (df_rhs[o].sum() / len(df_rhs))
predicted_probabilities.loc[idxs_satisfying_rule] = prob
# matched rows leave the pool, so the next step only sees the remainder
df.drop(idxs_satisfying_rule, axis='index', inplace=True)
print(prob)

0.8200258955545966


In [120]:
# step 3: split off the rows where Vomit == 1
query = 'Vomit == 1'
df_rhs = df.query(query)
idxs_satisfying_rule = df_rhs.index
# sum of the outcome column over the matched rows, as a percentage of their count
prob = 100 * (df_rhs[o].sum() / len(df_rhs))
predicted_probabilities.loc[idxs_satisfying_rule] = prob
# matched rows leave the pool, so the next step only sees the remainder
df.drop(idxs_satisfying_rule, axis='index', inplace=True)
print(prob)

0.87527352297593


In [121]:
# step 4: split off the rows where High_impact_InjSev == 3
query = 'High_impact_InjSev == 3'
df_rhs = df.query(query)
idxs_satisfying_rule = df_rhs.index
# sum of the outcome column over the matched rows, as a percentage of their count
prob = 100 * (df_rhs[o].sum() / len(df_rhs))
predicted_probabilities.loc[idxs_satisfying_rule] = prob
# matched rows leave the pool, so the next step only sees the remainder
df.drop(idxs_satisfying_rule, axis='index', inplace=True)
print(prob)

0.20811654526534862


In [122]:
# step 5: split off the rows where SFxBas == 1
query = 'SFxBas == 1'
df_rhs = df.query(query)
idxs_satisfying_rule = df_rhs.index
# sum of the outcome column over the matched rows, as a percentage of their count
prob = 100 * (df_rhs[o].sum() / len(df_rhs))
predicted_probabilities.loc[idxs_satisfying_rule] = prob
# matched rows leave the pool, so the next step only sees the remainder
df.drop(idxs_satisfying_rule, axis='index', inplace=True)
print(prob)

5.714285714285714


In [123]:
# step 6: split off the rows where HASeverity == 3
query = 'HASeverity == 3'
df_rhs = df.query(query)
idxs_satisfying_rule = df_rhs.index
# sum of the outcome column over the matched rows, as a percentage of their count
prob = 100 * (df_rhs[o].sum() / len(df_rhs))
predicted_probabilities.loc[idxs_satisfying_rule] = prob
# matched rows leave the pool, so the next step only sees the remainder
df.drop(idxs_satisfying_rule, axis='index', inplace=True)
print(prob)

1.0471204188481675


In [124]:
# step 7: catch-all for whatever is left in the pool
query = 'GCSTotal >= 0' # always true
df_rhs = df.query(query)
idxs_satisfying_rule = df_rhs.index
# sum of the outcome column over the matched rows, as a percentage of their count
prob = 100 * (df_rhs[o].sum() / len(df_rhs))
predicted_probabilities.loc[idxs_satisfying_rule] = prob
# matched rows leave the pool as in the earlier steps
df.drop(idxs_satisfying_rule, axis='index', inplace=True)
print(prob)

0.05161290322580645


In [125]:
# Ordered rule list for the age >= 2 cohort. Each entry pairs a pandas query
# string with a hard-coded rate in percent (the rounded values printed by the
# step cells above).
self_rules = [
    ('AMS == 1', 2.6),
    ('LOCSeparate == [1, 2]', 0.82),
    ('Vomit == 1', 0.88),
    ('High_impact_InjSev == 3', 0.21),
    ('SFxBas == 1', 5.7),
    ('HASeverity == 3', 1.0),
    # catch-all: the last condition is simply one that is always true
    ('GCSTotal >= 0', 0.05),
]

In [126]:
# Replay the rule list in `self_rules` on the age >= 2 cohort and print, for
# each rule except the final catch-all, how many rows it captured and how many
# remain afterwards.
df_features = tbi_df_old.copy()
o = 'PosIntFinal'  # outcome column
df = df_features.copy()  # pool of rows not yet captured by a rule; shrinks every iteration
predicted_probabilities = pd.Series(index=df_features.index, dtype=float)

# The outcome sums are floats, so with a plain ':>3' they rendered as e.g.
# '62.0' vs '4.0' and the columns drifted apart. ':.0f' prints them as whole
# numbers and keeps the columns aligned.
str_print = f'{df[o].sum():.0f} / {df.shape[0]} (positive class / total)\n\t\u2193 \n'
for j, rule in enumerate(self_rules):
    query, prob = rule
    df_rhs = df.query(query)
    idxs_satisfying_rule = df_rhs.index
    # Store the hard-coded rate (percent) from self_rules, not the recomputed one.
    predicted_probabilities.loc[idxs_satisfying_rule] = prob

    df.drop(index=idxs_satisfying_rule, inplace=True)
    n_matched = df_rhs.shape[0]
    n_positive = df_rhs[o].sum()
    # A rule that matches no rows would otherwise evaluate 0 / 0.
    computed_prob = 100 * n_positive / n_matched if n_matched else float('nan')
    # Display shorthand: drop the ' == 1' suffix so such rules read as 'If AMS'.
    query_print = query.replace(' == 1', '')
    if j < len(self_rules) - 1:  # the final catch-all rule is applied but not printed
        str_print += (
            f'\033[96mIf {query_print:<35}\033[00m \u2192 '
            f'{n_positive:>3.0f} / {n_matched:>4} ({computed_prob:0.1f}%)\n'
            '\t\u2193 \n'
            f'   {df[o].sum():>3.0f} / {df.shape[0]:>5}\t \n'
        )
# Keep only the raw values (a NumPy array) from here on.
predicted_probabilities = predicted_probabilities.values
self_str_print = str_print
print(self_str_print)

115.0 / 24240 (positive class / total)
	↓ 
[96mIf AMS                                [00m → 62.0 / 2412 (2.6%)
	↓ 
   53.0 / 21828	 
[96mIf LOCSeparate == [1, 2]              [00m → 19.0 / 2317 (0.8%)
	↓ 
   34.0 / 19511	 
[96mIf Vomit                              [00m → 16.0 / 1828 (0.9%)
	↓ 
   18.0 / 17683	 
[96mIf High_impact_InjSev == 3            [00m → 4.0 / 1922 (0.2%)
	↓ 
   14.0 / 15761	 
[96mIf SFxBas                             [00m → 4.0 /   70 (5.7%)
	↓ 
   10.0 / 15691	 
[96mIf HASeverity == 3                    [00m → 2.0 /  191 (1.0%)
	↓ 
   8.0 / 15500	 



In [16]:
# Column labels of the current working frame, as a plain Python list.
df.columns.tolist()

['Unnamed: 0',
 'PatNum',
 'High_impact_InjSev',
 'Amnesia_verb',
 'LOCSeparate',
 'LocLen',
 'Seiz',
 'SeizLen',
 'ActNorm',
 'HA_verb',
 'HASeverity',
 'Vomit',
 'GCSEye',
 'GCSVerbal',
 'GCSMotor',
 'GCSTotal',
 'AMS',
 'AMSAgitated',
 'AMSSleep',
 'AMSSlow',
 'AMSRepeat',
 'AMSOth',
 'SFxPalp',
 'SFxPalpDepress',
 'FontBulg',
 'SFxBas',
 'SFxBasHem',
 'SFxBasOto',
 'SFxBasPer',
 'SFxBasRet',
 'SFxBasRhi',
 'Hema',
 'HemaLoc',
 'HemaSize',
 'Clav',
 'ClavFace',
 'ClavNeck',
 'ClavFro',
 'ClavOcc',
 'ClavPar',
 'ClavTem',
 'NeuroD',
 'NeuroDMotor',
 'NeuroDSensory',
 'NeuroDCranial',
 'NeuroDReflex',
 'NeuroDOth',
 'OSI',
 'OSIExtremity',
 'OSICut',
 'OSICspine',
 'OSIFlank',
 'OSIAbdomen',
 'OSIPelvis',
 'OSIOth',
 'AgeinYears',
 'Gender',
 'Race',
 'DeathTBI',
 'HospHead',
 'Intub24Head',
 'Neurosurgery',
 'PosIntFinal',
 'PosIntFinalNoHosp']