Fit interpretable models to the training set and test on validation sets.

In [1]:
#%matplotlib inline
#%load_ext autoreload
#%autoreload 2

import os
import pickle as pkl
from os.path import join as oj
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier, plot_tree

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

import imodels
from rulevetting.api import validation_new
from rulevetting.projects.csi_pecarn.dataset_sh_1206 import Dataset

# Local directory created up front (no error if it already exists).
# NOTE(review): presumably where fitted models are written — confirm against later cells.
MODELS_DIR = './models'
os.makedirs(MODELS_DIR, exist_ok=True)

# Name of the label column: it is dropped from the feature frames and read
# as the target vector when the splits are separated into X / y below.
outcome_def = 'outcome'  # output

from rulevetting.projects.csi_pecarn.model_helper import var_selection, predict_and_save, model_valid, fit_simple_models, fit_other_models, plot_metrics, print_metrics 



In [2]:
# Load the three data splits (train / tune / test) and report the
# dimensions of the training frame as a quick sanity check.
splits = Dataset().get_data(load_csvs=True)
df_train, df_tune, df_test = splits
print(df_train.shape)

(1988, 45)


In [3]:
# For every split, pair the feature frame (all columns except the outcome)
# with the label vector (the outcome column's underlying `.values`).
X_train, y_train = df_train.drop(columns=outcome_def), df_train[outcome_def].values
X_tune, y_tune = df_tune.drop(columns=outcome_def), df_tune[outcome_def].values
X_test, y_test = df_test.drop(columns=outcome_def), df_test[outcome_def].values

## Check the baseline model (predict positive when any of the original 8 variables is non-zero)

In [4]:
def check_baseline_model(X, y,
                         original_8_vars=('AlteredMentalStatus', 'FocalNeuroFindings', 'PainNeck',
                                          'SubInj_TorsoTrunk', 'Torticollis', 'Predisposed',
                                          'HighriskDiving', 'HighriskMVC')
                         ):
    """Score the "any variable flagged" baseline rule against the true labels.

    A row is predicted positive ("1") when the row-wise sum of its
    ``original_8_vars`` columns is non-zero, and negative ("0") otherwise.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table containing every column named in ``original_8_vars``.
    y : array-like
        True labels, one per row of ``X``. Labels are compared after
        conversion to strings: "1" is the positive class, "0" the negative.
    original_8_vars : sequence of str
        Columns the rule looks at. The default is an immutable tuple so it
        cannot be mutated between calls; lists are accepted as well.

    Returns
    -------
    sens : float
        True positives / actual positives; ``nan`` if there are no positives.
    spec : float
        True negatives / actual negatives; ``nan`` if there are no negatives.
    n : int
        Number of rows evaluated.
    counts : pandas.Series
        Row counts grouped by ``(real, pred)`` with string labels. Only the
        combinations that actually occur are present.
    """
    # A tuple passed to DataFrame.__getitem__ would be read as a single key,
    # so always select with a list.
    data = X[list(original_8_vars)]
    n = data.shape[0]

    # Build a two-column table of string labels: truth vs. rule prediction.
    check = pd.DataFrame()
    check["real"] = np.asarray(y).astype(str)
    check["pred"] = np.array((data.sum(axis=1) != 0).astype(int).astype(str))

    counts = check.groupby(["real", "pred"]).size()

    # Count with boolean masks rather than indexing into `counts`: a missing
    # (real, pred) cell or an absent class then yields 0 instead of KeyError.
    is_csi = check["real"] == "1"
    is_non_csi = check["real"] == "0"
    csi = int(is_csi.sum())
    non_csi = int(is_non_csi.sum())
    true_pos = int((is_csi & (check["pred"] == "1")).sum())
    true_neg = int((is_non_csi & (check["pred"] == "0")).sum())

    # The rates are undefined when the corresponding class is absent.
    sens = true_pos / csi if csi else float("nan")
    spec = true_neg / non_csi if non_csi else float("nan")

    return sens, spec, n, counts

In [5]:
# One empty metrics dict per data split, filled in by the evaluation loop.
baseline_result = {split: {} for split in ('train', 'tune', 'test')}

In [6]:
# Run the baseline rule on each split and store its metrics under the
# split's name in `baseline_result`.
for x, y, suffix in ((X_train, y_train, 'train'),
                     (X_tune, y_tune, 'tune'),
                     (X_test, y_test, 'test')):
    sens, spec, n, counts = check_baseline_model(x, y)
    baseline_result[suffix].update({
        "sensitivity": sens,
        "specificity": spec,
        "total patients": n,
        "counts": counts,
    })

In [7]:
# Print one summary line (sensitivity and specificity) per split.
for suffix in ('train', 'tune', 'test'):
    split_metrics = baseline_result[suffix]
    print(suffix, "/// sens: ", split_metrics["sensitivity"], "/ spec:", split_metrics["specificity"])

train /// sens:  0.9029126213592233 / spec: 0.42167957117331745
tune /// sens:  0.9439252336448598 / spec: 0.3902877697841727
test /// sens:  0.9435483870967742 / spec: 0.3877551020408163
