# Capstone Project

## Modeling Notebook - `PREPOST_dataset_COMBO` from 11/14/19

#### Importing Libraries

In [1]:
%matplotlib inline

# general libraries
import re
import string
import sys
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing date libraries
import datetime as dt
import dateutil.parser as dparser

# scikit-learn libraries for preprocessing
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer

# scikit-learn libraries for constructing pipelines
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.base import clone

# scikit-learn libraries for clustering and dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.mixture import GaussianMixture

# scikit-learn libraries for evaluation
from sklearn import metrics
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

# scikit-learn libraries for feature selection
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import RFECV

# scikit-learn libraries for learning
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV, cross_validate, cross_val_score
from sklearn.model_selection import validation_curve
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# saving models
import pickle
from sklearn.externals import joblib

# setting pandas display options
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 10000)
pd.set_option('display.max_colwidth', 100)
pd.set_option('precision', 5)
pd.options.mode.chained_assignment = None



#### Directory/File Structure

In [2]:
sys.version

'3.6.9 |Anaconda custom (x86_64)| (default, Jul 30 2019, 13:42:17) \n[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]'

In [3]:
print ('Running pandas version:', pd.__version__)
print ('Running numpy version:', np.__version__)
print ('Running sklearn version:', sklearn.__version__)

Running pandas version: 0.25.2
Running numpy version: 1.17.2
Running sklearn version: 0.21.3


In [4]:
os.getcwd()

'/Users/nate_velarde/Documents/UC_Berkeley/Courses/W210_Capstone/stroke_project/sandbox/notebooks'

In [5]:
os.chdir('../data')

In [6]:
sorted(os.listdir())

['.Capstone - STS risk factor list.xlsx.icloud',
 '.DS_Store',
 '.capstone_STS_risk_factor_features.xlsx.icloud',
 '.capstone_data-version-2.xlsx.icloud',
 '.capstone_data.xlsx.icloud',
 '.capstone_data_binarized_outcome.xlsx.icloud',
 '.capstone_data_filled_in_complication_data.xlsx.icloud',
 '273_vs_281_null_count_by_feature.csv',
 '273_vs_281_null_count_by_feature.xlsm',
 'Capstone - Complication list - complete.xlsx',
 'Capstone_Fall_Shannon_Sept2019_request.csv',
 'POSTOP_ALL_col_names_11_14.pkl',
 'POSTOP_ALL_col_names_11_9.pkl',
 'POSTOP_TREE_ALL_col_names_11_14.pkl',
 'POSTOP_TREE_ALL_col_names_11_9.pkl',
 'POSTOP_categorical_TREE_col_names_11_14.pkl',
 'POSTOP_categorical_TREE_col_names_11_9.pkl',
 'POSTOP_categorical_col_names_11_14.pkl',
 'POSTOP_categorical_col_names_11_9.pkl',
 'POSTOP_numerical_col_names_11_14.pkl',
 'POSTOP_numerical_col_names_11_9.pkl',
 'PREOP_ALL_col_names_11_14.pkl',
 'PREOP_ALL_col_names_11_9.pkl',
 'PREOP_TREE_ALL_col_names_11_14.pkl',
 'PREOP_TREE

## Preprocessing Unprocessed Dataset

### Loading Dataset

#### `PREPOST_dataset_COMBO_11_14`

In [27]:
data = pd.read_pickle('../data/PRE_plus_POST_dataset_cabg.pkl')

In [28]:
data.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_ANGINA_AT_REST,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_STRENUOUS_ACTIVITY,classnyh_ANY_ACTIVITY,classnyh_MARKED_LIMITATION,classnyh_SLIGHT_LIMITATION,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsufm_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,vdinsuft_TRIVIAL,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,status_EMERGENCY,status_SALVAGE,status_URGENT,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin,strokeBin2,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,ceroxused,circarr,cofirstind,concalc,cperfutil,IABP,ibldprod,imedeaca,imedtran,inoptee,mtopd,ocarasd,opocard,oponcard,unplao,unplvad,vsmvpr,asmtascaa,cotafib,cotarrst,ecmo,mt30stat,ocarvsd,opcab,opvalve,unplav,unplmv,vadproc,valexp2,vsavpr,vsmv,aortoccl_AoXC,aortoccl_Balloon,asmtaodx_Normal,asmtaodx_Thickening,asmtaodx_atheroma<5mm,asmtaodx_atheroma>5mm,asmtaodx_mobile_plaques,cathbasassistind_BP_instability,cathbasassistind_CPB_wean,cathbasassistind_PCI_failure,cathbasassistwhen_Intra_OP,cathbasassistwhen_Post_OP,cathbasassistwhen_Pre_OP,cpbutil_Combination,cpbutil_Full_CPB,cperftyp_Antegrade,cperftyp_Both,cperftyp_Retrograde,ecmoind_Cadiac_Failure,ecmoind_Rescue_Salvage,ecmoind_Resp_Failure,ecmowhen_Intra_OP,ecmowhen_Post_OP,ecmowhen_Pre_OP,emergrsn_Anatomy,emergrsn_Angio_Accident,emergrsn_Ao_Dissect,emergrsn_Evolvg_MI,emergrsn_Hybrid,emergrsn_Ischemia,emergrsn_Pulm_Edema,emergrsn_Shock_Circ_Supp,emergrsn_Shock_No_Circ_Supp,emergrsn_Syncope,emergrsn_Trauma,emergrsn_Valve_Dysf,iabpind_Angina,iabpind_CPB_Wean_Failure,iabpind_Hemodyn_Instab,iabpind_Procedure_Support,iabpind_Prophylactic,iabpwhen_Intra_OP,iabpwhen_Post_OP,iabpwhen_Pre_OP,mtcause_Cardiac,mtcause_Infection,mtcause_Neuro,mtcause_Pulmonary,mtcause_Renal,mtcause_Vascular,readmrsn_Stroke,readmrsn_TIA,unplproc_Yes_Complication,unplproc_Yes_Disease,urgntrsn_AMI,urgntrsn_Anatomy,urgntrsn_Angio_Accid,urgntrsn_Ao_Dissect,urgntrsn_CHF,urgntrsn_CP,urgntrsn_Hybrid,urgntrsn_IABP,urgntrsn_PCI_Fail,urgntrsn_Rest_Angina,urgntrsn_Syncope,urgntrsn_USA,urgntrsn_Valve_Dysfunctn
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0.017,2,0,1,0,0,,0.0,0.0,,,,,,29.0,32.0,70.0,1.1,,,40.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,83,162.60001,102.1,38.61754,29.0,1.2,3.3,6.2,8.6,60.0,36.0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1.0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,3,0.045,2,0,1,0,0,,0.0,0.0,,,,,,,,,1.4,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [29]:
data.shape

(28945, 234)

### Dropping Irrelevant Columns

- `recordId`
- `cnstrokp`
- `cnstrokttia`
- `cncomaenceph`
- `strokeBin` as we are going to use the more inclusive `strokeBin2` as our outcome variable

#### Relative to prior versions, we are also going to retain `predstro` that will be used to compare our models versus the `STS` Model

In [30]:
cols_to_drop = ['recordId',
                'cnstrokp',
                'cnstrokttia',
                'cncomaenceph',
                'strokeBin']

In [31]:
len(cols_to_drop)

5

- dropping columns

In [32]:
data = data.drop(cols_to_drop, axis=1)

In [33]:
data.shape

(28945, 229)

- resetting `DataFrame` `index` 

In [34]:
data = data.reset_index(drop=True)

In [35]:
data.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_ANGINA_AT_REST,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_STRENUOUS_ACTIVITY,classnyh_ANY_ACTIVITY,classnyh_MARKED_LIMITATION,classnyh_SLIGHT_LIMITATION,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsufm_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,vdinsuft_TRIVIAL,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,status_EMERGENCY,status_SALVAGE,status_URGENT,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%,predstro,strokeBin2,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,ceroxused,circarr,cofirstind,concalc,cperfutil,IABP,ibldprod,imedeaca,imedtran,inoptee,mtopd,ocarasd,opocard,oponcard,unplao,unplvad,vsmvpr,asmtascaa,cotafib,cotarrst,ecmo,mt30stat,ocarvsd,opcab,opvalve,unplav,unplmv,vadproc,valexp2,vsavpr,vsmv,aortoccl_AoXC,aortoccl_Balloon,asmtaodx_Normal,asmtaodx_Thickening,asmtaodx_atheroma<5mm,asmtaodx_atheroma>5mm,asmtaodx_mobile_plaques,cathbasassistind_BP_instability,cathbasassistind_CPB_wean,cathbasassistind_PCI_failure,cathbasassistwhen_Intra_OP,cathbasassistwhen_Post_OP,cathbasassistwhen_Pre_OP,cpbutil_Combination,cpbutil_Full_CPB,cperftyp_Antegrade,cperftyp_Both,cperftyp_Retrograde,ecmoind_Cadiac_Failure,ecmoind_Rescue_Salvage,ecmoind_Resp_Failure,ecmowhen_Intra_OP,ecmowhen_Post_OP,ecmowhen_Pre_OP,emergrsn_Anatomy,emergrsn_Angio_Accident,emergrsn_Ao_Dissect,emergrsn_Evolvg_MI,emergrsn_Hybrid,emergrsn_Ischemia,emergrsn_Pulm_Edema,emergrsn_Shock_Circ_Supp,emergrsn_Shock_No_Circ_Supp,emergrsn_Syncope,emergrsn_Trauma,emergrsn_Valve_Dysf,iabpind_Angina,iabpind_CPB_Wean_Failure,iabpind_Hemodyn_Instab,iabpind_Procedure_Support,iabpind_Prophylactic,iabpwhen_Intra_OP,iabpwhen_Post_OP,iabpwhen_Pre_OP,mtcause_Cardiac,mtcause_Infection,mtcause_Neuro,mtcause_Pulmonary,mtcause_Renal,mtcause_Vascular,readmrsn_Stroke,readmrsn_TIA,unplproc_Yes_Complication,unplproc_Yes_Disease,urgntrsn_AMI,urgntrsn_Anatomy,urgntrsn_Angio_Accid,urgntrsn_Ao_Dissect,urgntrsn_CHF,urgntrsn_CP,urgntrsn_Hybrid,urgntrsn_IABP,urgntrsn_PCI_Fail,urgntrsn_Rest_Angina,urgntrsn_Syncope,urgntrsn_USA,urgntrsn_Valve_Dysfunctn
0,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.017,0,,0.0,0.0,,,,,,29.0,32.0,70.0,1.1,,,40.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,83,162.60001,102.1,38.61754,29.0,1.2,3.3,6.2,8.6,60.0,36.0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1.0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0.045,0,,0.0,0.0,,,,,,,,,1.4,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,72,160.0,64.0,25.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0.016,0,,0.0,0.0,,0.0,0.0,0.0,1.0,19.0,34.6,70.0,0.8,,,40.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,72,162.60001,104.5,39.5253,40.0,0.9,3.4,5.4,6.4,40.0,40.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0.009,0,,0.0,0.0,,,,,,28.0,35.1,65.0,1.2,,,45.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,57,162.60001,114.3,43.23198,36.0,1.0,3.6,6.9,6.6,60.0,22.0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.006,0,,0.0,0.0,,0.0,0.0,0.0,2.0,18.0,31.0,95.0,1.0,,,75.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [36]:
data.tail()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_ANGINA_AT_REST,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_STRENUOUS_ACTIVITY,classnyh_ANY_ACTIVITY,classnyh_MARKED_LIMITATION,classnyh_SLIGHT_LIMITATION,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsufm_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,vdinsuft_TRIVIAL,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,status_EMERGENCY,status_SALVAGE,status_URGENT,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%,predstro,strokeBin2,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,ceroxused,circarr,cofirstind,concalc,cperfutil,IABP,ibldprod,imedeaca,imedtran,inoptee,mtopd,ocarasd,opocard,oponcard,unplao,unplvad,vsmvpr,asmtascaa,cotafib,cotarrst,ecmo,mt30stat,ocarvsd,opcab,opvalve,unplav,unplmv,vadproc,valexp2,vsavpr,vsmv,aortoccl_AoXC,aortoccl_Balloon,asmtaodx_Normal,asmtaodx_Thickening,asmtaodx_atheroma<5mm,asmtaodx_atheroma>5mm,asmtaodx_mobile_plaques,cathbasassistind_BP_instability,cathbasassistind_CPB_wean,cathbasassistind_PCI_failure,cathbasassistwhen_Intra_OP,cathbasassistwhen_Post_OP,cathbasassistwhen_Pre_OP,cpbutil_Combination,cpbutil_Full_CPB,cperftyp_Antegrade,cperftyp_Both,cperftyp_Retrograde,ecmoind_Cadiac_Failure,ecmoind_Rescue_Salvage,ecmoind_Resp_Failure,ecmowhen_Intra_OP,ecmowhen_Post_OP,ecmowhen_Pre_OP,emergrsn_Anatomy,emergrsn_Angio_Accident,emergrsn_Ao_Dissect,emergrsn_Evolvg_MI,emergrsn_Hybrid,emergrsn_Ischemia,emergrsn_Pulm_Edema,emergrsn_Shock_Circ_Supp,emergrsn_Shock_No_Circ_Supp,emergrsn_Syncope,emergrsn_Trauma,emergrsn_Valve_Dysf,iabpind_Angina,iabpind_CPB_Wean_Failure,iabpind_Hemodyn_Instab,iabpind_Procedure_Support,iabpind_Prophylactic,iabpwhen_Intra_OP,iabpwhen_Post_OP,iabpwhen_Pre_OP,mtcause_Cardiac,mtcause_Infection,mtcause_Neuro,mtcause_Pulmonary,mtcause_Renal,mtcause_Vascular,readmrsn_Stroke,readmrsn_TIA,unplproc_Yes_Complication,unplproc_Yes_Disease,urgntrsn_AMI,urgntrsn_Anatomy,urgntrsn_Angio_Accid,urgntrsn_Ao_Dissect,urgntrsn_CHF,urgntrsn_CP,urgntrsn_Hybrid,urgntrsn_IABP,urgntrsn_PCI_Fail,urgntrsn_Rest_Angina,urgntrsn_Syncope,urgntrsn_USA,urgntrsn_Valve_Dysfunctn
28940,69,167.60001,84.5,30.08213,35.6,0.9,3.4,11.6,6.4,38.0,,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0.013,0,,0.0,0.0,,,,,,29.0,36.1,,0.8,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
28941,62,182.89999,98.3,29.38503,43.3,0.9,4.1,5.5,6.4,50.0,33.0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0.007,0,,0.0,0.0,,,,,,30.0,34.9,77.0,0.8,,,66.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
28942,66,175.3,75.3,24.50367,46.2,0.83,3.8,5.3,7.47,58.0,,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0.007,0,,0.0,0.0,,,,,,41.0,34.6,,0.8,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
28943,62,165.10001,107.5,39.43794,46.1,0.77,3.8,5.3,7.47,55.0,,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0.008,0,,0.0,0.0,,,,,,32.0,35.3,72.0,0.7,,,54.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
28944,78,180.3,87.5,26.91638,40.4,1.14,3.8,5.8,8.73,50.0,33.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.009,1,,0.0,0.0,,,,,,28.0,35.6,100.0,1.5,,,57.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [37]:
data.shape

(28945, 229)

### Feature Matrix `X`
- in the next cell you can select which outcome vector you want to use: `strokeBin` or `strokeBin2`

In [38]:
X = data.copy().drop('strokeBin2', axis=1)

In [39]:
X = X.reset_index(drop=True)

In [40]:
X.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_ANGINA_AT_REST,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_STRENUOUS_ACTIVITY,classnyh_ANY_ACTIVITY,classnyh_MARKED_LIMITATION,classnyh_SLIGHT_LIMITATION,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsufm_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,vdinsuft_TRIVIAL,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,status_EMERGENCY,status_SALVAGE,status_URGENT,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%,predstro,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,ceroxused,circarr,cofirstind,concalc,cperfutil,IABP,ibldprod,imedeaca,imedtran,inoptee,mtopd,ocarasd,opocard,oponcard,unplao,unplvad,vsmvpr,asmtascaa,cotafib,cotarrst,ecmo,mt30stat,ocarvsd,opcab,opvalve,unplav,unplmv,vadproc,valexp2,vsavpr,vsmv,aortoccl_AoXC,aortoccl_Balloon,asmtaodx_Normal,asmtaodx_Thickening,asmtaodx_atheroma<5mm,asmtaodx_atheroma>5mm,asmtaodx_mobile_plaques,cathbasassistind_BP_instability,cathbasassistind_CPB_wean,cathbasassistind_PCI_failure,cathbasassistwhen_Intra_OP,cathbasassistwhen_Post_OP,cathbasassistwhen_Pre_OP,cpbutil_Combination,cpbutil_Full_CPB,cperftyp_Antegrade,cperftyp_Both,cperftyp_Retrograde,ecmoind_Cadiac_Failure,ecmoind_Rescue_Salvage,ecmoind_Resp_Failure,ecmowhen_Intra_OP,ecmowhen_Post_OP,ecmowhen_Pre_OP,emergrsn_Anatomy,emergrsn_Angio_Accident,emergrsn_Ao_Dissect,emergrsn_Evolvg_MI,emergrsn_Hybrid,emergrsn_Ischemia,emergrsn_Pulm_Edema,emergrsn_Shock_Circ_Supp,emergrsn_Shock_No_Circ_Supp,emergrsn_Syncope,emergrsn_Trauma,emergrsn_Valve_Dysf,iabpind_Angina,iabpind_CPB_Wean_Failure,iabpind_Hemodyn_Instab,iabpind_Procedure_Support,iabpind_Prophylactic,iabpwhen_Intra_OP,iabpwhen_Post_OP,iabpwhen_Pre_OP,mtcause_Cardiac,mtcause_Infection,mtcause_Neuro,mtcause_Pulmonary,mtcause_Renal,mtcause_Vascular,readmrsn_Stroke,readmrsn_TIA,unplproc_Yes_Complication,unplproc_Yes_Disease,urgntrsn_AMI,urgntrsn_Anatomy,urgntrsn_Angio_Accid,urgntrsn_Ao_Dissect,urgntrsn_CHF,urgntrsn_CP,urgntrsn_Hybrid,urgntrsn_IABP,urgntrsn_PCI_Fail,urgntrsn_Rest_Angina,urgntrsn_Syncope,urgntrsn_USA,urgntrsn_Valve_Dysfunctn
0,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.017,,0.0,0.0,,,,,,29.0,32.0,70.0,1.1,,,40.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,83,162.60001,102.1,38.61754,29.0,1.2,3.3,6.2,8.6,60.0,36.0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1.0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0.045,,0.0,0.0,,,,,,,,,1.4,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [41]:
X.shape

(28945, 228)

### Target Vector `y`

In [42]:
y = data.copy()['strokeBin2']

In [43]:
type(y)

pandas.core.series.Series

In [44]:
y.head(2)

0    0
1    0
Name: strokeBin2, dtype: int64

In [45]:
y.shape

(28945,)

In [46]:
y.unique()

array([0, 1])

In [47]:
y.value_counts()

0    28523
1      422
Name: strokeBin2, dtype: int64

### `X_train_all`, `X_devtest`, `y_train`, `y_devtest`
- using `train_test_split` with `stratify` parameter to ensure relative proportion of outcome classes are the same in `train`, `dev` and `test` sets
- observation split will be `80/10/10` between `train`, `dev`, `test`
- designation of `_all` denotes complete feature set (`PRE` + `POST`)

In [48]:
X_train_all, X_devtest_all, y_train, y_devtest = train_test_split(X,
                                                                  y,
                                                                  test_size=0.2,
                                                                  random_state=0,
                                                                  stratify=y)

#### validating `train_test_split`

In [49]:
X.shape

(28945, 228)

In [50]:
np.rint(X.shape[0] * 0.20)

5789.0

In [51]:
X_train_all.shape, X_devtest_all.shape, y_train.shape, y_devtest.shape

((23156, 228), (5789, 228), (23156,), (5789,))

In [52]:
X.shape[0] - X_train_all.shape[0] - X_devtest_all.shape[0]

0

In [53]:
y.shape[0] - y_train.shape[0] - y_devtest.shape[0]

0

#### validating `stratify` worked
- relative proportion of classes in `y`

In [54]:
print (np.round(y.value_counts()[0] / y.shape[0], 4))
print (np.round(y.value_counts()[1] / y.shape[0], 4))

0.9854
0.0146


- relative proportion of classes in `y_train`

In [55]:
print (np.round(y_train.value_counts()[0] / y_train.shape[0], 4))
print (np.round(y_train.value_counts()[1] / y_train.shape[0], 4))

0.9854
0.0146


- relative proportion of classes in `y_devtest`

In [56]:
print (np.round(y_devtest.value_counts()[0] / y_devtest.shape[0], 4))
print (np.round(y_devtest.value_counts()[1] / y_devtest.shape[0], 4))

0.9855
0.0145


### `X_dev_all`, `X_test_all`, `y_dev`, `y_test`

In [57]:
X_dev_all, X_test_all, y_dev, y_test = train_test_split(X_devtest_all,
                                                        y_devtest,
                                                        test_size=0.5,
                                                        random_state=0,
                                                        stratify=y_devtest)

- validating `train_test_split`

In [58]:
X_devtest_all.shape

(5789, 228)

In [59]:
np.rint(X_devtest_all.shape[0] * 0.50)

2894.0

In [60]:
X_dev_all.shape, X_test_all.shape, y_dev.shape, y_dev.shape

((2894, 228), (2895, 228), (2894,), (2894,))

#### validating `stratify` worked
- relative proportion of classes in `y`

In [61]:
print (np.round(y.value_counts()[0] / y.shape[0], 4))
print (np.round(y.value_counts()[1] / y.shape[0], 4))

0.9854
0.0146


- relative proportion of classes in `y_devtest`

In [62]:
print (np.round(y_devtest.value_counts()[0] / y_devtest.shape[0], 4))
print (np.round(y_devtest.value_counts()[1] / y_devtest.shape[0], 4))

0.9855
0.0145


- relative proportion of classes in `y_dev`

In [63]:
print (np.round(y_dev.value_counts()[0] / y_dev.shape[0], 4))
print (np.round(y_dev.value_counts()[1] / y_dev.shape[0], 4))

0.9855
0.0145


- relative proportion of classes in `y_test`

In [64]:
print (np.round(y_test.value_counts()[0] / y_test.shape[0], 4))
print (np.round(y_test.value_counts()[1] / y_test.shape[0], 4))

0.9855
0.0145


### Resetting Indicies

In [65]:
X_train_all = X_train_all.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

In [66]:
X_dev_all = X_dev_all.reset_index(drop=True)
y_dev = y_dev.reset_index(drop=True)

In [67]:
X_test_all = X_test_all.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

### Extracting `predstro` from `X_train`, `X_dev` and `X_test`
- and subsequently creating `predstro_train`, `predstro_dev`, `predstro_test`

- `train`

In [68]:
X_train_all.shape

(23156, 228)

In [69]:
predstro_train = X_train_all['predstro']

In [70]:
predstro_train.shape

(23156,)

In [71]:
X_train_all = X_train_all.drop('predstro', axis=1)

In [72]:
X_train_all.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_ANGINA_AT_REST,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_STRENUOUS_ACTIVITY,classnyh_ANY_ACTIVITY,classnyh_MARKED_LIMITATION,classnyh_SLIGHT_LIMITATION,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsufm_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,vdinsuft_TRIVIAL,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,status_EMERGENCY,status_SALVAGE,status_URGENT,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,ceroxused,circarr,cofirstind,concalc,cperfutil,IABP,ibldprod,imedeaca,imedtran,inoptee,mtopd,ocarasd,opocard,oponcard,unplao,unplvad,vsmvpr,asmtascaa,cotafib,cotarrst,ecmo,mt30stat,ocarvsd,opcab,opvalve,unplav,unplmv,vadproc,valexp2,vsavpr,vsmv,aortoccl_AoXC,aortoccl_Balloon,asmtaodx_Normal,asmtaodx_Thickening,asmtaodx_atheroma<5mm,asmtaodx_atheroma>5mm,asmtaodx_mobile_plaques,cathbasassistind_BP_instability,cathbasassistind_CPB_wean,cathbasassistind_PCI_failure,cathbasassistwhen_Intra_OP,cathbasassistwhen_Post_OP,cathbasassistwhen_Pre_OP,cpbutil_Combination,cpbutil_Full_CPB,cperftyp_Antegrade,cperftyp_Both,cperftyp_Retrograde,ecmoind_Cadiac_Failure,ecmoind_Rescue_Salvage,ecmoind_Resp_Failure,ecmowhen_Intra_OP,ecmowhen_Post_OP,ecmowhen_Pre_OP,emergrsn_Anatomy,emergrsn_Angio_Accident,emergrsn_Ao_Dissect,emergrsn_Evolvg_MI,emergrsn_Hybrid,emergrsn_Ischemia,emergrsn_Pulm_Edema,emergrsn_Shock_Circ_Supp,emergrsn_Shock_No_Circ_Supp,emergrsn_Syncope,emergrsn_Trauma,emergrsn_Valve_Dysf,iabpind_Angina,iabpind_CPB_Wean_Failure,iabpind_Hemodyn_Instab,iabpind_Procedure_Support,iabpind_Prophylactic,iabpwhen_Intra_OP,iabpwhen_Post_OP,iabpwhen_Pre_OP,mtcause_Cardiac,mtcause_Infection,mtcause_Neuro,mtcause_Pulmonary,mtcause_Renal,mtcause_Vascular,readmrsn_Stroke,readmrsn_TIA,unplproc_Yes_Complication,unplproc_Yes_Disease,urgntrsn_AMI,urgntrsn_Anatomy,urgntrsn_Angio_Accid,urgntrsn_Ao_Dissect,urgntrsn_CHF,urgntrsn_CP,urgntrsn_Hybrid,urgntrsn_IABP,urgntrsn_PCI_Fail,urgntrsn_Rest_Angina,urgntrsn_Syncope,urgntrsn_USA,urgntrsn_Valve_Dysfunctn
0,57,177.8,130.10001,41.15417,31.0,1.0,3.9,7.3,7.5,50.0,42.0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,1.0,0.0,0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,,0.0,0.0,,0.0,3.0,0.0,0.0,22.0,34.0,106.0,1.2,73.0,75.0,93.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,1.0,2.0,1,0,,,0.0,,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,84,157.39999,44.0,17.76001,37.4,0.71,3.5,5.9,6.62,52.0,,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,,0.0,0.0,,0.0,0.0,0.0,2.0,24.0,34.4,70.0,0.8,,,62.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [73]:
X_train_all.shape

(23156, 227)

- `dev`

In [74]:
X_dev_all.shape

(2894, 228)

In [75]:
predstro_dev = X_dev_all['predstro']

In [76]:
predstro_dev.shape

(2894,)

In [77]:
X_dev_all = X_dev_all.drop('predstro', axis=1)

In [78]:
X_dev_all.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_ANGINA_AT_REST,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_STRENUOUS_ACTIVITY,classnyh_ANY_ACTIVITY,classnyh_MARKED_LIMITATION,classnyh_SLIGHT_LIMITATION,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsufm_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,vdinsuft_TRIVIAL,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,status_EMERGENCY,status_SALVAGE,status_URGENT,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,ceroxused,circarr,cofirstind,concalc,cperfutil,IABP,ibldprod,imedeaca,imedtran,inoptee,mtopd,ocarasd,opocard,oponcard,unplao,unplvad,vsmvpr,asmtascaa,cotafib,cotarrst,ecmo,mt30stat,ocarvsd,opcab,opvalve,unplav,unplmv,vadproc,valexp2,vsavpr,vsmv,aortoccl_AoXC,aortoccl_Balloon,asmtaodx_Normal,asmtaodx_Thickening,asmtaodx_atheroma<5mm,asmtaodx_atheroma>5mm,asmtaodx_mobile_plaques,cathbasassistind_BP_instability,cathbasassistind_CPB_wean,cathbasassistind_PCI_failure,cathbasassistwhen_Intra_OP,cathbasassistwhen_Post_OP,cathbasassistwhen_Pre_OP,cpbutil_Combination,cpbutil_Full_CPB,cperftyp_Antegrade,cperftyp_Both,cperftyp_Retrograde,ecmoind_Cadiac_Failure,ecmoind_Rescue_Salvage,ecmoind_Resp_Failure,ecmowhen_Intra_OP,ecmowhen_Post_OP,ecmowhen_Pre_OP,emergrsn_Anatomy,emergrsn_Angio_Accident,emergrsn_Ao_Dissect,emergrsn_Evolvg_MI,emergrsn_Hybrid,emergrsn_Ischemia,emergrsn_Pulm_Edema,emergrsn_Shock_Circ_Supp,emergrsn_Shock_No_Circ_Supp,emergrsn_Syncope,emergrsn_Trauma,emergrsn_Valve_Dysf,iabpind_Angina,iabpind_CPB_Wean_Failure,iabpind_Hemodyn_Instab,iabpind_Procedure_Support,iabpind_Prophylactic,iabpwhen_Intra_OP,iabpwhen_Post_OP,iabpwhen_Pre_OP,mtcause_Cardiac,mtcause_Infection,mtcause_Neuro,mtcause_Pulmonary,mtcause_Renal,mtcause_Vascular,readmrsn_Stroke,readmrsn_TIA,unplproc_Yes_Complication,unplproc_Yes_Disease,urgntrsn_AMI,urgntrsn_Anatomy,urgntrsn_Angio_Accid,urgntrsn_Ao_Dissect,urgntrsn_CHF,urgntrsn_CP,urgntrsn_Hybrid,urgntrsn_IABP,urgntrsn_PCI_Fail,urgntrsn_Rest_Angina,urgntrsn_Syncope,urgntrsn_USA,urgntrsn_Valve_Dysfunctn
0,65,178.0,115.0,36.29592,46.0,0.7,3.3,6.1,8.4,55.0,,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,,0.0,0.0,,,,,,34.0,32.0,98.0,0.8,,,67.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,61,190.5,149.7,41.25075,38.0,1.4,3.6,6.2,9.6,33.0,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,,0.0,0.0,,,,,,26.0,34.5,73.0,1.9,,,50.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,1.0,2.0,1,0,,,0.0,,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [79]:
X_dev_all.shape

(2894, 227)

- `test`

In [80]:
X_test_all.shape

(2895, 228)

In [81]:
predstro_test = X_test_all['predstro']

In [82]:
predstro_test.shape

(2895,)

In [83]:
X_test_all = X_test_all.drop('predstro', axis=1)

In [84]:
X_test_all.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_ANGINA_AT_REST,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_STRENUOUS_ACTIVITY,classnyh_ANY_ACTIVITY,classnyh_MARKED_LIMITATION,classnyh_SLIGHT_LIMITATION,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsufm_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,vdinsuft_TRIVIAL,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,status_EMERGENCY,status_SALVAGE,status_URGENT,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,ceroxused,circarr,cofirstind,concalc,cperfutil,IABP,ibldprod,imedeaca,imedtran,inoptee,mtopd,ocarasd,opocard,oponcard,unplao,unplvad,vsmvpr,asmtascaa,cotafib,cotarrst,ecmo,mt30stat,ocarvsd,opcab,opvalve,unplav,unplmv,vadproc,valexp2,vsavpr,vsmv,aortoccl_AoXC,aortoccl_Balloon,asmtaodx_Normal,asmtaodx_Thickening,asmtaodx_atheroma<5mm,asmtaodx_atheroma>5mm,asmtaodx_mobile_plaques,cathbasassistind_BP_instability,cathbasassistind_CPB_wean,cathbasassistind_PCI_failure,cathbasassistwhen_Intra_OP,cathbasassistwhen_Post_OP,cathbasassistwhen_Pre_OP,cpbutil_Combination,cpbutil_Full_CPB,cperftyp_Antegrade,cperftyp_Both,cperftyp_Retrograde,ecmoind_Cadiac_Failure,ecmoind_Rescue_Salvage,ecmoind_Resp_Failure,ecmowhen_Intra_OP,ecmowhen_Post_OP,ecmowhen_Pre_OP,emergrsn_Anatomy,emergrsn_Angio_Accident,emergrsn_Ao_Dissect,emergrsn_Evolvg_MI,emergrsn_Hybrid,emergrsn_Ischemia,emergrsn_Pulm_Edema,emergrsn_Shock_Circ_Supp,emergrsn_Shock_No_Circ_Supp,emergrsn_Syncope,emergrsn_Trauma,emergrsn_Valve_Dysf,iabpind_Angina,iabpind_CPB_Wean_Failure,iabpind_Hemodyn_Instab,iabpind_Procedure_Support,iabpind_Prophylactic,iabpwhen_Intra_OP,iabpwhen_Post_OP,iabpwhen_Pre_OP,mtcause_Cardiac,mtcause_Infection,mtcause_Neuro,mtcause_Pulmonary,mtcause_Renal,mtcause_Vascular,readmrsn_Stroke,readmrsn_TIA,unplproc_Yes_Complication,unplproc_Yes_Disease,urgntrsn_AMI,urgntrsn_Anatomy,urgntrsn_Angio_Accid,urgntrsn_Ao_Dissect,urgntrsn_CHF,urgntrsn_CP,urgntrsn_Hybrid,urgntrsn_IABP,urgntrsn_PCI_Fail,urgntrsn_Rest_Angina,urgntrsn_Syncope,urgntrsn_USA,urgntrsn_Valve_Dysfunctn
0,72,188.0,133.0,37.63015,41.0,1.1,3.9,7.0,8.1,55.0,,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.0,0.0,,,,,,27.0,35.0,141.0,1.3,,,123.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,75,173.0,88.0,29.40292,42.0,0.8,3.9,7.4,6.4,60.0,36.0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,,0.0,0.0,,,,,,25.8,32.1,150.0,0.9,,,118.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [85]:
X_test_all.shape

(2895, 227)

- validating row count - `42,740` total observations

In [86]:
42740 - X_train_all.shape[0] - X_dev_all.shape[0] - X_test_all.shape[0]

13795

#### Last look at the data (`X_train_all`) before modeling

In [87]:
X_train_all.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_ANGINA_AT_REST,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_STRENUOUS_ACTIVITY,classnyh_ANY_ACTIVITY,classnyh_MARKED_LIMITATION,classnyh_SLIGHT_LIMITATION,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsufm_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,vdinsuft_TRIVIAL,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,status_EMERGENCY,status_SALVAGE,status_URGENT,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,ceroxused,circarr,cofirstind,concalc,cperfutil,IABP,ibldprod,imedeaca,imedtran,inoptee,mtopd,ocarasd,opocard,oponcard,unplao,unplvad,vsmvpr,asmtascaa,cotafib,cotarrst,ecmo,mt30stat,ocarvsd,opcab,opvalve,unplav,unplmv,vadproc,valexp2,vsavpr,vsmv,aortoccl_AoXC,aortoccl_Balloon,asmtaodx_Normal,asmtaodx_Thickening,asmtaodx_atheroma<5mm,asmtaodx_atheroma>5mm,asmtaodx_mobile_plaques,cathbasassistind_BP_instability,cathbasassistind_CPB_wean,cathbasassistind_PCI_failure,cathbasassistwhen_Intra_OP,cathbasassistwhen_Post_OP,cathbasassistwhen_Pre_OP,cpbutil_Combination,cpbutil_Full_CPB,cperftyp_Antegrade,cperftyp_Both,cperftyp_Retrograde,ecmoind_Cadiac_Failure,ecmoind_Rescue_Salvage,ecmoind_Resp_Failure,ecmowhen_Intra_OP,ecmowhen_Post_OP,ecmowhen_Pre_OP,emergrsn_Anatomy,emergrsn_Angio_Accident,emergrsn_Ao_Dissect,emergrsn_Evolvg_MI,emergrsn_Hybrid,emergrsn_Ischemia,emergrsn_Pulm_Edema,emergrsn_Shock_Circ_Supp,emergrsn_Shock_No_Circ_Supp,emergrsn_Syncope,emergrsn_Trauma,emergrsn_Valve_Dysf,iabpind_Angina,iabpind_CPB_Wean_Failure,iabpind_Hemodyn_Instab,iabpind_Procedure_Support,iabpind_Prophylactic,iabpwhen_Intra_OP,iabpwhen_Post_OP,iabpwhen_Pre_OP,mtcause_Cardiac,mtcause_Infection,mtcause_Neuro,mtcause_Pulmonary,mtcause_Renal,mtcause_Vascular,readmrsn_Stroke,readmrsn_TIA,unplproc_Yes_Complication,unplproc_Yes_Disease,urgntrsn_AMI,urgntrsn_Anatomy,urgntrsn_Angio_Accid,urgntrsn_Ao_Dissect,urgntrsn_CHF,urgntrsn_CP,urgntrsn_Hybrid,urgntrsn_IABP,urgntrsn_PCI_Fail,urgntrsn_Rest_Angina,urgntrsn_Syncope,urgntrsn_USA,urgntrsn_Valve_Dysfunctn
0,57,177.8,130.10001,41.15417,31.0,1.0,3.9,7.3,7.5,50.0,42.0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,1.0,0.0,0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,,0.0,0.0,,0.0,3.0,0.0,0.0,22.0,34.0,106.0,1.2,73.0,75.0,93.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,1.0,2.0,1,0,,,0.0,,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,84,157.39999,44.0,17.76001,37.4,0.71,3.5,5.9,6.62,52.0,,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,,0.0,0.0,,0.0,0.0,0.0,2.0,24.0,34.4,70.0,0.8,,,62.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,76,193.0,85.0,22.8194,42.0,1.3,3.3,6.1,11.0,40.0,,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,,0.0,0.0,,0.0,0.0,10.0,0.0,26.0,33.0,280.0,2.4,,,212.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,73,177.8,107.0,33.84701,42.0,1.5,4.1,5.5,11.36,63.0,27.0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,1.0,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.0,0.0,,0.0,0.0,4.0,0.0,22.0,35.1,77.0,2.0,,,51.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1,0,0.0,1.0,2.0,1,0,,,0.0,,0.0,0.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,52,191.0,102.0,27.95976,41.0,0.6,3.0,12.0,8.4,20.0,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,,0.0,0.0,,,,,,32.0,34.0,109.0,0.7,,,58.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [88]:
X_train_all.tail()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_ANGINA_AT_REST,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_STRENUOUS_ACTIVITY,classnyh_ANY_ACTIVITY,classnyh_MARKED_LIMITATION,classnyh_SLIGHT_LIMITATION,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsufm_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,vdinsuft_TRIVIAL,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,status_EMERGENCY,status_SALVAGE,status_URGENT,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,ceroxused,circarr,cofirstind,concalc,cperfutil,IABP,ibldprod,imedeaca,imedtran,inoptee,mtopd,ocarasd,opocard,oponcard,unplao,unplvad,vsmvpr,asmtascaa,cotafib,cotarrst,ecmo,mt30stat,ocarvsd,opcab,opvalve,unplav,unplmv,vadproc,valexp2,vsavpr,vsmv,aortoccl_AoXC,aortoccl_Balloon,asmtaodx_Normal,asmtaodx_Thickening,asmtaodx_atheroma<5mm,asmtaodx_atheroma>5mm,asmtaodx_mobile_plaques,cathbasassistind_BP_instability,cathbasassistind_CPB_wean,cathbasassistind_PCI_failure,cathbasassistwhen_Intra_OP,cathbasassistwhen_Post_OP,cathbasassistwhen_Pre_OP,cpbutil_Combination,cpbutil_Full_CPB,cperftyp_Antegrade,cperftyp_Both,cperftyp_Retrograde,ecmoind_Cadiac_Failure,ecmoind_Rescue_Salvage,ecmoind_Resp_Failure,ecmowhen_Intra_OP,ecmowhen_Post_OP,ecmowhen_Pre_OP,emergrsn_Anatomy,emergrsn_Angio_Accident,emergrsn_Ao_Dissect,emergrsn_Evolvg_MI,emergrsn_Hybrid,emergrsn_Ischemia,emergrsn_Pulm_Edema,emergrsn_Shock_Circ_Supp,emergrsn_Shock_No_Circ_Supp,emergrsn_Syncope,emergrsn_Trauma,emergrsn_Valve_Dysf,iabpind_Angina,iabpind_CPB_Wean_Failure,iabpind_Hemodyn_Instab,iabpind_Procedure_Support,iabpind_Prophylactic,iabpwhen_Intra_OP,iabpwhen_Post_OP,iabpwhen_Pre_OP,mtcause_Cardiac,mtcause_Infection,mtcause_Neuro,mtcause_Pulmonary,mtcause_Renal,mtcause_Vascular,readmrsn_Stroke,readmrsn_TIA,unplproc_Yes_Complication,unplproc_Yes_Disease,urgntrsn_AMI,urgntrsn_Anatomy,urgntrsn_Angio_Accid,urgntrsn_Ao_Dissect,urgntrsn_CHF,urgntrsn_CP,urgntrsn_Hybrid,urgntrsn_IABP,urgntrsn_PCI_Fail,urgntrsn_Rest_Angina,urgntrsn_Syncope,urgntrsn_USA,urgntrsn_Valve_Dysfunctn
23151,66,175.2,134.7,43.88331,34.2,0.8,4.0,6.9,7.47,62.5,27.1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,,0.0,0.0,,,,,,24.0,34.1,99.0,0.8,,,56.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
23152,78,173.0,57.0,19.04507,30.8,1.83,3.0,5.9,12.53,63.0,28.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,,0.0,0.0,,0.0,2.0,1.0,3.0,17.3,34.4,110.0,6.2,,,96.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,1,0.0,0.0,2.0,1,0,,,0.0,,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
23153,73,180.0,64.6,19.93827,47.7,2.0,5.2,8.0,,35.0,,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,,0.0,0.0,,,,,,33.4,33.2,98.0,0.8,,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
23154,57,149.0,67.4,30.35899,32.0,1.2,3.6,5.7,8.2,50.0,,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,,0.0,0.0,,0.0,0.0,0.0,2.0,24.2,32.0,223.0,1.3,,,190.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
23155,53,175.0,113.0,36.89796,36.8,0.81,4.3,5.8,6.4,60.0,36.0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1.0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.0,0.0,,,,,,30.8,36.0,,0.8,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.0,1.0,,1,0,,,0.0,,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Pre-processing Data using `Scikit-Learn` `Pipelines`

### Creating a `FeatureSelector` transformer
- necessary because we are working with heterogeneous data (numerical and categorical features) -- want to be able to pick and choose which features (columns) to pass through our pipelines (and transform them) instead of having to pass through the whole dataframe

In [89]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    '''
    Transformer to select column from a data frame to perform additional transformations on
    Use this for selecting column(s) that require fit transform
    '''
    
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[data_dict.columns.intersection(self.key)]

## Loading Column Names for `PREOP` and `POSTOP` Features
- these `lists` of features will be used to mask/select specified features from the `COMBO` dataset to assemble `PREOP`, `PREOP_TREE`, `POSTOP` and `POSTOP_TREE` feature sets as appropriate
- these are essentially helper lists as `DataFrames` put through `Pipelines` emerge as `numpy` arrays which do not have column names
- can convert resulting `numpy` arrays back into `DataFrames` for further analysis

#### `PREOP_dataset` Column Names

- `DUMMIES` Version = `PREOP_numerical_col_names` + `PREOP_categorical_col_names`
- `TREE` Version    = `PREOP_numerical_col_names` + `PREOP_categorical_TREE_col_names`

- `PREOP_numerical_col_names`

In [91]:
with open('../data/PREOP_numerical_col_names_11_14.pkl', 'rb') as filehandle:
    PREOP_numerical_col_names = pickle.load(filehandle)

In [92]:
PREOP_numerical_col_names[0:5]

['age', 'heightcm', 'weightkg', 'bmi', 'hct']

In [93]:
len(PREOP_numerical_col_names)

11

- `PREOP_categorical_col_names`

In [96]:
with open('../data/PREOP_categorical_col_names_11_14.pkl', 'rb') as filehandle:
    PREOP_categorical_col_names = pickle.load(filehandle)

In [97]:
PREOP_categorical_col_names[0:5]

['surgdt_month_Jan',
 'surgdt_month_Feb',
 'surgdt_month_Mar',
 'surgdt_month_Apr',
 'surgdt_month_May']

In [98]:
len(PREOP_categorical_col_names)

99

- `PREOP_categorical_TREE_col_names`

In [100]:
with open('../data/PREOP_categorical_TREE_col_names_11_14.pkl', 'rb') as filehandle:
    PREOP_categorical_TREE_col_names = pickle.load(filehandle)

In [101]:
PREOP_categorical_TREE_col_names[0:5]

['surgdt_month',
 'surgdt_DayOfWeek',
 'surgdt_PartOfMonth',
 'gender',
 'racecaucasian']

In [102]:
len(PREOP_categorical_TREE_col_names)

61

#### `POSTOP_dataset` Column Names

- `DUMMIES` Version = `POSTOP_numerical_col_names` + `POSTOP_categorical_col_names`
- `TREE` Version = `POSTOP_numerical_col_names` + `POSTOP_categorical_TREE_col_names`

In [103]:
with open('../data/POSTOP_numerical_col_names_11_14.pkl', 'rb') as filehandle:
    POSTOP_numerical_col_names = pickle.load(filehandle)

In [104]:
with open('../data/POSTOP_categorical_col_names_11_14.pkl', 'rb') as filehandle:
    POSTOP_categorical_col_names = pickle.load(filehandle)

In [105]:
with open('../data/POSTOP_categorical_TREE_col_names_11_14.pkl', 'rb') as filehandle:
    POSTOP_categorical_TREE_col_names = pickle.load(filehandle)

## Functions Used in Pipelines

#### `SimpleImputer(missing_values=np.nan, strategy='median'` 
- to replace numerical feature `NaN`s with `X_train` numerical feature `NaN`s

#### `PolynomialFeatures()`
- per Albon 13.2 pages 225-227, can use `PolynomialFeatures()` to create interaction terms
- Parameter `interaction_only=True` tells `PolynomialFeatures()` to ONLY return interaction terms (and not polynomial features
- By default, `PolynomialFeatures()` will add a feature containing ones (a vector of `1`s) called a `bias` -- we can prevent that through the parameter `include_bias=False`
- The `degree` parameter determines the maximum number of features to create interaction terms from (in case we wanted to create an interaction term of `n_features`)
- open question whether we should apply `PolynomialFeatures` to the entire feature matrix or only to the numerical and/or categorical features and how we should do it (with everything higher degree + interaction terms or just interaction terms).
- Given the number of featues we have, we will have many more features after transforming with `PolynomialFeatures`
- The formula for calculating the number of the polynomial features is `N(n,d)=C(n+d,d)` where `n` is the number of the features, `d` is the degree of the polynomial, `C` is binomial coefficient(combination). In our case the number is `C(3+2,2)=5!/(5-2)!2!=10` but when the number of features or the degree is high the polynomial features becomes too many.

#### `sklearn.preprocessing` scalers such as `StandardScaler()`

#### Custom `helper` functions, such as `convert_df_to_numpy`
- need to create a function to convert `DataFrame` to `numpy.ndarray` then can use `FeatureUnion` to combine with `numpy.ndarray` that results from `numerical_pipeline` to form feature matrix

In [106]:
def convert_df_to_numpy(df):
    
    df = df.values
    
    return df

#### `FeatureUnion`
- to reassemble feature matrix from the product of multiple `pipelines`

# `PREOP` Feature Matrix Pipelines

### `PREOP_numerical_features` Pipeline

In [107]:
PREOP_numerical_features_pipeline = Pipeline(steps=[
    ('select', FeatureSelector(PREOP_numerical_col_names)), 
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
    ('scaler', StandardScaler()) 
    ])

- `fit_transform` on `train` data, `transform` only on `dev` and `test` data

In [108]:
PREOP_numerical = PREOP_numerical_features_pipeline.fit_transform(X_train_all)

In [109]:
PREOP_numerical.shape, len(PREOP_numerical_col_names)

((23156, 11), 11)

### `PREOP_categorical_features_pipeline`

In [110]:
PREOP_categorical_features_pipeline = Pipeline(steps=[
    ('select', FeatureSelector(PREOP_categorical_col_names)),
    ('convert_numpy', FunctionTransformer(convert_df_to_numpy, validate=False))
    ])

- `fit_transform` on `train` data, `transform` only on `dev` and `test` data

In [111]:
PREOP_categorical = PREOP_categorical_features_pipeline.fit_transform(X_train_all)

In [112]:
PREOP_categorical.shape, len(PREOP_categorical_col_names)

((23156, 98), 99)

## Assembling `PREOP_feature_matrix` via `FeatureUnion`

In [113]:
PREOP_feature_matrix = FeatureUnion([('PREOP_num_features', PREOP_numerical_features_pipeline),
                                     ('PREOP_cat_features', PREOP_categorical_features_pipeline)
                                    ])

In [114]:
type(PREOP_feature_matrix)

sklearn.pipeline.FeatureUnion

### Transforming `X_train_all` to get `X_train_PREOP`

In [115]:
X_train_PREOP = PREOP_feature_matrix.fit_transform(X_train_all)

- validating

In [116]:
X_train_PREOP.shape, y_train.shape

((23156, 109), (23156,))

- putting into a `DataFrame` for easy analysis

In [117]:
PREOP_feature_col_names = PREOP_numerical_col_names + PREOP_categorical_col_names

In [118]:
len(PREOP_feature_col_names), len(PREOP_numerical_col_names), len(PREOP_categorical_col_names)

(110, 11, 99)

In [124]:
#PREOP_feature_col_names.remove('incidencREOP_FOURTH')

In [125]:
X_train_PREOP = pd.DataFrame(X_train_PREOP,
                             columns=PREOP_feature_col_names)

In [126]:
X_train_PREOP.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_STRENUOUS_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_ANGINA_AT_REST,classnyh_SLIGHT_LIMITATION,classnyh_MARKED_LIMITATION,classnyh_ANY_ACTIVITY,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,status_URGENT,status_EMERGENCY,status_SALVAGE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%
0,-0.82818,0.55124,1.96124,0.97889,-1.47015,-0.14268,0.31205,0.52945,-0.27512,-0.16511,0.97676,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.80602,-1.38063,-2.30378,-1.22765,-0.27614,-0.46271,-0.47966,-0.41307,-0.60388,-0.0006,-0.07777,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.02552,1.99068,-0.27282,-0.75045,0.58206,0.18838,-0.87552,-0.27843,1.03243,-0.98767,-0.07777,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.73283,0.55124,0.81697,0.28968,0.58206,0.40909,0.70791,-0.68237,1.16692,0.9042,-0.91762,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.31599,1.80128,0.56929,-0.26561,0.3955,-0.58409,-1.46931,3.69365,0.06111,-2.63277,-0.07777,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Transforming `X_dev_all` to get `X_dev_PREOP`

In [127]:
X_dev_PREOP = PREOP_feature_matrix.transform(X_dev_all)

- validating

In [128]:
X_dev_PREOP.shape, y_dev.shape

((2894, 109), (2894,))

- putting into a `DataFrame` for easy analysis

In [129]:
X_dev_PREOP = pd.DataFrame(X_dev_PREOP,
                           columns=PREOP_feature_col_names)

In [130]:
X_dev_PREOP.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_STRENUOUS_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_ANGINA_AT_REST,classnyh_SLIGHT_LIMITATION,classnyh_MARKED_LIMITATION,classnyh_ANY_ACTIVITY,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,status_URGENT,status_EMERGENCY,status_SALVAGE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%
0,-0.04767,0.57018,1.21325,0.52066,1.32832,-0.47374,-0.87552,-0.27843,0.06111,0.24616,-0.07777,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.43793,1.75393,2.93214,0.988,-0.1642,0.29873,-0.28173,-0.2111,0.50941,-1.56345,-0.07777,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.73283,-2.3371,-1.52112,-0.30322,-1.9179,6.25781,-0.67759,-0.14378,4.28637,1.06871,1.35564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.2428,1.04368,3.0114,1.35852,-0.42539,0.29873,-0.67759,1.94324,0.52062,0.49293,2.99743,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.2428,0.09668,-0.17375,-0.161,0.41415,-0.25303,-0.87552,-0.27843,-0.60388,-1.39894,-0.07777,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Transforming `X_test_all` to get `X_test_PREOP`

In [131]:
X_test_PREOP = PREOP_feature_matrix.transform(X_test_all)

- validating

In [132]:
X_test_PREOP.shape, y_test.shape

((2895, 109), (2895,))

- putting into a `DataFrame` for easy analysis

In [133]:
X_test_PREOP = pd.DataFrame(X_test_PREOP,
                            columns=PREOP_feature_col_names)

In [134]:
X_test_PREOP.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_STRENUOUS_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_ANGINA_AT_REST,classnyh_SLIGHT_LIMITATION,classnyh_MARKED_LIMITATION,classnyh_ANY_ACTIVITY,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,status_URGENT,status_EMERGENCY,status_SALVAGE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%
0,0.63527,1.51718,2.1049,0.64651,0.3955,-0.03233,0.31205,0.32748,-0.05097,0.24616,-0.07777,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.92796,0.09668,-0.12421,-0.12949,0.58206,-0.36339,0.31205,0.59678,-0.68607,0.65744,0.21901,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.21843,0.78799,1.56496,0.63988,0.95519,-0.25303,0.90584,1.40466,-0.68607,0.24616,-0.07777,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.34258,-1.13441,-1.09016,-0.37898,-0.18285,-0.02129,0.11412,-0.07646,-0.27886,-0.57639,0.72418,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.43793,-0.41469,1.11418,0.89155,-0.72389,0.40909,-1.46931,1.5393,1.16692,0.49293,-0.53874,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


#### NOTE:
- did not have to put feature matricies into `DataFrames` - could have used directly in pipelines, but wanted to put into `DataFrames` in case wanted to retrieve feature names

# `POSTOP` Feature Matrix Pipelines

### `POSTOP_numerical_features` Pipeline

In [135]:
POSTOP_numerical_features_pipeline = Pipeline(steps=[
    ('select', FeatureSelector(POSTOP_numerical_col_names)), 
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
    ('scaler', StandardScaler()) 
    ])

- `fit_transform` on `train` data, `transform` only on `dev` and `test` data

In [136]:
POSTOP_numerical = POSTOP_numerical_features_pipeline.fit_transform(X_train_all)

In [137]:
POSTOP_numerical.shape, len(POSTOP_numerical_col_names)

((23156, 15), 15)

### `POSTOP_categorical_features_pipeline`

In [138]:
POSTOP_categorical_features_pipeline = Pipeline(steps=[
    ('select', FeatureSelector(POSTOP_categorical_col_names)),
    ('convert_numpy', FunctionTransformer(convert_df_to_numpy, validate=False))
    ])

- `fit_transform` on `train` data, `transform` only on `dev` and `test` data

In [139]:
POSTOP_categorical = POSTOP_categorical_features_pipeline.fit_transform(X_train_all)

In [140]:
POSTOP_categorical.shape, len(POSTOP_categorical_col_names)

((23156, 100), 103)

## Assembling `POSTOP_feature_matrix` via `FeatureUnion`

In [141]:
POSTOP_feature_matrix = FeatureUnion([('POSTOP_num_features', POSTOP_numerical_features_pipeline),
                                      ('POSTOP_cat_features', POSTOP_categorical_features_pipeline)
                                     ])

In [142]:
type(POSTOP_feature_matrix)

sklearn.pipeline.FeatureUnion

### Transforming `X_train_all` to get `X_train_POSTOP`

In [143]:
X_train_POSTOP = POSTOP_feature_matrix.fit_transform(X_train_all)

- validating

In [144]:
X_train_POSTOP.shape, y_train.shape

((23156, 115), (23156,))

- putting into a `DataFrame` for easy analysis

In [145]:
POSTOP_feature_col_names = POSTOP_numerical_col_names + POSTOP_categorical_col_names

In [146]:
len(POSTOP_feature_col_names), len(POSTOP_numerical_col_names), len(POSTOP_categorical_col_names)

(118, 15, 103)

In [147]:
X_train_POSTOP = pd.DataFrame(X_train_POSTOP,
                              columns=POSTOP_feature_col_names)

ValueError: Shape of passed values is (23156, 115), indices imply (23156, 118)

In [None]:
X_train_POSTOP.head()

### Transforming `X_dev_all` to get `X_dev_POSTOP`

In [None]:
X_dev_POSTOP = POSTOP_feature_matrix.transform(X_dev_all)

- validating

In [None]:
X_dev_POSTOP.shape, y_dev.shape

- putting into a `DataFrame` for easy analysis

In [None]:
X_dev_POSTOP = pd.DataFrame(X_dev_POSTOP,
                            columns=POSTOP_feature_col_names)

In [None]:
X_dev_POSTOP.head()

### Transforming `X_test_all` to get `X_test_POSTOP`

In [None]:
X_test_POSTOP = POSTOP_feature_matrix.transform(X_test_all)

- validating

In [None]:
X_test_POSTOP.shape, y_test.shape

- putting into a `DataFrame` for easy analysis

In [None]:
X_test_POSTOP = pd.DataFrame(X_test_POSTOP,
                             columns=POSTOP_feature_col_names)

In [None]:
X_test_POSTOP.head()

## Assembling `PREOP + POSTOP` Combined Feature Matrix

#### -  Using `FeatureUnion | Pipeline` to Create Combined `PREOP + POSTOP` Feature Matrix

In [None]:
PREPOST_feature_matrix = FeatureUnion([('PREOP_num_features', PREOP_numerical_features_pipeline),
                                       ('PREOP_cat_features', PREOP_categorical_features_pipeline),
                                       ('POSTOP_num_features', POSTOP_numerical_features_pipeline),
                                       ('POSTOP_cat_features', POSTOP_categorical_features_pipeline)
                                      ])

- can use the code pattern above to create `X_train_PREPOST`, `X_dev_PREPOST`, `X_test_PREPOST`

In [None]:
PREPOST_feature_col_names = PREOP_feature_col_names + POSTOP_feature_col_names

In [None]:
len(PREPOST_feature_col_names), len(PREOP_feature_col_names), len(POSTOP_feature_col_names)

### Transforming `X_train_all` to get `X_train_PREPOST`

In [None]:
X_train_PREPOST = PREPOST_feature_matrix.fit_transform(X_train_all)

- validating

In [None]:
X_train_PREPOST.shape, y_train.shape

- putting into a `DataFrame` for easy analysis

In [None]:
X_train_PREPOST = pd.DataFrame(X_train_PREPOST,
                               columns=PREPOST_feature_col_names)

In [None]:
X_train_PREPOST.head()

### Transforming `X_dev_all` to get `X_dev_PREPOST`

In [None]:
X_dev_PREPOST = PREPOST_feature_matrix.transform(X_dev_all)

- validating

In [None]:
X_dev_PREPOST.shape, y_dev.shape

- putting into a `DataFrame` for easy analysis

In [None]:
X_dev_PREPOST = pd.DataFrame(X_dev_PREPOST,
                             columns=PREPOST_feature_col_names)

In [None]:
X_dev_PREPOST.head()

### Transforming `X_test_all` to get `X_test_PREPOST`

In [None]:
X_test_PREPOST = PREPOST_feature_matrix.transform(X_test_all)

- validating

In [None]:
X_test_PREPOST.shape, y_test.shape

- putting into a `DataFrame` for easy analysis

In [None]:
X_test_PREPOST = pd.DataFrame(X_test_PREPOST,
                              columns=PREPOST_feature_col_names)

In [None]:
X_test_PREPOST.head()

## Modeling Scenarios Using `PREOP` and `POSTOP` Datasets

### - Scenario A: `Kitchen Sink` - Modeling the Combined `PREOP` and `POSTOP` Dataset

- code belows shows feature matrix assembly using `Pipeline` - could have used the already named `feature matricies` created above: `X_train_PREPOST`, `X_dev_PREPOST` and `X_test_PREPOST` in conjunction with their respective `y` vectors

#### `LogisticRegression()` with `GridSearchCV`

- instantiating `Pipeline`

In [None]:
PREPOST_modelA_pipe = Pipeline(steps=[
    ('get_PREPOST_feature_matrix', PREPOST_feature_matrix),
    ('Classifier', LogisticRegression())
    ])

- defining a parameter grid for `GridSearchCV`

In [None]:
parameter_grid = {'Classifier__penalty': ['l1'],
                  'Classifier__C': [0.001, 0.01, 1.0],
                  'Classifier__class_weight': ['balanced'],
                  'Classifier__solver': ['liblinear'],
                  'Classifier__random_state': [0]
                 }

- defining `scoring_metrics`

In [None]:
scoring_metrics = ['accuracy',
                   'f1',
                   'f1_macro',
                   'f1_weighted',
                   'precision',
                   'precision_macro',
                   'precision_weighted',
                   'recall',
                   'recall_macro',
                   'recall_weighted',
                   'roc_auc']

- create `GridSearchCV` object

In [None]:
gs = GridSearchCV(PREPOST_modelA_pipe, #pipline
                  param_grid=parameter_grid, #param_grid
                  scoring=scoring_metrics, #loss functions
                  cv=5, #number of folds, default=`StratifiedKFold`
                  refit='roc_auc', # best_estimator_ will be based on this scoring metric
                  return_train_score=True,
                  n_jobs=-1,
                  verbose=5)

- fitting `GridSearchCV`

In [None]:
PREPOST_modelA = gs.fit(X_train_all, y_train)

- examining results

In [None]:
gs_results = pd.DataFrame(PREPOST_modelA.cv_results_).sort_values(by=['mean_test_roc_auc'],
                                                                  ascending=False)

In [None]:
display_cols = ['param_Classifier__C',
                'param_Classifier__penalty',
                'mean_test_roc_auc',
                'mean_train_roc_auc',
                'mean_test_f1',
                'mean_test_precision',
                'mean_test_recall']

In [None]:
gs_results_summary = gs_results.copy()[display_cols].reset_index(drop=True)

In [None]:
gs_results_summary.rename(columns={'param_Classifier__C': 'C',
                                   'param_Classifier__penalty': 'penalty'},
                          inplace=True)

In [None]:
gs_results_summary

#### Key Takeaways
- `LogisticRegression()` with `C=0.01` beats `STS`

### - Scenario B: `Model Stacking` - Incorporate Predictions from `PREOP` Training with `POSTOP` Data
- this time, we will make use of pre-saved `X_train_PREOP` and `X_train_POSTOP` feature matricies created above via `Pipeline` 

In [None]:
X_train_PREOP.shape, X_train_POSTOP.shape, y_train.shape

#### Step 1.  Generate vector of probabilities of stroke for using `PREOP` features and cross validated training set
- `LogisticRegression` hyperparameters were tuned in a separate notebook

In [None]:
lr_predict_probs = cross_val_predict(LogisticRegression(penalty='l1',
                                                        C=0.0275,
                                                        class_weight='balanced',
                                                        random_state=0,
                                                        solver='liblinear'),
                                     X_train_PREOP,
                                     y_train,
                                     cv=5,
                                     method='predict_proba',
                                     n_jobs=-1,
                                     verbose=5)

- `cross_val_predict` returns an `numpy.ndarray` with `shape` `(n, 2)` as it returns a probability per class `(0, 1)`
- since we are interested in the `1` class, need to carve our the second column of the `numpy.ndarray`
- add `[:,1]`

- examining `lr_predict_probs`

In [None]:
type(lr_predict_probs)

In [None]:
lr_predict_probs.shape

In [148]:
lr_predict_probs[0:5]

NameError: name 'lr_predict_probs' is not defined

In [None]:
lr_predict_probs[0:5, 1]

### Step 1a.  Generate additional vector of probabilities of stroke for using `PREOP` features and CV training set
- Not limited to stacking one `PREOP` model
- Running `RandomForestClassifier` `PREOP` model 
- hyperparameters were tuned in a separate notebook - using AP's highest scoring model
- going to assemble `X_train_PREOP_TREE` using `Pipelines`
- can utilize existing `PREOP_numerical_features_pipeline`
- need to define `PREOP_categorial_TREE_features_pipeline`
- use`FeatureUnion` to combine to create `PREOP` feature matrix usable for `RandomForestClassifier`

- creating `Pipeline` to assemble `PREOP_categorical_TREE_features_pipeline`

In [None]:
PREOP_categorical_TREE_features_pipeline = Pipeline(steps=[
    ('select', FeatureSelector(PREOP_categorical_TREE_col_names)),
    ('convert_numpy', FunctionTransformer(convert_df_to_numpy, validate=False))
    ])

In [None]:
PREOP_feature_matrix_TREE = FeatureUnion([('PREOP_num_features', PREOP_numerical_features_pipeline),
                                          ('PREOP_cat_tree_features', PREOP_categorical_TREE_features_pipeline)
                                         ])

- combining with `PREOP_numerical_features_pipeline` defined above to assemble `PREOP_feature_matrix_TREE`
- if plan is to assemble `feature_matrix` but not use in a modeling pipeline as is done below, then can assign to a variable name like `X_train_PREOP_TREE` -- did this above for `X_train_PREOP` 

### Taking a Slight Detour to Create `Pipelines` to demonstrate how to assemble `POSTOP_feature_matrix_TREE` using `sklearn` `Pipelines` 

- can use similar coding pattern to assemble `POSTOP_feature_matrix_TREE` if needed
    - `POSTOP_numerical_features_pipeline` has been previously created above
    - would need to create `POSTOP_categorical_TREE_features_pipeline` -- THIS HAS NOT BEEN DONE YET
    - then use `FeatureUnion()` to assemble `POSTOP_feature_matrix_TREE`
    - similar to what was done above in using `Pipeline`s to create `X_train_PREOP`, `X_train_POSTOP` and corresponding `X_dev` and `X_test` 

In [None]:
POSTOP_categorical_TREE_features_pipeline = Pipeline(steps=[
    ('select', FeatureSelector(POSTOP_categorical_TREE_col_names)),
    ('convert_numpy', FunctionTransformer(convert_df_to_numpy, validate=False))
    ])

- using `FeatureUnion` combining using with already created `POSTOP_numerical_features_pipeline` to assemble `POSTOP_feature_matrix_TREE`

In [None]:
POSTOP_feature_matrix_TREE = FeatureUnion([('POSTOP_num_features', POSTOP_numerical_features_pipeline),
                                           ('POSTOP_cat_tree_features', POSTOP_categorical_TREE_features_pipeline)
                                          ])

#### NOTE - as we did above:

- `fit_transform` only on `X_train_all` for `X_train_POSTOP_TREE`
- `transform` on `X_dev_all` and `X_test_all` for `X_dev_POSTOP_TREE` and `X_test_POSTOP_TREE`
- assign compreleted `feature_matrix` to variable name if not using directly in a `Pipeline` with classifier step

### Now back to modeling `PREOP_dataset_TREE` using `RandomForestClassifier()`

- creating `PREOP_RF_CLF_pipe`

In [None]:
PREOP_RF_clf_pipe = Pipeline(steps=[
    ('get_PREOP_TREE_feature_matrix', PREOP_feature_matrix_TREE),
    ('Classifier', RandomForestClassifier(n_estimators=315,
                                          max_features=0.1,
                                          max_depth=None,
                                          min_samples_split=50,
                                          class_weight='balanced',
                                          random_state=0))])

- in this scenario - assuming already have tuned the hyperparamenters
- alternatively, could have run a `GridSearchCV` first to tune the hyperparameters and then use the `best_estimator_` to get estimated probabilities of stroke with the code below

- running `RandomForestClassifer`

In [None]:
rf_predict_probs = cross_val_predict(PREOP_RF_clf_pipe,
                                     X_train_all,
                                     y_train,
                                     cv=5,
                                     method='predict_proba',
                                     n_jobs=-1,
                                     verbose=5)

- examining `rf_predict_probs`

In [None]:
rf_predict_probs.shape

In [None]:
rf_predict_probs[0:5, 1]

### Step 2: Incorporate vector of predicted probabilities into `X_train_POSTOP`
- adding `lr_predict_probs`
- adding `rf_predict_probs`
- adding `STS` model predicted probability using `predstro_train`

In [None]:
X_train_POSTOP_STACKED = X_train_POSTOP.copy()

In [None]:
X_train_POSTOP_STACKED.shape, X_train_POSTOP.shape

- adding vectors of `predict_probas` from `PREOP` models and `STS` models to `POSTOP` feature matrix

In [None]:
X_train_POSTOP_STACKED['LR_PREOP_PROBA'] = lr_predict_probs[:, 1]
X_train_POSTOP_STACKED['RF_PREOP_PROBA'] = rf_predict_probs[:, 1]
X_train_POSTOP_STACKED['STS_predstro'] = predstro_train

In [None]:
X_train_POSTOP_STACKED.head()

In [None]:
X_train_POSTOP_STACKED.shape

### Step 3: Model and Evaluate `X_train_POSTOP_STACKED`

- instantiating a `LogisticRegression()` object

In [None]:
log_reg_clf = LogisticRegression()

- will use scoring metrics defined above in previous `GridSearchCV`
- defining hyperparameter grid

In [None]:
hyperparameters = {'penalty': ['l1'],
                   'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0],
                   'class_weight': ['balanced'],
                   'solver': ['liblinear'],
                   'random_state': [0]
                  }

- create `GridSearchCV` object

In [None]:
gs_stacked = GridSearchCV(log_reg_clf, #classifier
                          param_grid=hyperparameters, #param_grid
                          scoring=scoring_metrics, #loss functions
                          cv=5, #number of folds, default=`StratifiedKFold`
                          refit='roc_auc', # best_estimator_ will be based on this scoring metric
                          return_train_score=True,
                          n_jobs=-1,
                          verbose=5)

- fitting `GridSearchCV`

In [None]:
PREPOST_modelB = gs_stacked.fit(X_train_POSTOP_STACKED, y_train)

- examining results

In [None]:
gs_results = pd.DataFrame(PREPOST_modelB.cv_results_).sort_values(by=['mean_test_roc_auc'],
                                                                      ascending=False)

In [None]:
display_cols = ['param_C',
                'mean_test_roc_auc',
                'mean_train_roc_auc',
                'mean_test_f1',
                'mean_test_precision',
                'mean_test_recall']

In [None]:
gs_results_summary = gs_results.copy()[display_cols].reset_index(drop=True)

In [None]:
gs_results_summary

### Key Takeaways
- similar to `Modeling Scenario A` achieved better results than `STS` with the `POSTOP` features
- results were slightly below that of `Modeling Scenario A` on a `mean_test_roc_auc` basis
- extend this stacking concept to add additional vectors of probability of stroke from other models `RandomForest` and `STS`
- adding the `RandomForest` predicted probabilities improved results slightly, but the addition of `STS` on top of that did not improve the `mean_test_roc_auc`
- instead of modeling the stacked `POSTOP` data with `LogisticRegression` can try another model such as `RandomForest` or `SVM`
- in addition can also try using `PolynomialFeatures` transformation using `Scenario B` to see if any benefit from modeling the interaction of the `PREOP` predictions with the additional `POSTOP` features
- should examine the non-zero coeficients in `POSTOP` to get insight into what `POSTOP` features are useful in enhancing stroke prediction

## CODE BELOW IS USEFUL FOR DIAGNOSTIC CHARTS ON MODEL SCORING

### Need to modify and adapt for code above given changes in variable names

In [None]:
break

# ---------------------------------------------------------------------------------------------

## Visualizing the Effect of `LogisticRegression()` Hyperparameter Values
- See Albon `11.13` pages 205-207
- range of values for `C` - inverse of regularization strength

In [None]:
param_values = np.linspace(0.001, 0.05, 25)

- calculating `roc_auc` on training and test set using `param_range` for `C`

In [None]:
train_scores, test_scores = validation_curve(LogisticRegression(penalty='l1',
                                                                class_weight='balanced',
                                                                random_state=0,
                                                                solver='liblinear'), #classifier
                                             X_train, #feature matrix
                                             y_train, #target vector
                                             param_name='C', #hyperparameter to examine,
                                             param_range=param_values, #range of hyperparameter values
                                             cv=5, #number of folds, default=`StratifiedKFold`
                                             scoring='roc_auc', #scoring metric
                                             n_jobs=-1, #use all computer cores
                                             verbose=5) 

- calculate `mean` and `standard deviation` for `training` set scores

In [None]:
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)

- calculate `mean` and `standard deviation` for `test` set scores

In [None]:
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

- Plotting

In [None]:
plt.rcParams["figure.figsize"] = (20, 10)
plt.style.use('fivethirtyeight')

# plot mean accuracy scores for training and test sets
plt.plot(param_values,
         train_mean,
         label='Training Score',
         color='midnightblue')

plt.plot(param_values,
         test_mean,
         label='Cross Validation Score',
         color='darkviolet')

# plot accuracy bands for training and test sets
plt.fill_between(param_values, 
                 train_mean - train_std,
                 train_mean + train_std,
                 color='lightsteelblue')

plt.fill_between(param_values,
                 test_mean - test_std,
                 test_mean + test_std,
                 color='thistle')

# create plot
plt.title('Validation Curve for Logistic Regression with l1 Regularization')
plt.xlabel('C')
plt.ylabel('AUCROC')
plt.legend(loc='lower right')
plt.show()       

In [None]:
validation_summary = pd.DataFrame({'C': param_values,
                                   'mean_test_roc_auc': test_mean,
                                   'mean_train_roc_auc': train_mean})

In [None]:
validation_summary[validation_summary['mean_test_roc_auc'] == np.max(validation_summary['mean_test_roc_auc'])]

#### How does this tuned model compare to the baseline `STS` Model

In [None]:
round(roc_auc_score(y_train, predstro_train), 5)

#### Key Takeaways
- Fine tuning of hyperparmeter `C` results in `C=0.0275`
- Results in-line (but still lower) than `STS` Model

## Precision-Recall Curve for Tuned Logistic Regression Model
- `Hands On Machine Learning with Scikit-Learn & Tensorflow` by Aurelien Geron pages 83-93
#### How `cross_val_predict` works:
- `cross_val_predict` returns for each element in the input, the prediction that was obtained for that element when it was in the `test` set

- generating vector of probabilities of stroke for cross validated training set

In [None]:
lr_predict_probs = cross_val_predict(LogisticRegression(penalty='l1',
                                                        C=0.0275,
                                                        class_weight='balanced',
                                                        random_state=0,
                                                        solver='liblinear'),
                                     X_train,
                                     y_train,
                                     cv=5,
                                     method='predict_proba',
                                     n_jobs=-1,
                                     verbose=5)

- using `lr_predict_probs` can compute precision and recall for all possible thresholds using the `precision_recall_curve` function
- returns an `numpy.ndarray` with `shape` `(n, 2)` as it returns a probability per class `(0, 1)`
- since we are interested in the `1` class, need to carve our the second column of the `numpy.ndarray`
- add `[:,1]`

In [None]:
type(lr_predict_probs)

In [None]:
lr_predict_probs.shape

In [None]:
lr_predict_probs[0:5]

In [None]:
lr_predict_probs[0:5, 1]

#### Generating `precision_recall_curve`

In [None]:
precisions, recalls, thresholds = precision_recall_curve(y_train,
                                                         lr_predict_probs[:, 1])

- `precision` values such that element `i` is the the `precision` of predictions with `score>=threholds[i]` and the last element is `1`

In [None]:
precisions[0:5] # first 5 elements of the numpy array

In [None]:
precisions[-5:] # last 5 elements of the numpy array

- decreasing `recall` values such that element `i` is the the `precision` of predictions with `score>=threholds[i]` and the last element is `0`

In [None]:
recalls[0:5]

In [None]:
recalls[-5:]

- increasing `thresholds` on the decision function used to compute `precision` and `recall`

In [None]:
thresholds[0:5]

In [None]:
thresholds[-5:]

### Plotting `precision` and `recall` as functions of `threshold`

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], 'b--', label='Precision')
    plt.plot(thresholds, recalls[:-1], 'g-', label='Recall')
    plt.title('Precision and Recall versus Threshold')
    plt.xlabel('Threshold')
    plt.legend(loc='upper right')
    plt.ylim([0, 1])

### Precision and Recall versus Threshold for Logistic Regression Model

In [None]:
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()

- helper function

In [None]:
def find_nearest(array, value):
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return array[idx], idx

- examining specific `threshold`, `Recall` and `Precision` trade-offs
- suppose you wanted a `recall` of `0.80`, what `threshold` will you need and what will the `precision` be?

In [None]:
find_nearest(recalls, 0.80)

In [None]:
print('{0:15} {1:5f}'.format('Recall:', recalls[14806]))
print('{0:15} {1:5f}'.format('Precision:', precisions[14806]))
print('{0:15} {1:5f}'.format('Threshold:', thresholds[14806]))

- see `Geron` page 90 for additional details

### Precision and Recall versus Threshold for STS Model

In [None]:
sts_precisions, sts_recalls, sts_thresholds = precision_recall_curve(y_train,
                                                                     predstro_train)

In [None]:
plot_precision_recall_vs_threshold(sts_precisions, sts_recalls, sts_thresholds)
plt.show()

## Precision-Recall Curves
- `Introduction to Machine Learning with Python` by Andreas C. Muller and Sarah Guido pages 289-296
- the closer a `precision-recall curve` stays to the upper-right corner, the better classifier

In [None]:
def plot_precision_recall_curve(precisions, recalls):
    plt.plot(precisions,
             recalls,
             color='blue',
             label='Precision-Recall Curve')
    plt.title('Precision-Recall Curve')
    plt.ylabel('Recall')
    plt.xlabel('Precision')
    plt.legend(loc='upper right')

### `Precision-Recall Curve` for `LogisticRegression` Model

In [None]:
plot_precision_recall_curve(precisions, recalls)
plt.show()

#### Area Under the `Precision-Recall Curve` for `LogisticRegression` Model

In [None]:
round(auc(recalls, precisions), 5)

### `Precision-Recall Curve` for `STS` Model

In [None]:
plot_precision_recall_curve(sts_precisions, sts_recalls)
plt.show()

#### Area Under the `Precision-Recall Curve` for `STS` Model

In [None]:
round(auc(sts_recalls, sts_precisions), 5)

#### Key Takeaways
- `STS` model is superior to `LogisticRegression` model looking at `Precision-Recall Curves` and `PR AUC`

### Comparing `Logistic Regression` and `STS` Model `Precision-Recall` Curves

In [None]:
plt.rcParams["figure.figsize"] = (20, 10)
plt.style.use('fivethirtyeight')

# plotting `LogisticRegression` Precision-Recall Curve
plt.plot(precisions,
         recalls,
         'b--',
         label='Logistic Regression Model')

# plotting `STS Model` Precision-Recall Curve
plt.plot(sts_precisions,
         sts_recalls,
         'g-',
         label='STS Model')

plt.title('Logistic Regression, STS Model Precision-Recall Curves')
plt.ylabel('Recall')
plt.xlabel('Precision')
plt.legend(loc='upper right')
plt.xlim([0, 0.10]) # cut off values > 0.10 because they were essentially zero
plt.show()

#### Key Takeaways
- plot clearly shows `STS` model outperperformance over `LogisticRegression`
- can use basic code here to plot `Precision-Recall Curves` from other models on the same plot for comparison

## Receiver Operating Characteristic (`ROC`) Curves
- Albon `11.5` pages 189-192
- a classifier that predicts every observation correctly would look like the solid `gray` line in the plotted `ROC` curves below
- the solid `gray` line goes straight up to the top immediately
- a classifier that predicts at random will appear as the diagonal line
- the better the model, the closer it is to the solid `gray` line
- the `ROC` curve represents the respective `TPR` and `FPR` for every probability threshold

- creating `true` and `false` probabilities

In [None]:
false_positive_rate, true_positive_rate, threshold = roc_curve(y_train,
                                                               lr_predict_probs[:, 1])

- `plot_roc_curve` function

In [None]:
def plot_roc_curve(fpr, tpr):
    plt.title('Receiver Operating Curve (ROC)')
    plt.plot(fpr, tpr)
    plt.plot([0, 1], ls='--')
    plt.plot([0, 0],
             [1, 0],
             c='0.7'),
    plt.plot([1, 1], c='0.7')
    plt.ylabel('True Positive Rate (Recall)')
    plt.xlabel('False Positive Rate')

#### `LogisiticRegression` Model `ROC` Curve

In [None]:
plt.rcParams["figure.figsize"] = (20, 10)
plt.style.use('fivethirtyeight')
plot_roc_curve(false_positive_rate, true_positive_rate)
plt.show()

In [None]:
round(roc_auc_score(y_train, lr_predict_probs[:, 1]), 5)

- suppose you want `recall` in a certain range - what is the `false positive rate` you would have to accept?

In [None]:
lr_roc_curve = pd.DataFrame({'Recall': true_positive_rate,
                             'FPR': false_positive_rate})

- assume you want `recall` around `0.80`

In [None]:
lr_roc_curve[lr_roc_curve['Recall'].between(0.799, 0.801)]

- could have also used the `find_nearest` helper function above

In [None]:
lr_roc_curve[lr_roc_curve['Recall'] == (find_nearest(lr_roc_curve['Recall'], 0.80)[0])]

- suppose you want to know `recall` and `FPR` for a given threshold

In [149]:
np.where(threshold == 0.10)[0]

NameError: name 'threshold' is not defined

In [None]:
print('{0:15} {1:5f}'.format('Threshold:', threshold[11]))
print('{0:15} {1:5f}'.format('Recall:', true_positive_rate[11]))
print('{0:15} {1:5f}'.format('FPR:', false_positive_rate[11]))

#### `STS` Model `ROC` Curve

In [None]:
sts_fpr, sts_tpr, threshold = roc_curve(y_train,
                                        predstro_train)

In [None]:
plt.rcParams["figure.figsize"] = (20, 10)
plt.style.use('fivethirtyeight')
plot_roc_curve(sts_fpr, sts_tpr)
plt.show()

In [None]:
round(roc_auc_score(y_train, predstro_train), 5)

#### Key Takeaways
- `STS` Model slightly outperforms `LogisticRegression`