# Capstone Project

## Pre-Processing Notebook

### `PREOP + POSTOP DATASET` from `11/9/19` for Use with `sklearn` `Pipelines`

#### Importing Libraries

In [1]:
%matplotlib inline

# general libraries
import re
import string
import sys
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing date libraries
import datetime as dt
import dateutil.parser as dparser

# scikit-learn libraries for preprocessing
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# scikit-learn libraries for constructing pipelines
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# scikit-learn libraries for clustering and dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.mixture import GaussianMixture

# scikit-learn libraries for evaluation
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# saving models
import pickle
try:
    # standalone package; the vendored `sklearn.externals.joblib` was
    # deprecated in scikit-learn 0.21 and removed in 0.23
    import joblib
except ImportError:
    # fallback for old environments that only ship the vendored copy
    from sklearn.externals import joblib

# setting pandas display options
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 10000)
pd.set_option('display.max_colwidth', 100)
# use the fully-qualified key: the bare 'precision' pattern can match more
# than one option in newer pandas releases and then raises OptionError
pd.set_option('display.precision', 5)
# NOTE: silences SettingWithCopyWarning for the whole notebook
pd.options.mode.chained_assignment = None



#### Directory/File Structure

In [2]:
sys.version

'3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) \n[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]'

In [3]:
# report the versions of the core libraries this notebook was run with
print(*('Running {} version: {}'.format(lib_name, lib.__version__)
        for lib_name, lib in (('pandas', pd), ('numpy', np), ('sklearn', sklearn))),
      sep='\n')

Running pandas version: 0.25.1
Running numpy version: 1.14.2
Running sklearn version: 0.21.3


In [4]:
os.getcwd()

'/Users/nate_velarde/Documents/UC_Berkeley/Courses/W210_Capstone/stroke_project/sandbox/notebooks'

In [5]:
# Move into the sibling `data` directory only when not already inside it, so
# re-running this cell does not try to step one level further each time.
if os.path.basename(os.getcwd()) != 'data':
    os.chdir('../data')

In [6]:
sorted(os.listdir())

['.Capstone - STS risk factor list.xlsx.icloud',
 '.DS_Store',
 '.capstone_STS_risk_factor_features.xlsx.icloud',
 '.capstone_data-version-2.xlsx.icloud',
 '.capstone_data.xlsx.icloud',
 '.capstone_data_binarized_outcome.xlsx.icloud',
 '.capstone_data_filled_in_complication_data.xlsx.icloud',
 '273_vs_281_null_count_by_feature.csv',
 '273_vs_281_null_count_by_feature.xlsm',
 'Capstone - Complication list - complete.xlsx',
 'Capstone_Fall_Shannon_Sept2019_request.csv',
 'POSTOP_ALL_col_names_11_9.pkl',
 'POSTOP_TREE_ALL_col_names_11_9.pkl',
 'POSTOP_categorical_TREE_col_names_11_9.pkl',
 'POSTOP_categorical_col_names_11_9.pkl',
 'POSTOP_numerical_col_names_11_9.pkl',
 'PREOP_ALL_col_names_11_9.pkl',
 'PREOP_TREE_ALL_col_names_11_9.pkl',
 'PREOP_categorical_TREE_col_names_11_9.pkl',
 'PREOP_categorical_col_names_11_9.pkl',
 'PREOP_dataset_10_24.pkl',
 'PREOP_dataset_10_27.pkl',
 'PREOP_dataset_TREE_10_24.pkl',
 'PREOP_dataset_TREE_10_27.pkl',
 'PREOP_numerical_col_names_11_9.pkl',
 'PRE_

### Loading Dataset

#### `PREOP + POSTOP DATASET` from `11/9/19`

In [7]:
# Relative filename: it is resolved against the current working directory,
# which the earlier `os.chdir('../data')` cell pointed at the data folder.
data = pd.read_pickle('PRE_plus_POST_dataset_11_9.pkl')

In [8]:
data.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_STRENUOUS_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_ANGINA_AT_REST,classnyh_SLIGHT_LIMITATION,classnyh_MARKED_LIMITATION,classnyh_ANY_ACTIVITY,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin,strokeBin2,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,ceroxused,circarr,cofirstind,concalc,cperfutil,IABP,ibldprod,imedeaca,imedtran,inoptee,mtopd,ocarasd,ocarvsd,opocard,oponcard,unplao,unplav,unplmv,unplvad,valexp2,vsmvpr,asmtascaa,cotafib,cotarrst,ecmo,mt30stat,
opcab,opvalve,vadproc,valexppos2,vsavpr,vsmv,aortoccl_AoXC,aortoccl_Balloon,asmtaodx_Normal,asmtaodx_Thickening,asmtaodx_atheroma<5mm,asmtaodx_atheroma>5mm,asmtaodx_mobile_plaques,cathbasassistind_BP_instability,cathbasassistind_CPB_wean,cathbasassistind_PCI_failure,cathbasassistwhen_Intra_OP,cathbasassistwhen_Post_OP,cathbasassistwhen_Pre_OP,cpbutil_Combination,cpbutil_Full_CPB,cperftyp_Antegrade,cperftyp_Both,cperftyp_Retrograde,ecmoind_Cadiac_Failure,ecmoind_Rescue_Salvage,ecmoind_Resp_Failure,ecmowhen_Intra_OP,ecmowhen_Post_OP,ecmowhen_Pre_OP,emergrsn_Anatomy,emergrsn_Angio_Accident,emergrsn_Ao_Dissect,emergrsn_Evolvg_MI,emergrsn_Hybrid,emergrsn_Infect_Device,emergrsn_Ischemia,emergrsn_Pulm_Edema,emergrsn_Shock_Circ_Supp,emergrsn_Shock_No_Circ_Supp,emergrsn_Syncope,emergrsn_Trauma,emergrsn_Valve_Dysf,iabpind_Angina,iabpind_CPB_Wean_Failure,iabpind_Hemodyn_Instab,iabpind_Procedure_Support,iabpind_Prophylactic,iabpwhen_Intra_OP,iabpwhen_Post_OP,iabpwhen_Pre_OP,mtcause_Cardiac,mtcause_Infection,mtcause_Neuro,mtcause_Pulmonary,mtcause_Renal,mtcause_Vascular,readmrsn_Stroke,readmrsn_TIA,unplproc_Yes_Complication,unplproc_Yes_Disease,urgntrsn_AMI,urgntrsn_Anatomy,urgntrsn_Angio_Accid,urgntrsn_Ao_Dissect,urgntrsn_CHF,urgntrsn_CP,urgntrsn_Hybrid,urgntrsn_IABP,urgntrsn_Infect_Dev,urgntrsn_PCI_Fail,urgntrsn_Rest_Angina,urgntrsn_Syncope,urgntrsn_Trauma,urgntrsn_USA,urgntrsn_Valve_Dysfunctn
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1.0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.014,2,0,2,0,0,,0.0,0.0,,,,,,28.0,29.0,150.0,1.2,,,108.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0,0,0.0,1.0,1,1,0.0,0.0,0.0,1.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2,0.017,2,0,1,0,0,,0.0,0.0,,,,,,29.0,32.0,70.0,1.1,,,40.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
data.shape

(42740, 239)

### Dropping Irrelevant Columns

- `recordId`
- `cnstrokp`
- `cnstrokttia`
- `cncomaenceph`
- `strokeBin` as we are going to use the more inclusive `strokeBin2` as our outcome variable

#### Relative to prior versions, we are also going to retain `predstro`, which will be used in our `thresholding` analysis

In [10]:
# names of the columns removed from `data` before building X and y
cols_to_drop = 'recordId cnstrokp cnstrokttia cncomaenceph strokeBin'.split()

In [11]:
len(cols_to_drop)

5

- dropping columns

In [12]:
# `errors='ignore'` keeps this cell re-runnable: it overwrites `data`, so on a
# second execution the columns are already gone and a plain drop would raise
# KeyError. On the first run the result is identical to the strict drop.
data = data.drop(columns=cols_to_drop, errors='ignore')

In [13]:
data.shape

(42740, 234)

- resetting `DataFrame` `index` 

In [14]:
# renumber the rows 0..n-1; drop=True discards the old index rather than
# inserting it as a new column
data = data.reset_index(drop=True)

In [15]:
data.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_STRENUOUS_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_ANGINA_AT_REST,classnyh_SLIGHT_LIMITATION,classnyh_MARKED_LIMITATION,classnyh_ANY_ACTIVITY,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%,predstro,strokeBin2,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,ceroxused,circarr,cofirstind,concalc,cperfutil,IABP,ibldprod,imedeaca,imedtran,inoptee,mtopd,ocarasd,ocarvsd,opocard,oponcard,unplao,unplav,unplmv,unplvad,valexp2,vsmvpr,asmtascaa,cotafib,cotarrst,ecmo,mt30stat,opcab,opvalve,vadproc,valexppos2,vsavpr,vsmv,aortoccl
_AoXC,aortoccl_Balloon,asmtaodx_Normal,asmtaodx_Thickening,asmtaodx_atheroma<5mm,asmtaodx_atheroma>5mm,asmtaodx_mobile_plaques,cathbasassistind_BP_instability,cathbasassistind_CPB_wean,cathbasassistind_PCI_failure,cathbasassistwhen_Intra_OP,cathbasassistwhen_Post_OP,cathbasassistwhen_Pre_OP,cpbutil_Combination,cpbutil_Full_CPB,cperftyp_Antegrade,cperftyp_Both,cperftyp_Retrograde,ecmoind_Cadiac_Failure,ecmoind_Rescue_Salvage,ecmoind_Resp_Failure,ecmowhen_Intra_OP,ecmowhen_Post_OP,ecmowhen_Pre_OP,emergrsn_Anatomy,emergrsn_Angio_Accident,emergrsn_Ao_Dissect,emergrsn_Evolvg_MI,emergrsn_Hybrid,emergrsn_Infect_Device,emergrsn_Ischemia,emergrsn_Pulm_Edema,emergrsn_Shock_Circ_Supp,emergrsn_Shock_No_Circ_Supp,emergrsn_Syncope,emergrsn_Trauma,emergrsn_Valve_Dysf,iabpind_Angina,iabpind_CPB_Wean_Failure,iabpind_Hemodyn_Instab,iabpind_Procedure_Support,iabpind_Prophylactic,iabpwhen_Intra_OP,iabpwhen_Post_OP,iabpwhen_Pre_OP,mtcause_Cardiac,mtcause_Infection,mtcause_Neuro,mtcause_Pulmonary,mtcause_Renal,mtcause_Vascular,readmrsn_Stroke,readmrsn_TIA,unplproc_Yes_Complication,unplproc_Yes_Disease,urgntrsn_AMI,urgntrsn_Anatomy,urgntrsn_Angio_Accid,urgntrsn_Ao_Dissect,urgntrsn_CHF,urgntrsn_CP,urgntrsn_Hybrid,urgntrsn_IABP,urgntrsn_Infect_Dev,urgntrsn_PCI_Fail,urgntrsn_Rest_Angina,urgntrsn_Syncope,urgntrsn_Trauma,urgntrsn_USA,urgntrsn_Valve_Dysfunctn
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1.0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.014,0,,0.0,0.0,,,,,,28.0,29.0,150.0,1.2,,,108.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0,0,0.0,1.0,1,1,0.0,0.0,0.0,1.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.017,0,,0.0,0.0,,,,,,29.0,32.0,70.0,1.1,,,40.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,83,162.60001,102.1,38.61754,29.0,1.2,3.3,6.2,8.6,60.0,36.0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1.0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.045,0,,0.0,0.0,,,,,,,,,1.4,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,59,160.0,127.5,49.80469,35.0,0.9,3.5,7.4,6.4,60.0,35.0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.013,0,,0.0,0.0,,0.0,0.0,0.0,2.0,19.0,34.8,73.0,1.2,,,47.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0,0.0,1.0,1,1,0.0,0.0,0.0,1.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,72,160.0,64.0,25.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.016,0,,0.0,0.0,,0.0,0.0,0.0,1.0,19.0,34.6,70.0,0.8,,,40.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
data.tail()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_STRENUOUS_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_ANGINA_AT_REST,classnyh_SLIGHT_LIMITATION,classnyh_MARKED_LIMITATION,classnyh_ANY_ACTIVITY,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%,predstro,strokeBin2,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,ceroxused,circarr,cofirstind,concalc,cperfutil,IABP,ibldprod,imedeaca,imedtran,inoptee,mtopd,ocarasd,ocarvsd,opocard,oponcard,unplao,unplav,unplmv,unplvad,valexp2,vsmvpr,asmtascaa,cotafib,cotarrst,ecmo,mt30stat,opcab,opvalve,vadproc,valexppos2,vsavpr,vsmv,aortoccl
_AoXC,aortoccl_Balloon,asmtaodx_Normal,asmtaodx_Thickening,asmtaodx_atheroma<5mm,asmtaodx_atheroma>5mm,asmtaodx_mobile_plaques,cathbasassistind_BP_instability,cathbasassistind_CPB_wean,cathbasassistind_PCI_failure,cathbasassistwhen_Intra_OP,cathbasassistwhen_Post_OP,cathbasassistwhen_Pre_OP,cpbutil_Combination,cpbutil_Full_CPB,cperftyp_Antegrade,cperftyp_Both,cperftyp_Retrograde,ecmoind_Cadiac_Failure,ecmoind_Rescue_Salvage,ecmoind_Resp_Failure,ecmowhen_Intra_OP,ecmowhen_Post_OP,ecmowhen_Pre_OP,emergrsn_Anatomy,emergrsn_Angio_Accident,emergrsn_Ao_Dissect,emergrsn_Evolvg_MI,emergrsn_Hybrid,emergrsn_Infect_Device,emergrsn_Ischemia,emergrsn_Pulm_Edema,emergrsn_Shock_Circ_Supp,emergrsn_Shock_No_Circ_Supp,emergrsn_Syncope,emergrsn_Trauma,emergrsn_Valve_Dysf,iabpind_Angina,iabpind_CPB_Wean_Failure,iabpind_Hemodyn_Instab,iabpind_Procedure_Support,iabpind_Prophylactic,iabpwhen_Intra_OP,iabpwhen_Post_OP,iabpwhen_Pre_OP,mtcause_Cardiac,mtcause_Infection,mtcause_Neuro,mtcause_Pulmonary,mtcause_Renal,mtcause_Vascular,readmrsn_Stroke,readmrsn_TIA,unplproc_Yes_Complication,unplproc_Yes_Disease,urgntrsn_AMI,urgntrsn_Anatomy,urgntrsn_Angio_Accid,urgntrsn_Ao_Dissect,urgntrsn_CHF,urgntrsn_CP,urgntrsn_Hybrid,urgntrsn_IABP,urgntrsn_Infect_Dev,urgntrsn_PCI_Fail,urgntrsn_Rest_Angina,urgntrsn_Syncope,urgntrsn_Trauma,urgntrsn_USA,urgntrsn_Valve_Dysfunctn
42735,62,182.89999,98.3,29.38503,43.3,0.9,4.1,5.5,6.4,50.0,33.0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.007,0,,0.0,0.0,,,,,,30.0,34.9,77.0,0.8,,,66.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
42736,82,165.10001,74.9,27.47816,31.3,1.59,4.0,7.8,10.85,30.0,,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.044,0,,0.0,0.0,,,,,,23.0,34.7,142.0,2.5,,,122.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.0,1.0,1,1,0.0,0.0,1.0,0.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
42737,66,175.3,75.3,24.50367,46.2,0.83,3.8,5.3,7.47,58.0,,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.007,0,,0.0,0.0,,,,,,41.0,34.6,,0.8,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
42738,62,165.10001,107.5,39.43794,46.1,0.77,3.8,5.3,7.47,55.0,,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.008,0,,0.0,0.0,,,,,,32.0,35.3,72.0,0.7,,,54.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
42739,78,180.3,87.5,26.91638,40.4,1.14,3.8,5.8,8.73,50.0,33.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.009,1,,0.0,0.0,,,,,,28.0,35.6,100.0,1.5,,,57.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [17]:
data.shape

(42740, 234)

### Feature Matrix `X`
- `strokeBin2` is the outcome vector (`strokeBin` was dropped above), so the next cell removes it from the feature matrix

In [18]:
# feature matrix: every column except the outcome (`drop` returns a new frame,
# so `data` itself is left untouched)
X = data.drop(columns='strokeBin2')

In [19]:
# give the feature matrix a fresh 0..n-1 index (old index is discarded)
X = X.reset_index(drop=True)

In [20]:
X.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_STRENUOUS_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_ANGINA_AT_REST,classnyh_SLIGHT_LIMITATION,classnyh_MARKED_LIMITATION,classnyh_ANY_ACTIVITY,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%,predstro,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,ceroxused,circarr,cofirstind,concalc,cperfutil,IABP,ibldprod,imedeaca,imedtran,inoptee,mtopd,ocarasd,ocarvsd,opocard,oponcard,unplao,unplav,unplmv,unplvad,valexp2,vsmvpr,asmtascaa,cotafib,cotarrst,ecmo,mt30stat,opcab,opvalve,vadproc,valexppos2,vsavpr,vsmv,aortoccl_AoXC,aorto
ccl_Balloon,asmtaodx_Normal,asmtaodx_Thickening,asmtaodx_atheroma<5mm,asmtaodx_atheroma>5mm,asmtaodx_mobile_plaques,cathbasassistind_BP_instability,cathbasassistind_CPB_wean,cathbasassistind_PCI_failure,cathbasassistwhen_Intra_OP,cathbasassistwhen_Post_OP,cathbasassistwhen_Pre_OP,cpbutil_Combination,cpbutil_Full_CPB,cperftyp_Antegrade,cperftyp_Both,cperftyp_Retrograde,ecmoind_Cadiac_Failure,ecmoind_Rescue_Salvage,ecmoind_Resp_Failure,ecmowhen_Intra_OP,ecmowhen_Post_OP,ecmowhen_Pre_OP,emergrsn_Anatomy,emergrsn_Angio_Accident,emergrsn_Ao_Dissect,emergrsn_Evolvg_MI,emergrsn_Hybrid,emergrsn_Infect_Device,emergrsn_Ischemia,emergrsn_Pulm_Edema,emergrsn_Shock_Circ_Supp,emergrsn_Shock_No_Circ_Supp,emergrsn_Syncope,emergrsn_Trauma,emergrsn_Valve_Dysf,iabpind_Angina,iabpind_CPB_Wean_Failure,iabpind_Hemodyn_Instab,iabpind_Procedure_Support,iabpind_Prophylactic,iabpwhen_Intra_OP,iabpwhen_Post_OP,iabpwhen_Pre_OP,mtcause_Cardiac,mtcause_Infection,mtcause_Neuro,mtcause_Pulmonary,mtcause_Renal,mtcause_Vascular,readmrsn_Stroke,readmrsn_TIA,unplproc_Yes_Complication,unplproc_Yes_Disease,urgntrsn_AMI,urgntrsn_Anatomy,urgntrsn_Angio_Accid,urgntrsn_Ao_Dissect,urgntrsn_CHF,urgntrsn_CP,urgntrsn_Hybrid,urgntrsn_IABP,urgntrsn_Infect_Dev,urgntrsn_PCI_Fail,urgntrsn_Rest_Angina,urgntrsn_Syncope,urgntrsn_Trauma,urgntrsn_USA,urgntrsn_Valve_Dysfunctn
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1.0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.014,,0.0,0.0,,,,,,28.0,29.0,150.0,1.2,,,108.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0,0,0.0,1.0,1,1,0.0,0.0,0.0,1.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.017,,0.0,0.0,,,,,,29.0,32.0,70.0,1.1,,,40.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [21]:
X.shape

(42740, 233)

### Target Vector `y`

In [22]:
# outcome vector as an independent Series (copy only the one column needed)
y = data['strokeBin2'].copy()

In [23]:
type(y)

pandas.core.series.Series

In [24]:
y.head(2)

0    0
1    0
Name: strokeBin2, dtype: int64

In [25]:
y.shape

(42740,)

In [26]:
y.unique()

array([0, 1])

In [27]:
y.value_counts()

0    42012
1      728
Name: strokeBin2, dtype: int64

### `X_train`, `X_devtest`, `y_train`, `y_devtest`
- using `train_test_split` with the `stratify` parameter to ensure the relative proportions of the outcome classes are the same in the `train`, `dev` and `test` sets
- observation split will be `80/10/10` between `train`, `dev`, `test`

In [28]:
# hold out 20% of the rows (later halved into dev and test); stratified on the
# outcome and seeded so the split is reproducible
X_train, X_devtest, y_train, y_devtest = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

#### validating `train_test_split`

In [29]:
X.shape

(42740, 233)

In [30]:
# expected row count of the 20% hold-out, to compare with `X_devtest.shape`
np.rint(X.shape[0] * 0.20)

8548.0

In [31]:
X_train.shape, X_devtest.shape, y_train.shape, y_devtest.shape

((34192, 233), (8548, 233), (34192,), (8548,))

In [32]:
# 0 when the train and dev+test row counts add up to the full feature matrix
X.shape[0] - X_train.shape[0] - X_devtest.shape[0]

0

In [33]:
# 0 when the train and dev+test label counts add up to the full outcome vector
y.shape[0] - y_train.shape[0] - y_devtest.shape[0]

0

#### validating `stratify` worked
- relative proportion of classes in `y`

In [34]:
# share of class 0, then class 1, in the full outcome vector (one per line)
print(*(np.round(y.value_counts()[label] / y.shape[0], 4) for label in (0, 1)),
      sep='\n')

0.983
0.017


- relative proportion of classes in `y_train`

In [35]:
# share of class 0, then class 1, in the training labels (one per line)
print(*(np.round(y_train.value_counts()[label] / y_train.shape[0], 4) for label in (0, 1)),
      sep='\n')

0.983
0.017


- relative proportion of classes in `y_devtest`

In [36]:
# share of class 0, then class 1, in the dev+test labels (one per line)
print(*(np.round(y_devtest.value_counts()[label] / y_devtest.shape[0], 4) for label in (0, 1)),
      sep='\n')

0.9829
0.0171


### `X_dev`, `X_test`, `y_dev`, `y_test`

In [37]:
# halve the hold-out into dev and test; stratified on the outcome and seeded
# so the split is reproducible
X_dev, X_test, y_dev, y_test = train_test_split(
    X_devtest, y_devtest, test_size=0.5, random_state=0, stratify=y_devtest
)

- validating `train_test_split`

In [38]:
X_devtest.shape

(8548, 233)

In [39]:
# expected row count of each half of the hold-out (50/50 dev/test split)
np.rint(X_devtest.shape[0] * 0.50)

4274.0

In [40]:
# shapes of all four pieces of the dev/test split; the last entry must be
# `y_test` (it previously repeated `y_dev`, so the test target was never checked)
X_dev.shape, X_test.shape, y_dev.shape, y_test.shape

((4274, 233), (4274, 233), (4274,), (4274,))

In [41]:
# 0 when the dev and test row counts add up to the dev+test hold-out
X_devtest.shape[0] - X_dev.shape[0] - X_test.shape[0]

0

In [42]:
# 0 when the dev and test label counts add up to the dev+test hold-out
y_devtest.shape[0] - y_dev.shape[0] - y_test.shape[0]

0

#### validating `stratify` worked
- relative proportion of classes in `y`

In [43]:
# share of class 0, then class 1, in the full outcome vector (one per line)
print(*(np.round(y.value_counts()[label] / y.shape[0], 4) for label in (0, 1)),
      sep='\n')

0.983
0.017


- relative proportion of classes in `y_devtest`

In [44]:
# share of class 0, then class 1, in the dev+test labels (one per line)
print(*(np.round(y_devtest.value_counts()[label] / y_devtest.shape[0], 4) for label in (0, 1)),
      sep='\n')

0.9829
0.0171


- relative proportion of classes in `y_dev`

In [45]:
# share of class 0, then class 1, in the dev labels (one per line)
print(*(np.round(y_dev.value_counts()[label] / y_dev.shape[0], 4) for label in (0, 1)),
      sep='\n')

0.9829
0.0171


- relative proportion of classes in `y_test`

In [46]:
# share of class 0, then class 1, in the test labels (one per line)
print(*(np.round(y_test.value_counts()[label] / y_test.shape[0], 4) for label in (0, 1)),
      sep='\n')

0.9829
0.0171


### Resetting Indices

In [47]:
# renumber the training features and labels 0..n-1 (old shuffled index dropped)
X_train, y_train = (part.reset_index(drop=True) for part in (X_train, y_train))

In [48]:
# renumber the dev features and labels 0..n-1 (old shuffled index dropped)
X_dev, y_dev = (part.reset_index(drop=True) for part in (X_dev, y_dev))

In [49]:
# renumber the test features and labels 0..n-1 (old shuffled index dropped)
X_test, y_test = (part.reset_index(drop=True) for part in (X_test, y_test))

### Extracting `predstro` from `X_train`, `X_dev` and `X_test`
- and subsequently creating `predstro_train`, `predstro_dev`, `predstro_test`

- `train`

In [50]:
X_train.shape

(34192, 233)

In [51]:
# keep the `predstro` column as its own Series; the column is removed from
# `X_train` a few cells further down
predstro_train = X_train['predstro']

In [52]:
predstro_train.shape

(34192,)

In [53]:
# remove `predstro` from the training features
X_train = X_train.drop(columns=['predstro'])

In [54]:
X_train.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_STRENUOUS_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_ANGINA_AT_REST,classnyh_SLIGHT_LIMITATION,classnyh_MARKED_LIMITATION,classnyh_ANY_ACTIVITY,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,ceroxused,circarr,cofirstind,concalc,cperfutil,IABP,ibldprod,imedeaca,imedtran,inoptee,mtopd,ocarasd,ocarvsd,opocard,oponcard,unplao,unplav,unplmv,unplvad,valexp2,vsmvpr,asmtascaa,cotafib,cotarrst,ecmo,mt30stat,opcab,opvalve,vadproc,valexppos2,vsavpr,vsmv,aortoccl_AoXC,aortoccl_Ballo
on,asmtaodx_Normal,asmtaodx_Thickening,asmtaodx_atheroma<5mm,asmtaodx_atheroma>5mm,asmtaodx_mobile_plaques,cathbasassistind_BP_instability,cathbasassistind_CPB_wean,cathbasassistind_PCI_failure,cathbasassistwhen_Intra_OP,cathbasassistwhen_Post_OP,cathbasassistwhen_Pre_OP,cpbutil_Combination,cpbutil_Full_CPB,cperftyp_Antegrade,cperftyp_Both,cperftyp_Retrograde,ecmoind_Cadiac_Failure,ecmoind_Rescue_Salvage,ecmoind_Resp_Failure,ecmowhen_Intra_OP,ecmowhen_Post_OP,ecmowhen_Pre_OP,emergrsn_Anatomy,emergrsn_Angio_Accident,emergrsn_Ao_Dissect,emergrsn_Evolvg_MI,emergrsn_Hybrid,emergrsn_Infect_Device,emergrsn_Ischemia,emergrsn_Pulm_Edema,emergrsn_Shock_Circ_Supp,emergrsn_Shock_No_Circ_Supp,emergrsn_Syncope,emergrsn_Trauma,emergrsn_Valve_Dysf,iabpind_Angina,iabpind_CPB_Wean_Failure,iabpind_Hemodyn_Instab,iabpind_Procedure_Support,iabpind_Prophylactic,iabpwhen_Intra_OP,iabpwhen_Post_OP,iabpwhen_Pre_OP,mtcause_Cardiac,mtcause_Infection,mtcause_Neuro,mtcause_Pulmonary,mtcause_Renal,mtcause_Vascular,readmrsn_Stroke,readmrsn_TIA,unplproc_Yes_Complication,unplproc_Yes_Disease,urgntrsn_AMI,urgntrsn_Anatomy,urgntrsn_Angio_Accid,urgntrsn_Ao_Dissect,urgntrsn_CHF,urgntrsn_CP,urgntrsn_Hybrid,urgntrsn_IABP,urgntrsn_Infect_Dev,urgntrsn_PCI_Fail,urgntrsn_Rest_Angina,urgntrsn_Syncope,urgntrsn_Trauma,urgntrsn_USA,urgntrsn_Valve_Dysfunctn
0,43,172.7,96.2,32.25451,24.4,0.81,2.6,4.1,15.42,65.0,71.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,1.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.0,0.0,,0.0,0.0,0.0,2.0,17.0,36.5,89.0,5.4,,,62.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.0,1.0,0,1,0.0,0.0,1.0,0.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,78,162.5,69.9,26.47101,38.2,1.01,,6.7,,57.0,41.0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0.0,0.0,0.0,0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.0,0.0,,,,,,19.0,31.7,78.0,1.0,,,65.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [55]:
# Row/column count of X_train; the dev and test frames below are expected to
# end up with the same number of columns once `predstro` is removed from them.
X_train.shape

(34192, 232)

- `dev`

In [56]:
# Shape of X_dev while the `predstro` column is still part of the frame.
X_dev.shape

(4274, 233)

In [57]:
# Hold on to the `predstro` column separately before it is removed from X_dev.
predstro_dev = X_dev.loc[:, 'predstro']

In [58]:
# Sanity check: the extracted vector should have one entry per row of X_dev.
predstro_dev.shape

(4274,)

In [59]:
# Take `predstro` out of the dev frame (its values are held separately in
# predstro_dev). Not re-runnable as-is: a second run raises KeyError because the
# column is already gone.
X_dev = X_dev.drop(columns='predstro')

In [60]:
# Preview the first two rows of X_dev now that `predstro` has been dropped.
X_dev.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_STRENUOUS_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_ANGINA_AT_REST,classnyh_SLIGHT_LIMITATION,classnyh_MARKED_LIMITATION,classnyh_ANY_ACTIVITY,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,ceroxused,circarr,cofirstind,concalc,cperfutil,IABP,ibldprod,imedeaca,imedtran,inoptee,mtopd,ocarasd,ocarvsd,opocard,oponcard,unplao,unplav,unplmv,unplvad,valexp2,vsmvpr,asmtascaa,cotafib,cotarrst,ecmo,mt30stat,opcab,opvalve,vadproc,valexppos2,vsavpr,vsmv,aortoccl_AoXC,aortoccl_Ballo
on,asmtaodx_Normal,asmtaodx_Thickening,asmtaodx_atheroma<5mm,asmtaodx_atheroma>5mm,asmtaodx_mobile_plaques,cathbasassistind_BP_instability,cathbasassistind_CPB_wean,cathbasassistind_PCI_failure,cathbasassistwhen_Intra_OP,cathbasassistwhen_Post_OP,cathbasassistwhen_Pre_OP,cpbutil_Combination,cpbutil_Full_CPB,cperftyp_Antegrade,cperftyp_Both,cperftyp_Retrograde,ecmoind_Cadiac_Failure,ecmoind_Rescue_Salvage,ecmoind_Resp_Failure,ecmowhen_Intra_OP,ecmowhen_Post_OP,ecmowhen_Pre_OP,emergrsn_Anatomy,emergrsn_Angio_Accident,emergrsn_Ao_Dissect,emergrsn_Evolvg_MI,emergrsn_Hybrid,emergrsn_Infect_Device,emergrsn_Ischemia,emergrsn_Pulm_Edema,emergrsn_Shock_Circ_Supp,emergrsn_Shock_No_Circ_Supp,emergrsn_Syncope,emergrsn_Trauma,emergrsn_Valve_Dysf,iabpind_Angina,iabpind_CPB_Wean_Failure,iabpind_Hemodyn_Instab,iabpind_Procedure_Support,iabpind_Prophylactic,iabpwhen_Intra_OP,iabpwhen_Post_OP,iabpwhen_Pre_OP,mtcause_Cardiac,mtcause_Infection,mtcause_Neuro,mtcause_Pulmonary,mtcause_Renal,mtcause_Vascular,readmrsn_Stroke,readmrsn_TIA,unplproc_Yes_Complication,unplproc_Yes_Disease,urgntrsn_AMI,urgntrsn_Anatomy,urgntrsn_Angio_Accid,urgntrsn_Ao_Dissect,urgntrsn_CHF,urgntrsn_CP,urgntrsn_Hybrid,urgntrsn_IABP,urgntrsn_Infect_Dev,urgntrsn_PCI_Fail,urgntrsn_Rest_Angina,urgntrsn_Syncope,urgntrsn_Trauma,urgntrsn_USA,urgntrsn_Valve_Dysfunctn
0,51,170.2,92.9,32.06983,44.2,1.05,3.6,8.1,6.87,65.0,,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,,0.0,0.0,,,,,,26.0,34.0,58.0,1.2,,,53.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,66,175.0,105.0,34.28571,41.0,0.9,3.9,5.7,7.5,50.0,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,,0.0,0.0,,,,,,29.0,35.5,84.0,0.9,71.0,73.0,68.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [61]:
# Shape after the drop: same rows, one column fewer than before.
X_dev.shape

(4274, 232)

- `test`

In [62]:
# Shape of X_test while the `predstro` column is still part of the frame.
X_test.shape

(4274, 233)

In [63]:
# Hold on to the `predstro` column separately before it is removed from X_test.
predstro_test = X_test.loc[:, 'predstro']

In [64]:
# Sanity check: the extracted vector should have one entry per row of X_test.
predstro_test.shape

(4274,)

In [65]:
# Take `predstro` out of the test frame (its values are held separately in
# predstro_test). Not re-runnable as-is: a second run raises KeyError because the
# column is already gone.
X_test = X_test.drop(columns='predstro')

In [66]:
# Preview the first two rows of X_test now that `predstro` has been dropped.
X_test.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_STRENUOUS_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_ANGINA_AT_REST,classnyh_SLIGHT_LIMITATION,classnyh_MARKED_LIMITATION,classnyh_ANY_ACTIVITY,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,ceroxused,circarr,cofirstind,concalc,cperfutil,IABP,ibldprod,imedeaca,imedtran,inoptee,mtopd,ocarasd,ocarvsd,opocard,oponcard,unplao,unplav,unplmv,unplvad,valexp2,vsmvpr,asmtascaa,cotafib,cotarrst,ecmo,mt30stat,opcab,opvalve,vadproc,valexppos2,vsavpr,vsmv,aortoccl_AoXC,aortoccl_Ballo
on,asmtaodx_Normal,asmtaodx_Thickening,asmtaodx_atheroma<5mm,asmtaodx_atheroma>5mm,asmtaodx_mobile_plaques,cathbasassistind_BP_instability,cathbasassistind_CPB_wean,cathbasassistind_PCI_failure,cathbasassistwhen_Intra_OP,cathbasassistwhen_Post_OP,cathbasassistwhen_Pre_OP,cpbutil_Combination,cpbutil_Full_CPB,cperftyp_Antegrade,cperftyp_Both,cperftyp_Retrograde,ecmoind_Cadiac_Failure,ecmoind_Rescue_Salvage,ecmoind_Resp_Failure,ecmowhen_Intra_OP,ecmowhen_Post_OP,ecmowhen_Pre_OP,emergrsn_Anatomy,emergrsn_Angio_Accident,emergrsn_Ao_Dissect,emergrsn_Evolvg_MI,emergrsn_Hybrid,emergrsn_Infect_Device,emergrsn_Ischemia,emergrsn_Pulm_Edema,emergrsn_Shock_Circ_Supp,emergrsn_Shock_No_Circ_Supp,emergrsn_Syncope,emergrsn_Trauma,emergrsn_Valve_Dysf,iabpind_Angina,iabpind_CPB_Wean_Failure,iabpind_Hemodyn_Instab,iabpind_Procedure_Support,iabpind_Prophylactic,iabpwhen_Intra_OP,iabpwhen_Post_OP,iabpwhen_Pre_OP,mtcause_Cardiac,mtcause_Infection,mtcause_Neuro,mtcause_Pulmonary,mtcause_Renal,mtcause_Vascular,readmrsn_Stroke,readmrsn_TIA,unplproc_Yes_Complication,unplproc_Yes_Disease,urgntrsn_AMI,urgntrsn_Anatomy,urgntrsn_Angio_Accid,urgntrsn_Ao_Dissect,urgntrsn_CHF,urgntrsn_CP,urgntrsn_Hybrid,urgntrsn_IABP,urgntrsn_Infect_Dev,urgntrsn_PCI_Fail,urgntrsn_Rest_Angina,urgntrsn_Syncope,urgntrsn_Trauma,urgntrsn_USA,urgntrsn_Valve_Dysfunctn
0,81,170.0,97.7,33.80623,37.0,0.9,4.6,6.2,6.4,60.0,38.0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.0,0.0,,,,,,,,,1.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,65,180.3,77.1,23.71717,44.2,1.02,,5.2,,60.0,29.3,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0.0,0.0,,,,,,31.0,32.5,68.0,1.1,,,40.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [67]:
# Shape after the drop: same rows, one column fewer than before.
X_test.shape

(4274, 232)

### Saving Unprocessed `X_train`, `y_train`, `X_dev`, `y_dev`, `X_test`, `y_test` and `predstro` vectors
- can be used in notebooks that incorporate `sklearn` `Pipelines` to pre-process data more efficiently than this notebook

- `train`

In [68]:
#X_train.to_pickle('X_train_PREPOST_UNPROC_11_9.pkl')

In [69]:
#y_train.to_pickle('y_train_PREPOST_UNPROC_11_9.pkl')

In [70]:
#predstro_train.to_pickle('predstro_train_11_9.pkl')

- `dev`

In [71]:
#X_dev.to_pickle('X_dev_PREPOST_UNPROC_11_9.pkl')

In [72]:
#y_dev.to_pickle('y_dev_PREPOST_UNPROC_11_9.pkl')

In [73]:
#predstro_dev.to_pickle('predstro_dev_11_9.pkl')

- `test`

In [74]:
#X_test.to_pickle('X_test_PREPOST_UNPROC_11_9.pkl')

In [75]:
#y_test.to_pickle('y_test_PREPOST_UNPROC_11_9.pkl')

In [76]:
#predstro_test.to_pickle('predstro_test_11_9.pkl')