## Capstone Project

### Pre-Operating Features Cleaning and Encoding

#### Importing Libraries

In [1]:
%matplotlib inline

# general libraries
import re
import string
import sys
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing date libraries
import datetime as dt
import dateutil.parser as dparser

# scikit-learn libraries for preprocessing
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# scikit-learn libraries for constructing pipelines
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# scikit-learn libraries for clustering and dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.mixture import GaussianMixture

# scikit-learn libraries for evaluation
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# saving models
import pickle
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone `joblib` package and fall back to the vendored
# copy only on older scikit-learn installs (this notebook was run on 0.20.2).
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# setting pandas display options
# Fully-qualified option names are used throughout. The bare 'precision' key
# only works through pandas' option pattern matching, which becomes ambiguous
# (OptionError) on pandas versions that also define 'styler.format.precision'.
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.precision', 5)
# NOTE: this disables SettingWithCopyWarning for the whole session, so chained
# assignments that silently write to a copy will no longer be reported.
pd.options.mode.chained_assignment = None

#### Directory/File Structure

In [2]:
# Display the Python interpreter version/build string for reproducibility.
sys.version

'3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) \n[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]'

In [3]:
# Report the versions of the core libraries, one line per library.
version_lines = (
    ('Running pandas version:', pd.__version__),
    ('Running numpy version:', np.__version__),
    ('Running sklearn version:', sklearn.__version__),
)
for version_label, version_string in version_lines:
    print(version_label, version_string)

Running pandas version: 0.23.4
Running numpy version: 1.14.2
Running sklearn version: 0.20.2


In [4]:
# Display the kernel's current working directory (before any directory change).
os.getcwd()

'/Users/nate_velarde/Documents/UC_Berkeley/Courses/W210_Capstone/stroke_project/sandbox/notebooks'

In [5]:
# Switch the working directory to the sibling `data` folder; the file names
# used by later cells are given without a directory and resolve relative to it.
# NOTE(review): this mutates process-wide state; a DATA_DIR constant joined
# onto each file name would avoid depending on the working directory.
os.chdir('../data')

In [6]:
# Display the working directory again to confirm the directory change.
os.getcwd()

'/Users/nate_velarde/Documents/UC_Berkeley/Courses/W210_Capstone/stroke_project/sandbox/data'

In [7]:
# List the entries of the current working directory in sorted order.
sorted(os.listdir())

['.DS_Store',
 '273_vs_281_null_count_by_feature.csv',
 '273_vs_281_null_count_by_feature.xlsm',
 'Capstone - Complication list - complete.xlsx',
 'Capstone - STS risk factor list.xlsx',
 'Capstone_Fall_Shannon_Sept2019_request.csv',
 'capstone_STS_risk_factor_features.xlsx',
 'capstone_cleaned_data.csv',
 'capstone_data-version-2.xlsx',
 'capstone_data.xlsx',
 'capstone_data_binarized_outcome.pkl',
 'capstone_data_binarized_outcome.xlsx',
 'capstone_data_binarized_outcome_compressed.pkl',
 'capstone_data_filled_in_complication_data.xlsx',
 'capstone_data_key_variable_nulls_cleaned.pkl',
 'capstone_data_key_variable_nulls_cleaned.xlsx',
 'capstone_data_key_variable_nulls_cleaned_REF.pkl',
 'capstone_data_key_variable_nulls_cleaned_compressed.pkl',
 'pre_op_features.pkl',
 'pre_op_features_A.pkl',
 'pre_op_features_B.pkl',
 'pre_op_features_tree.pkl',
 'pre_op_features_tree_A.pkl',
 'pre_op_features_tree_B.pkl']

#### Loading Dataset
- takes about 90 seconds when using `pd.read_excel`

In [8]:
# raw_data = pd.read_excel('capstone_data_binarized_outcome.xlsx')

#### Pickling the file for faster access - saving `raw_data` as a `.pkl` File
- `pd.read_pickle("Filename.pkl")`

In [9]:
# raw_data.to_pickle('capstone_data_binarized_outcome.pkl')

#### Reading from `.pkl` file for speed

In [10]:
# Load the dataset from the pickled copy instead of re-parsing the Excel file.
# NOTE: unpickling can execute arbitrary code -- only load .pkl files that were
# produced by a trusted source (here, presumably the `to_pickle` cell above).
raw_data = pd.read_pickle('capstone_data_binarized_outcome.pkl')

In [11]:
# Preview the first rows of the loaded frame (head() defaults to 5 rows).
raw_data.head()

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStIn
n,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0
1,2,65,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-02,2011-07-09,175.3,79.4,2.0,,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,45.0,1.2,,,3.0,1.0,,2.0,,,,,1.0,2.0,2.0,,,,,,,,,,,,2.0,,,5.0,2.0,,2.0,2.0,2.0,,,,,,,,3.0,2.0,,2.0,1.0,2,,1.0,2.0,2,2.0,,,,,,,,,,1.0,2.0,2.0,2.0,4.0,,,,1.0,55.0,,44.0,32.0,1.0,40.0,1.0,2.0,,,,,,,,,,,3.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,10.0,,,,,,,10.0,,,,,,,1.0,3.0,,1.0,2.0,,1,2,1.0,2.0,2.0,2011-07-02,2011-07-03,,,,,32.0,,29.0,,3,,,2.0,1.0,2.0,,2.0,70.0,2.0,,,,,,2.0,40.0,2.0,1.0,2.0,2.0,2.0,,,,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,4.0,1.0,,,,,,,,,,,,,,,,,,,1.0,1.0,3.0,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,1.0,2.0,,,,,1.1,,,2.0,,2.0,2.0,2.0,1.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2.0,,NaT,,,,,,0.017,0.069,0
2,3,83,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-04,2011-07-12,162.60001,102.1,2.0,,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,,1.0,2.0,1.0,1.0,2.0,1.0,1.0,,,2.0,,29.0,1.2,3.3,6.2,3.0,1.0,8.6,2.0,,,,,1.0,2.0,2.0,,,,,,,,,,,,2.0,,,1.0,2.0,,2.0,2.0,2.0,,,,,,,,3.0,2.0,,1.0,1.0,2,,2.0,,1,2.0,,,,,,,,,,2.0,1.0,1.0,2.0,4.0,,,,1.0,60.0,,31.0,50.0,1.0,36.0,1.0,1.0,,1.5,16.0,,,,,,,,3.0,2.0,,,,,,,,,,,3.0,,,,,,,,,,,,,,,,,,,5.0,,,,,,,,,,,,,,,1.0,2.0,,1.0,2.0,,1,2,1.0,2.0,2.0,2011-07-04,2011-07-04,,,,,,,,,1,,,,,,,,,2.0,,,,,,5.0,,1.0,,2.0,2.0,2.0,,,,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,1.0,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,2.0,1.0,,2.0,2.0,,,,,1.4,,,2.0,,2.0,2.0,2.0,1.0,2,2,2,2,2,2,2,2,1,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2.0,,NaT,,,,,,0.045,0.148,0
3,4,59,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-05,2011-07-09,160.0,127.5,1.0,4.0,2.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,1.0,,2.0,2.0,2.0,,,,,,,,,35.0,0.9,3.5,7.4,3.0,1.0,6.4,2.0,,,,,2.0,,,,,,,,,,,,,,,,,1.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,1.0,2.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,2.0,,,,1.0,60.0,,33.0,51.0,1.0,35.0,2.0,,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-05,2011-07-05,,,,,34.8,,19.0,,3,,,2.0,1.0,2.0,,2.0,73.0,2.0,,,,,,2.0,47.0,3.0,1.0,2.0,2.0,2.0,,,,,1.0,2.0,0.0,0.0,0.0,,,2.0,2.0,1.0,1.0,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,0.0,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,2.0,1.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.013,0.074,0
4,5,72,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-06,2011-07-10,160.0,64.0,2.0,,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,37.0,0.9,3.8,5.7,3.0,1.0,6.4,2.0,,,,,2.0,,,,,,,,,,,,,,,,,5.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,1.0,1.0,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,60.0,,21.0,40.0,1.0,40.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,1.0,2.0,,1.0,2.0,,1,2,1.0,2.0,2.0,2011-07-06,2011-07-06,,,,,34.6,,19.0,,3,,,2.0,1.0,2.0,,2.0,70.0,2.0,,,,,,2.0,40.0,2.0,1.0,2.0,2.0,2.0,,,,,1.0,1.0,0.0,0.0,0.0,,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,1.0,,1.0,,,,,,,,,,,0.0,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,0.8,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.016,0.019,0


### Categorical Variable Levels

In [12]:
# Print the distinct values (in order of first appearance, NaN included) of
# each demographic column, one line per column.
demographic_columns = [
    'gender',
    'racecaucasian',
    'raceblack',
    'raceasian',
    'racenativeam',
    'racnativepacific',
    'raceother',
    'ethnicity',
]
for column_name in demographic_columns:
    print(raw_data[column_name].unique())

[ 1.  2. nan]
[ 1.  2. nan]
[ 2.  1. nan]
[ 2.  1. nan]
[ 2.  1. nan]
[ 2. nan  1.]
[ 2. nan  1.]
[ 2.  1. nan  3.]


#### Testing Replacement Code

- `df['col_name'].replace({replacement_dict})`
- replacement_dict `{old_value_1: new_value_1, old_value_2: new_value_2, np.nan: new_value_3}`

In [13]:
# Work on a copy so the replacement experiment leaves `raw_data` unmodified.
test_df = raw_data.copy()

In [14]:
# Recode `ethnicity`: 1 stays 1, while 2 and missing values both become 0.
# NOTE(review): any other level (e.g. 3) is not in the mapping and passes
# through unchanged -- confirm whether it should be recoded as well.
ethnicity_recode = {1: 1, 2: 0, np.nan: 0}
recoded_ethnicity = test_df['ethnicity'].replace(ethnicity_recode)
test_df['ethnicity'] = recoded_ethnicity

In [15]:
# Verify the recode: print the `ethnicity` levels of the original frame first,
# then those of the recoded copy.
for frame in (raw_data, test_df):
    print(frame['ethnicity'].unique())

[ 2.  1. nan  3.]
[0. 1. 3.]


In [16]:
# Verify that recoding kept the column dtype: original frame first, then the
# recoded copy.
for frame in (raw_data, test_df):
    print(frame['ethnicity'].dtype)

float64
float64


### More Categorical Levels

In [17]:
# Show only the first row -- presumably as a quick reference for the column
# names and their order.
raw_data.head(1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStIn
n,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [18]:
# Print the distinct levels of each column in ascending order, NaN last.
# Built-in `sorted()` is not used: every comparison against NaN is False, so
# it returns an arbitrary, input-dependent order whenever NaN is present.
for column_name in ['diabetes', 'diabctrl', 'dyslip', 'dialysis', 'hypertn']:
    levels = raw_data[column_name].drop_duplicates().sort_values(na_position='last')
    print(levels.tolist())

[1.0, 2.0, nan, 3.0]
[1.0, nan, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]
[1.0, 2.0, nan, 3.0]
[1.0, 2.0, nan, 3.0]
[1, 2, 3]


- `hypertn` had no `NaN`s

### `infendo` and `infendty`

In [19]:
# Print the distinct levels of each column in ascending order, NaN last.
# Built-in `sorted()` is not used: every comparison against NaN is False, so
# it returns an arbitrary, input-dependent order whenever NaN is present.
for column_name in ['infendo', 'infendty']:
    levels = raw_data[column_name].drop_duplicates().sort_values(na_position='last')
    print(levels.tolist())

[1.0, 2.0, nan]
[nan, 1.0, 2.0]


In [20]:
# Show only the first row -- presumably as a quick reference for the column
# names and their order.
raw_data.head(1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStIn
n,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [21]:
# Print the distinct levels of each column in ascending order, NaN last.
# Built-in `sorted()` is not used: every comparison against NaN is False, so
# it returns an arbitrary, input-dependent order whenever NaN is present.
for column_name in ['TobaccoUse', 'chrlungd', 'hmo2', 'slpapn', 'ivdrugab', 'alcohol']:
    levels = raw_data[column_name].drop_duplicates().sort_values(na_position='last')
    print(levels.tolist())

[nan, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
[1.0, 2.0, 3.0, 4.0, nan, 5.0, 6.0]
[1.0, 2.0, 3.0, 4.0, nan, 5.0]
[1.0, 2.0, 3.0, nan]
[1.0, 2.0, 3.0, 4.0, 5.0, nan]
[1.0, 2.0, 3.0, nan, 4.0, 5.0]


- **TODO:** For variables with many (>3) levels that will be converted to dummies, should the `replacement_dict` map the numeric codes to text labels so that the resulting dummy column names are human-readable?

In [22]:
# Show only the first row -- presumably as a quick reference for the column
# names and their order.
raw_data.head(1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStIn
n,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [23]:
# Print the distinct levels of each column in ascending order, NaN last.
# Built-in `sorted()` is not used: every comparison against NaN is False, so
# it returns an arbitrary, input-dependent order whenever NaN is present.
comorbidity_columns = [
    'liverdis',
    'immsupp',
    'mediastrad',
    'cancer',
    'pvd',
    'ThAoDisease',
    'syncope',
    'unrespstat',
    'cvd',
]
for column_name in comorbidity_columns:
    levels = raw_data[column_name].drop_duplicates().sort_values(na_position='last')
    print(levels.tolist())

[1.0, 2.0, nan, 3.0]
[1, 2, 3]
[1.0, 2.0, 3.0, nan]
[1.0, 2.0, 3.0, nan]
[1.0, 2.0, nan, 3.0]
[nan, 1.0, 2.0, 3.0]
[1.0, 2.0, 3.0, nan]
[1.0, 2.0, 3.0, nan]
[1.0, 2.0, 3.0, nan]


- `immsupp` has no `NaN`s

In [24]:
# Print the distinct levels of each column in ascending order, NaN last.
# Built-in `sorted()` is not used: every comparison against NaN is False, so
# it returns an arbitrary, input-dependent order whenever NaN is present.
for column_name in ['cva', 'cvawhen']:
    levels = raw_data[column_name].drop_duplicates().sort_values(na_position='last')
    print(levels.tolist())

[nan, 1.0, 2.0, 3.0]
[nan, 1.0, 2.0, 3.0, 4.0]


In [25]:
# Show only the first row -- presumably as a quick reference for the column
# names and their order.
raw_data.head(1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStIn
n,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [26]:
# Print the distinct values present in each of these `raw_data` columns.
# NOTE: NaN never compares as smaller or larger than another value, so
# sorted() cannot place it reliably; its position in a printed list is arbitrary.
for column_name in ('cvdtia', 'cvdcarsten', 'cvdstenrt', 'cvdstenlft', 'cvdpcarsurg'):
    distinct_values = raw_data[column_name].unique()
    print(sorted(distinct_values))

[nan, 1.0, 2.0, 3.0]
[nan, 1.0, 2.0, 3.0, 4.0]
[nan, 1.0, 2.0, 3.0, 4.0]
[nan, 1.0, 2.0, 3.0, 4.0]
[nan, 1.0, 2.0]


In [27]:
# Display only the first row of `raw_data` (the rendered output also lists every column name).
raw_data.head(n=1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStIn
n,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [28]:
# Print the distinct values present in each of these `raw_data` columns.
# NOTE: NaN never compares as smaller or larger than another value, so
# sorted() cannot place it reliably; its position in a printed list is arbitrary.
for column_name in ('hitanti', 'cigsmoker', 'cigsmokercurr', 'prcvint', 'prcab', 'prvalve'):
    distinct_values = raw_data[column_name].unique()
    print(sorted(distinct_values))

[1.0, 2.0, 3.0, nan]
[1.0, 2.0, nan]
[1.0, nan, 2.0]
[1.0, 2.0, nan, 3.0]
[nan, 1.0, 2.0]
[nan, 1.0, 2.0]


- Testing whether a single `replacement_dict` with keys `1, 2, 3, np.nan` can also be applied to columns that only contain `1, 2, np.nan`

In [29]:
# Recode `cigsmoker` (which only holds 1=YES, 2=NO and NaN) into a 0/1 flag,
# using a mapping that also carries a key for code 3, which this column never contains.
test_df['cigsmoker'] = test_df['cigsmoker'].replace(
    {
        1: 1,       # YES stays 1
        2: 0,       # NO becomes 0
        3: 0,       # absent from this column; presumably the "unknown" code of other columns -- TODO confirm
        np.nan: 0,  # missing values become 0
    }
)

In [30]:
# Sanity check on the recode: print the distinct `cigsmoker` values of the
# raw frame first, then those of the recoded `test_df`.
for frame in (raw_data, test_df):
    print(frame['cigsmoker'].unique())

[ 1.  2. nan]
[1. 0.]


In [31]:
# Verify the recode did not change the column's dtype: print the `cigsmoker`
# dtype of the raw frame first, then that of the recoded `test_df`.
for frame in (raw_data, test_df):
    print(frame['cigsmoker'].dtype)

float64
float64


#### Good - the same `replacement_dict` can be used for all `yes, no` and `yes, no, unknown` categoricals

In [32]:
# Display only the first row of `raw_data` (the rendered output also lists every column name).
raw_data.head(n=1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStIn
n,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [33]:
# Print the distinct values present in each of these `raw_data` columns.
# NOTE: NaN never compares as smaller or larger than another value, so
# sorted() cannot place it reliably; its position in a printed list is arbitrary.
for column_name in (
    'CardSympTimeOfAdm',
    'CardSympTimeOfSurg',
    'anginalclass',
    'chf',
    'classnyh',
    'priorhf',
    'carshock',
    'resusc',
):
    distinct_values = raw_data[column_name].unique()
    print(sorted(distinct_values))

[nan, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]
[nan, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]
[1.0, 2.0, 3.0, 4.0, 5.0, nan]
[1.0, 2.0, nan, 3.0]
[nan, 1.0, 2.0, 3.0, 4.0]
[1.0, 2.0, nan, 3.0]
[1.0, 2.0, 3.0, 4.0, nan]
[1.0, 2.0, 3.0, 4.0, nan]


In [34]:
# Display only the first row of `raw_data` (the rendered output also lists every column name).
raw_data.head(n=1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStIn
n,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [35]:
# Print the distinct values present in each of these `raw_data` columns.
# NOTE: NaN never compares as smaller or larger than another value, so
# sorted() cannot place it reliably; its position in a printed list is arbitrary.
for column_name in (
    'Arrhythmia',
    'ArrhythAFlutter',
    'ArrhythAFib',
    'ArrhythAFibDur',
    'arrhythwhen',
    'arrhyafib',
    'arrhyafibty',
):
    distinct_values = raw_data[column_name].unique()
    print(sorted(distinct_values))

[nan, 1.0, 2.0, 3.0]
[nan, 1.0, 2.0, 3.0]
[nan, 1.0, 2.0, 3.0]
[nan, 1.0, 2.0, 3.0]
[1.0, 2.0, 3.0, nan]
[nan, 1.0, 2.0]
[nan, 1.0, 2.0]


In [36]:
# Display only the first row of `raw_data` (the rendered output also lists every column name).
raw_data.head(n=1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStIn
n,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [37]:
# Print the distinct values present in each of these `raw_data` columns.
# NOTE: NaN never compares as smaller or larger than another value, so
# sorted() cannot place it reliably; its position in a printed list is arbitrary.
for column_name in ('medasa', 'medaplt5days', 'medinotr', 'medlipid', 'numdisv', 'hdefd'):
    distinct_values = raw_data[column_name].unique()
    print(sorted(distinct_values))

[1.0, 2.0, 3.0, 4.0, nan]
[1, 2, 3, 4]
[1.0, 2.0, nan]
[1.0, 2.0, nan, 3.0, 4.0]
[1.0, 2.0, 3.0, 4.0, nan]
[1.0, 2.0, nan]


In [38]:
# Display only the first row of `raw_data` (the rendered output also lists every column name).
raw_data.head(n=1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStIn
n,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [39]:
# Print the distinct values present in each of these `raw_data` columns.
# NOTE: NaN never compares as smaller or larger than another value, so
# sorted() cannot place it reliably; its position in a printed list is arbitrary.
for column_name in (
    'vdaort',
    'vdstena',
    'vdinsufm',
    'vdstenm',
    'vdinsuft',
    'incidenc',
    'status',
):
    distinct_values = raw_data[column_name].unique()
    print(sorted(distinct_values))

[1.0, 2.0, nan]
[1.0, 2.0, nan]
[0.0, 1.0, 2.0, 3.0, 4.0, nan, 5.0]
[2.0, nan, 1.0]
[2.0, 3.0, nan, 0.0, 1.0, 4.0, 5.0]
[1.0, 2.0, 3.0, 4.0, 5.0, nan]
[1.0, 2.0, 3.0, nan, 4.0]


### STS Model Evaluation - `predstro` and outcome vector `strokeBin`
- If `predstro > 0.5`, the model predicts `stroke`; otherwise it predicts `no stroke`

In [40]:
# Display only the first row of `raw_data` (the rendered output also lists every column name).
raw_data.head(n=1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStIn
n,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [41]:
# Frequency of each distinct `predstro` value, most common first
# (`value_counts` leaves NaN out by default, so nulls are checked separately).
raw_data['predstro'].value_counts()

0.007    2936
0.006    2851
0.005    2691
0.008    2594
0.009    2520
0.004    2298
0.010    2234
0.011    1961
0.012    1807
0.013    1633
0.014    1480
0.003    1434
0.015    1393
0.016    1252
0.017    1136
0.018    1058
0.019     968
0.020     879
0.021     851
0.022     705
0.023     657
0.024     651
0.002     535
0.025     516
0.026     493
0.027     457
0.028     439
0.029     340
0.030     333
0.031     306
0.032     269
0.033     237
0.034     226
0.036     215
0.035     195
0.038     176
0.037     174
0.039     144
0.040     139
0.041     113
0.043     107
0.044     102
0.042      99
0.045      74
0.046      74
0.051      67
0.047      63
0.048      62
0.049      51
0.050      49
0.052      45
0.053      44
0.001      42
0.055      38
0.054      34
0.062      28
0.059      27
0.058      26
0.063      25
0.060      24
0.056      24
0.057      23
0.061      21
0.067      20
0.065      19
0.064      17
0.069      16
0.068      15
0.070      14
0.066      14
0.075      11
0.088 

In [42]:
# Number of records with no `predstro` value
raw_data['predstro'].isna().sum()

3

- Which rows contain the `predstro` nulls? Compare them with the rows where the key variables are null (see below).

In [43]:
# Show the full records whose `predstro` is missing
raw_data.loc[raw_data['predstro'].isna()]

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStIn
n,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
29564,29565,71,,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2015-12-28,2016-01-16,185.0,82.0,2.0,,2.0,2.0,2,2.0,,5.0,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,14.1,40.9,1.02,4.0,4.8,3.0,1.1,7.66,,,,,,1.0,2.0,2.0,,,,,1.0,20.0,1.0,,,,,,7.0,7.0,1.0,2.0,,2.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,,,,,,2.0,1.0,2,,2.0,,2,2.0,,,2.0,,,,2.0,,,,2.0,2.0,,1.0,,2.0,,1.0,45.0,1.0,34.0,52.0,1.0,46.0,2.0,,,,,,,,,,,,4.0,2.0,,,1.0,,,,1.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,2,1,1.0,2.0,2.0,2015-12-28,2015-12-28,,,,,34.0,3.0,31.0,199.0,3,,,2.0,1.0,2.0,2.0,2.0,337.0,2.0,,,,,,2.0,312.0,4.0,1.0,2.0,2.0,1.0,,2.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,1.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2.0,,,,,,,,,,,,,,,,,3.0,2.0,2.0,1.0,261.0,2.0,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,1.0,2.0,,2.0,140.0,1.2,,,2.0,,1.0,1.0,2.0,1.0,1,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1.0,1.0,2016-01-21,999.0,,,,,,,0
29703,29704,68,,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2016-02-19,2016-02-23,177.0,94.0,2.0,,2.0,2.0,2,2.0,,5.0,3.0,1.0,2.0,2.0,2.0,2.0,4.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,,2.0,3.0,,3.0,2.0,14.2,41.5,0.91,3.7,5.5,3.0,1.0,6.4,,,,,,2.0,,,,,,,,,,,,,,,7.0,7.0,1.0,2.0,,2.0,2.0,2.0,2.0,,,,,,,,,,2.0,2.0,2,,2.0,,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,2.0,2.0,1.0,,2.0,,1.0,60.0,1.0,32.0,45.0,2.0,,1.0,1.0,1.0,0.4,61.0,5.0,2.0,,,,,,1.0,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,2,1,1.0,2.0,2.0,2016-02-19,2016-02-19,,,,,32.0,3.0,29.0,228.0,3,,,2.0,1.0,2.0,2.0,2.0,134.0,2.0,,,,,,2.0,109.0,4.0,1.0,2.0,2.0,1.0,,2.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,1.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,,1.0,2.0,,,,,,,,,2.0,,,,,2.0,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,132.0,1.0,,,2.0,,2.0,2.0,2.0,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,2.0,NaT,,,,,,,,0
29991,29992,54,,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2016-06-24,2016-06-28,182.8,75.1,2.0,,1.0,2.0,1,2.0,,2.0,1.0,,2.0,2.0,2.0,3.0,1.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,,2.0,2.0,3.0,,2.0,16.1,45.2,1.06,3.6,5.7,3.0,1.0,6.96,,,,,,1.0,2.0,2.0,,,,,2.0,,,,,,,,5.0,5.0,4.0,2.0,,2.0,2.0,2.0,2.0,,,,,,,,,,2.0,1.0,1,2.0,2.0,,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,1.0,2.0,4.0,,2.0,,1.0,55.0,1.0,32.0,43.0,1.0,27.8,2.0,,,,,,,,,,,,2.0,2.0,,,1.0,,,,1.0,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2.0,2.0,1.0,2.0,,3,2,1.0,2.0,2.0,2016-06-24,2016-06-24,,,,,35.0,3.0,29.0,151.0,3,,,2.0,1.0,2.0,2.0,2.0,113.0,2.0,,,,,,2.0,91.0,2.0,1.0,2.0,2.0,3.0,,,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,2.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,4.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,148.0,1.1,,,2.0,,2.0,2.0,2.0,1.0,1,1,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,2.0,NaT,,,,,,,,0


- `predstro` `NaN`s correspond to the rows where `gender` is `NaN`

- We will see whether the `predstro` `NaN`s are eliminated in the process of cleaning the key variables of `NaN`s (see below)
- checking `strokeBin`

In [44]:
# Class balance of the `strokeBin` column (count of records per level)
raw_data['strokeBin'].value_counts()

0    42129
1      617
Name: strokeBin, dtype: int64

In [45]:
# Number of records with no `strokeBin` value
raw_data['strokeBin'].isna().sum()

0

### Cleaning and Recoding Functions Pseudocode

#### Recoding `2-3` Level Categoricals `(YES=1, NO=0, np.nan = 0)`

- create a list of columns you want to recode
- use to mask main dataframe - arguments to the function will be column list (`col_list`) and main dataset `dataframe`
- iterate through the list of columns and apply replacement dictionary
- for `column` in `col_list`
- `masked_dataframe[column] = masked_dataframe[column].replace({1: 1, 2: 0, np.nan: 0})`
- `return masked_dataframe`

#### Recoding Multi-Level Categoricals ( > 3) and Binarizing
- more complicated, may need to do one at a time
- through the function, need to pass the column name, replacement dictionary(ies) and main dataframe
- have to deal with the `NaN`s -- or incorporate through the numeric codes to text step - directly
- for human readable column headings, need to convert numeric codes to text headings
- then apply `pd.get_dummies()` or use the functions we came up with last week
- how are you going to avoid the dummy variable trap `k-1`
- can use the `drop_first=True` parameter in `pd.get_dummies()` to get `k-1` dummies out of `k` categorical levels by removing the first level
- can you specify which column is the reference category??
- I think the best way is to run `pd.get_dummies()` without the `drop_first` parameter and then before you return the `dataframe` you specify the reference column to `drop`
- `k_minus_1_dummies_df = full_dummies_df.drop(drop_col, axis=1)` where `drop_col` is a `string` passed as a parameter in your function

### `categorical_to_numeric` Function for reference

### Remember - need two datasets - `cleaned only` and `cleaned AND binarized`
- can keep the categorical levels - just clean it up for `decision trees` as it is better for them that features are not binarized

### Combined Categorical Variable List

#### Raw List

#### Edited

#### Creating `cat_features` List

In [46]:
# Column names of every feature treated as categorical (one name per line,
# in the order they will appear in `pre_op_features`).
cat_features = [
    'gender',
    'racecaucasian',
    'raceblack',
    'raceasian',
    'racenativeam',
    'racnativepacific',
    'ethnicity',
    'diabetes',
    'diabctrl',
    'dyslip',
    'dialysis',
    'hypertn',
    'infendo',
    'infendty',
    'TobaccoUse',
    'chrlungd',
    'hmo2',
    'slpapn',
    'ivdrugab',
    'alcohol',
    'liverdis',
    'immsupp',
    'mediastrad',
    'cancer',
    'pvd',
    'ThAoDisease',
    'syncope',
    'unrespstat',
    'cvd',
    'cva',
    'cvawhen',
    'cvdtia',
    'cvdcarsten',
    'cvdstenrt',
    'cvdstenlft',
    'cvdpcarsurg',
    'hitanti',
    'cigsmoker',
    'cigsmokercurr',
    'prcvint',
    'prcab',
    'prvalve',
    'CardSympTimeOfAdm',
    'CardSympTimeOfSurg',
    'anginalclass',
    'chf',
    'classnyh',
    'priorhf',
    'carshock',
    'resusc',
    'Arrhythmia',
    'ArrhythAFlutter',
    'ArrhythAFib',
    'ArrhythAFibDur',
    'arrhythwhen',
    'arrhyafib',
    'medasa',
    'medaplt5days',
    'medinotr',
    'medlipid',
    'numdisv',
    'hdefd',
    'vdaort',
    'vdstena',
    'vdinsufm',
    'vdstenm',
    'vdinsuft',
    'incidenc',
    'status',
]

In [47]:
len(cat_features)

69

#### `yes_no_unc` Feature List

In [48]:
# Categorical columns assigned to the `yes_no_unc` recoding group.
yes_no_unc = [
    'gender',
    'racecaucasian',
    'raceblack',
    'raceasian',
    'racenativeam',
    'racnativepacific',
    'ethnicity',
    'diabetes',
    'dyslip',
    'dialysis',
    'hypertn',
    'infendo',
    'slpapn',
    'liverdis',
    'immsupp',
    'mediastrad',
    'cancer',
    'pvd',
    'ThAoDisease',
    'syncope',
    'unrespstat',
    'cvd',
    'cva',
    'cvdtia',
    'cvdpcarsurg',
    'hitanti',
    'cigsmoker',
    'cigsmokercurr',
    'prcvint',
    'prcab',
    'prvalve',
    'chf',
    'priorhf',
    'Arrhythmia',
    'arrhyafib',
    'medinotr',
    'hdefd',
    'vdaort',
    'vdstena',
    'vdstenm',
]

In [49]:
len(yes_no_unc)

40

#### `compress_to_two` Feature List

In [50]:
# Categorical columns assigned to the `compress_to_two` recoding group.
compress_to_two = [
    'diabctrl',
    'infendty',
    'TobaccoUse',
    'chrlungd',
    'hmo2',
    'ivdrugab',
    'alcohol',
    'cvawhen',
    'carshock',  # to be renamed `carshock24`
    'resusc',    # to be renamed `resusc24` (note originally spelled `reusc24` -- confirm)
    'medasa',
    'medaplt5days',
    'medlipid',
    'numdisv',
]

In [51]:
len(compress_to_two)

14

#### `recode_D` Feature List - Will Be Recoding and Creating Dummies w/Reference Class - No Parent Variable

In [52]:
# Columns to be recoded and turned into dummies with a reference class
# (no parent variable).
recode_D = [
    'CardSympTimeOfAdm',
    'CardSympTimeOfSurg',
    'anginalclass',
    'classnyh',
    'vdinsufm',
    'vdinsuft',
    'incidenc',  # to be renamed 'incidenc_REOP'
    'status',
]

In [53]:
len(recode_D)

8

#### `recode_D_P` Feature List - Will Be Recoding and Creating Dummies w/Reference Class and w/o Reference Class Due to Parent-Child Relationship

In [54]:
# Columns to be recoded into dummies that have a parent variable;
# the parent of each column is noted beside it.
recode_D_P = [
    'cvdcarsten',       # parent: `cvd`
    'cvdstenrt',        # parent: `cvd`
    'cvdstenlft',       # parent: `cvd`
    'ArrhythAFlutter',  # parent: `Arrhythmia`
    'ArrhythAFib',      # parent: `Arrhythmia`
    'ArrhythAFibDur',   # parent: `Arrhythmia`
    'arrhythwhen',      # parent: `Arrhythmia`
]
                

In [55]:
len(recode_D_P)

7

In [56]:
# The four recoding groups must partition `cat_features`: every categorical
# feature belongs to exactly one group. The total is derived from the lists
# themselves so the check cannot drift from hand-typed counts, and comparing
# the sorted names catches a missing, extra, or duplicated feature that a
# plain sum of sizes would miss.
recode_groups = yes_no_unc + compress_to_two + recode_D + recode_D_P
assert sorted(recode_groups) == sorted(cat_features), \
    "recoding groups do not partition cat_features"
len(recode_groups)

69

#### Numerical Features

In [57]:
raw_data.head(1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStIn
n,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [58]:
# Numeric columns carried into the pre-operative feature set.
numerical_features = [
    'recordId',   # kept as an identifier; will not be included in the final matrix
    'age',
    'heightcm',
    'weightkg',
    'hct',
    'creatlst',
    'totalbumin',
    'a1clvl',
    'meldscr',
    'hdef',
    'pasys',
    'predstro',   # STS Model probability of stroke
    'strokeBin',  # actual outcome, kept to compare against the STS model prediction
]

In [59]:
len(numerical_features)

13

#### `datetime` Features

In [60]:
# Date columns carried into the pre-operative feature set.
date_features = ['surgdt', 'dischdt']

In [61]:
len(date_features)

2

In [62]:
# Ties to the Excel sheet -- all features are accounted for
# once `recordId` and the added `predstro` and `strokeBin` columns are included

### Creating `pre_op_features`

In [63]:
# Full pre-operative column list: numeric columns first, then dates,
# then the categorical columns.
pre_op_features = [*numerical_features, *date_features, *cat_features]

In [64]:
pre_op_features

['recordId',
 'age',
 'heightcm',
 'weightkg',
 'hct',
 'creatlst',
 'totalbumin',
 'a1clvl',
 'meldscr',
 'hdef',
 'pasys',
 'predstro',
 'strokeBin',
 'surgdt',
 'dischdt',
 'gender',
 'racecaucasian',
 'raceblack',
 'raceasian',
 'racenativeam',
 'racnativepacific',
 'ethnicity',
 'diabetes',
 'diabctrl',
 'dyslip',
 'dialysis',
 'hypertn',
 'infendo',
 'infendty',
 'TobaccoUse',
 'chrlungd',
 'hmo2',
 'slpapn',
 'ivdrugab',
 'alcohol',
 'liverdis',
 'immsupp',
 'mediastrad',
 'cancer',
 'pvd',
 'ThAoDisease',
 'syncope',
 'unrespstat',
 'cvd',
 'cva',
 'cvawhen',
 'cvdtia',
 'cvdcarsten',
 'cvdstenrt',
 'cvdstenlft',
 'cvdpcarsurg',
 'hitanti',
 'cigsmoker',
 'cigsmokercurr',
 'prcvint',
 'prcab',
 'prvalve',
 'CardSympTimeOfAdm',
 'CardSympTimeOfSurg',
 'anginalclass',
 'chf',
 'classnyh',
 'priorhf',
 'carshock',
 'resusc',
 'Arrhythmia',
 'ArrhythAFlutter',
 'ArrhythAFib',
 'ArrhythAFibDur',
 'arrhythwhen',
 'arrhyafib',
 'medasa',
 'medaplt5days',
 'medinotr',
 'medlipid',
 'nu

In [65]:
len(pre_op_features)

84

### Plan for Saturday
- start a new file - save as
- mask `raw_data.copy()` by `pre_op_features`
- recode features - new levels, replacing `NaN`s
- before creating `dummies` - recode to new numeric levels for `decision tree` models
- create `dummies`
- rename columns
- put together `feature matrix`
- split by dates in terms of the databases

## Saturday - October 5, 2019

#### Step 1. Delete records where `age`, `gender`, `heightcm` or `weightkg` are `NaN`

In [66]:
raw_data.head(1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStIn
n,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [67]:
raw_data.shape

(42746, 409)

#### Checking Key Variables for `NaN`s

In [68]:
# Print the number of missing values in each key variable, one count per line,
# in the order the columns are listed here.
key_columns = ['age', 'gender', 'surgdt', 'dischdt', 'heightcm', 'weightkg', 'predstro']
for key_column in key_columns:
    print(raw_data[key_column].isnull().sum())

0
3
0
0
2
3
3


- going to delete rows where there are `NaN`s in these key features - `gender`, `heightcm` and `weightkg`

In [69]:
# Show the full records whose `gender` is missing
raw_data.loc[raw_data['gender'].isna()]

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStIn
n,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
29564,29565,71,,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2015-12-28,2016-01-16,185.0,82.0,2.0,,2.0,2.0,2,2.0,,5.0,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,14.1,40.9,1.02,4.0,4.8,3.0,1.1,7.66,,,,,,1.0,2.0,2.0,,,,,1.0,20.0,1.0,,,,,,7.0,7.0,1.0,2.0,,2.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,,,,,,2.0,1.0,2,,2.0,,2,2.0,,,2.0,,,,2.0,,,,2.0,2.0,,1.0,,2.0,,1.0,45.0,1.0,34.0,52.0,1.0,46.0,2.0,,,,,,,,,,,,4.0,2.0,,,1.0,,,,1.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,2,1,1.0,2.0,2.0,2015-12-28,2015-12-28,,,,,34.0,3.0,31.0,199.0,3,,,2.0,1.0,2.0,2.0,2.0,337.0,2.0,,,,,,2.0,312.0,4.0,1.0,2.0,2.0,1.0,,2.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,1.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2.0,,,,,,,,,,,,,,,,,3.0,2.0,2.0,1.0,261.0,2.0,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,1.0,2.0,,2.0,140.0,1.2,,,2.0,,1.0,1.0,2.0,1.0,1,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1.0,1.0,2016-01-21,999.0,,,,,,,0
29703,29704,68,,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2016-02-19,2016-02-23,177.0,94.0,2.0,,2.0,2.0,2,2.0,,5.0,3.0,1.0,2.0,2.0,2.0,2.0,4.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,,2.0,3.0,,3.0,2.0,14.2,41.5,0.91,3.7,5.5,3.0,1.0,6.4,,,,,,2.0,,,,,,,,,,,,,,,7.0,7.0,1.0,2.0,,2.0,2.0,2.0,2.0,,,,,,,,,,2.0,2.0,2,,2.0,,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,2.0,2.0,1.0,,2.0,,1.0,60.0,1.0,32.0,45.0,2.0,,1.0,1.0,1.0,0.4,61.0,5.0,2.0,,,,,,1.0,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,2,1,1.0,2.0,2.0,2016-02-19,2016-02-19,,,,,32.0,3.0,29.0,228.0,3,,,2.0,1.0,2.0,2.0,2.0,134.0,2.0,,,,,,2.0,109.0,4.0,1.0,2.0,2.0,1.0,,2.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,1.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,,1.0,2.0,,,,,,,,,2.0,,,,,2.0,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,132.0,1.0,,,2.0,,2.0,2.0,2.0,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,2.0,NaT,,,,,,,,0
29991,29992,54,,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2016-06-24,2016-06-28,182.8,75.1,2.0,,1.0,2.0,1,2.0,,2.0,1.0,,2.0,2.0,2.0,3.0,1.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,,2.0,2.0,3.0,,2.0,16.1,45.2,1.06,3.6,5.7,3.0,1.0,6.96,,,,,,1.0,2.0,2.0,,,,,2.0,,,,,,,,5.0,5.0,4.0,2.0,,2.0,2.0,2.0,2.0,,,,,,,,,,2.0,1.0,1,2.0,2.0,,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,1.0,2.0,4.0,,2.0,,1.0,55.0,1.0,32.0,43.0,1.0,27.8,2.0,,,,,,,,,,,,2.0,2.0,,,1.0,,,,1.0,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2.0,2.0,1.0,2.0,,3,2,1.0,2.0,2.0,2016-06-24,2016-06-24,,,,,35.0,3.0,29.0,151.0,3,,,2.0,1.0,2.0,2.0,2.0,113.0,2.0,,,,,,2.0,91.0,2.0,1.0,2.0,2.0,3.0,,,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,2.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,4.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,148.0,1.1,,,2.0,,2.0,2.0,2.0,1.0,1,1,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,2.0,NaT,,,,,,,,0


In [70]:
# Shape of the raw frame restricted to rows where `gender` is recorded.
raw_data.loc[raw_data['gender'].notnull(), :].shape

(42743, 409)

In [71]:
# Rows of the raw frame where `heightcm` is missing.
raw_data.loc[raw_data['heightcm'].isnull()]

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStIn
n,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
22182,22183,65,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2015-04-06,2015-04-08,,,1.0,7.0,3.0,2.0,3,2.0,,6.0,6.0,,5.0,3.0,3.0,3.0,5.0,3.0,3,3.0,3.0,3.0,3.0,3.0,1.0,3.0,,,,,,,,,,,,,3.0,,,,,,,,1.0,2.0,2.0,,,,,2.0,,,,,,,,5.0,5.0,4.0,3.0,,3.0,3.0,4.0,1.0,2.0,3.0,1.0,1.0,,,,,,2.0,2.0,2,,1.0,1.0,2,2.0,,,2.0,,,,2.0,,,2.0,1.0,2.0,2.0,4.0,70.0,2.0,,2.0,,2.0,,,2.0,,2.0,,,,,,,,,,,,2.0,2.0,,,1.0,,,,1.0,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,1.0,1.0,2.0,,3,2,1.0,2.0,2.0,2015-04-06,2015-04-07,,,,,33.7,3.0,22.0,509.0,3,,,2.0,1.0,2.0,2.0,2.0,78.0,2.0,,,,,,2.0,63.0,2.0,1.0,2.0,2.0,1.0,,2.0,2.0,2.0,1.0,4.0,4.0,1.0,0.0,4.0,,1.0,2.0,1.0,1.0,2.0,,,1.0,1.0,2.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,3.0,2015-04-08,1.0,1.0,2,1.0,2.0,,,,,,,,,192.0,2.5,,,2.0,,2.0,2.0,2.0,1.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1.0,,NaT,,,,,,0.031,0.107,0
29600,29601,63,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2016-01-15,2016-01-21,,,1.0,4.0,1.0,2.0,1,2.0,,1.0,1.0,,2.0,2.0,2.0,2.0,4.0,2.0,1,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,4.0,2.0,1.0,,,2.0,14.4,43.7,1.45,4.0,8.1,3.0,1.0,9.97,,,,,,1.0,2.0,2.0,,,,,2.0,,,,,,,,3.0,3.0,4.0,2.0,,2.0,2.0,2.0,2.0,,,,,,,,,,2.0,1.0,2,,2.0,,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,1.0,2.0,4.0,20.0,2.0,,1.0,55.0,2.0,,,2.0,,1.0,2.0,,,,1.0,,,,,,,0.0,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,3,2,1.0,2.0,2.0,2016-01-15,2016-01-15,,,,,34.0,3.0,29.0,258.0,3,,,2.0,1.0,2.0,2.0,2.0,162.0,2.0,,,,,,2.0,123.0,4.0,1.0,1.0,2.0,1.0,,2.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,2.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,198.0,1.6,,,2.0,,2.0,2.0,2.0,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,1.0,2016-02-01,998.0,,,,,0.026,0.086,0


In [72]:
# Rows of the raw frame where `weightkg` is missing.
raw_data.loc[raw_data['weightkg'].isnull()]

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStIn
n,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
8710,8711,62,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2016-12-14,2016-12-22,170.2,,2.0,,1.0,2.0,1,2.0,,5.0,1.0,,2.0,2.0,2.0,5.0,4.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,4.0,2.0,1.0,,,2.0,21.1,35.8,1.16,4.7,,3.0,1.08,8.69,,,,,,2.0,,,,,,,,,,,,,,,7.0,7.0,2.0,1.0,3.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,,,,,,2.0,1.0,2,,2.0,,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,1.0,2.0,1.0,,2.0,,1.0,57.5,1.0,40.1,55.0,1.0,37.0,1.0,1.0,1.0,0.5,48.0,5.0,2.0,,,,,,2.0,2.0,,,1.0,,,,1.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,2,1,1.0,1.0,2.0,2016-12-14,2016-12-14,,,,,36.0,3.0,24.0,147.0,3,,,2.0,1.0,2.0,2.0,2.0,100.0,2.0,,,,,,2.0,68.0,4.0,1.0,2.0,2.0,1.0,,1.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,1.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,,1.0,2.0,,,,,,,,,2.0,,,,,2.0,,,2.0,,,,2.0,,,1.0,,,,2.0,2.0,2.0,,,,1.0,2.0,2.0,2.0,,,,,,,,,,,,,1.0,1.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,1.0,,2.0,108.0,1.1,,,2.0,,1.0,1.0,2.0,1.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2.0,2.0,NaT,,,,,,0.014,0.025,0
22182,22183,65,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2015-04-06,2015-04-08,,,1.0,7.0,3.0,2.0,3,2.0,,6.0,6.0,,5.0,3.0,3.0,3.0,5.0,3.0,3,3.0,3.0,3.0,3.0,3.0,1.0,3.0,,,,,,,,,,,,,3.0,,,,,,,,1.0,2.0,2.0,,,,,2.0,,,,,,,,5.0,5.0,4.0,3.0,,3.0,3.0,4.0,1.0,2.0,3.0,1.0,1.0,,,,,,2.0,2.0,2,,1.0,1.0,2,2.0,,,2.0,,,,2.0,,,2.0,1.0,2.0,2.0,4.0,70.0,2.0,,2.0,,2.0,,,2.0,,2.0,,,,,,,,,,,,2.0,2.0,,,1.0,,,,1.0,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,1.0,1.0,2.0,,3,2,1.0,2.0,2.0,2015-04-06,2015-04-07,,,,,33.7,3.0,22.0,509.0,3,,,2.0,1.0,2.0,2.0,2.0,78.0,2.0,,,,,,2.0,63.0,2.0,1.0,2.0,2.0,1.0,,2.0,2.0,2.0,1.0,4.0,4.0,1.0,0.0,4.0,,1.0,2.0,1.0,1.0,2.0,,,1.0,1.0,2.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,3.0,2015-04-08,1.0,1.0,2,1.0,2.0,,,,,,,,,192.0,2.5,,,2.0,,2.0,2.0,2.0,1.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1.0,,NaT,,,,,,0.031,0.107,0
29600,29601,63,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2016-01-15,2016-01-21,,,1.0,4.0,1.0,2.0,1,2.0,,1.0,1.0,,2.0,2.0,2.0,2.0,4.0,2.0,1,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,4.0,2.0,1.0,,,2.0,14.4,43.7,1.45,4.0,8.1,3.0,1.0,9.97,,,,,,1.0,2.0,2.0,,,,,2.0,,,,,,,,3.0,3.0,4.0,2.0,,2.0,2.0,2.0,2.0,,,,,,,,,,2.0,1.0,2,,2.0,,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,1.0,2.0,4.0,20.0,2.0,,1.0,55.0,2.0,,,2.0,,1.0,2.0,,,,1.0,,,,,,,0.0,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,3,2,1.0,2.0,2.0,2016-01-15,2016-01-15,,,,,34.0,3.0,29.0,258.0,3,,,2.0,1.0,2.0,2.0,2.0,162.0,2.0,,,,,,2.0,123.0,4.0,1.0,1.0,2.0,1.0,,2.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,2.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,198.0,1.6,,,2.0,,2.0,2.0,2.0,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,1.0,2016-02-01,998.0,,,,,0.026,0.086,0


- across the key features with `NaN`s (`gender`, `heightcm`, `weightkg`), we will be deleting `6` rows in total

In [73]:
# Rows missing at least one of the key features (gender, height, weight):
# build one boolean mask across the three columns instead of OR-ing three masks.
raw_data[raw_data[['gender', 'heightcm', 'weightkg']].isnull().any(axis=1)]

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStIn
n,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
8710,8711,62,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2016-12-14,2016-12-22,170.2,,2.0,,1.0,2.0,1,2.0,,5.0,1.0,,2.0,2.0,2.0,5.0,4.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,4.0,2.0,1.0,,,2.0,21.1,35.8,1.16,4.7,,3.0,1.08,8.69,,,,,,2.0,,,,,,,,,,,,,,,7.0,7.0,2.0,1.0,3.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,,,,,,2.0,1.0,2,,2.0,,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,1.0,2.0,1.0,,2.0,,1.0,57.5,1.0,40.1,55.0,1.0,37.0,1.0,1.0,1.0,0.5,48.0,5.0,2.0,,,,,,2.0,2.0,,,1.0,,,,1.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,2,1,1.0,1.0,2.0,2016-12-14,2016-12-14,,,,,36.0,3.0,24.0,147.0,3,,,2.0,1.0,2.0,2.0,2.0,100.0,2.0,,,,,,2.0,68.0,4.0,1.0,2.0,2.0,1.0,,1.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,1.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,,1.0,2.0,,,,,,,,,2.0,,,,,2.0,,,2.0,,,,2.0,,,1.0,,,,2.0,2.0,2.0,,,,1.0,2.0,2.0,2.0,,,,,,,,,,,,,1.0,1.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,1.0,,2.0,108.0,1.1,,,2.0,,1.0,1.0,2.0,1.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2.0,2.0,NaT,,,,,,0.014,0.025,0
22182,22183,65,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2015-04-06,2015-04-08,,,1.0,7.0,3.0,2.0,3,2.0,,6.0,6.0,,5.0,3.0,3.0,3.0,5.0,3.0,3,3.0,3.0,3.0,3.0,3.0,1.0,3.0,,,,,,,,,,,,,3.0,,,,,,,,1.0,2.0,2.0,,,,,2.0,,,,,,,,5.0,5.0,4.0,3.0,,3.0,3.0,4.0,1.0,2.0,3.0,1.0,1.0,,,,,,2.0,2.0,2,,1.0,1.0,2,2.0,,,2.0,,,,2.0,,,2.0,1.0,2.0,2.0,4.0,70.0,2.0,,2.0,,2.0,,,2.0,,2.0,,,,,,,,,,,,2.0,2.0,,,1.0,,,,1.0,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,1.0,1.0,2.0,,3,2,1.0,2.0,2.0,2015-04-06,2015-04-07,,,,,33.7,3.0,22.0,509.0,3,,,2.0,1.0,2.0,2.0,2.0,78.0,2.0,,,,,,2.0,63.0,2.0,1.0,2.0,2.0,1.0,,2.0,2.0,2.0,1.0,4.0,4.0,1.0,0.0,4.0,,1.0,2.0,1.0,1.0,2.0,,,1.0,1.0,2.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,3.0,2015-04-08,1.0,1.0,2,1.0,2.0,,,,,,,,,192.0,2.5,,,2.0,,2.0,2.0,2.0,1.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1.0,,NaT,,,,,,0.031,0.107,0
29564,29565,71,,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2015-12-28,2016-01-16,185.0,82.0,2.0,,2.0,2.0,2,2.0,,5.0,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,14.1,40.9,1.02,4.0,4.8,3.0,1.1,7.66,,,,,,1.0,2.0,2.0,,,,,1.0,20.0,1.0,,,,,,7.0,7.0,1.0,2.0,,2.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,,,,,,2.0,1.0,2,,2.0,,2,2.0,,,2.0,,,,2.0,,,,2.0,2.0,,1.0,,2.0,,1.0,45.0,1.0,34.0,52.0,1.0,46.0,2.0,,,,,,,,,,,,4.0,2.0,,,1.0,,,,1.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,2,1,1.0,2.0,2.0,2015-12-28,2015-12-28,,,,,34.0,3.0,31.0,199.0,3,,,2.0,1.0,2.0,2.0,2.0,337.0,2.0,,,,,,2.0,312.0,4.0,1.0,2.0,2.0,1.0,,2.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,1.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2.0,,,,,,,,,,,,,,,,,3.0,2.0,2.0,1.0,261.0,2.0,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,1.0,2.0,,2.0,140.0,1.2,,,2.0,,1.0,1.0,2.0,1.0,1,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1.0,1.0,2016-01-21,999.0,,,,,,,0
29600,29601,63,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2016-01-15,2016-01-21,,,1.0,4.0,1.0,2.0,1,2.0,,1.0,1.0,,2.0,2.0,2.0,2.0,4.0,2.0,1,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,4.0,2.0,1.0,,,2.0,14.4,43.7,1.45,4.0,8.1,3.0,1.0,9.97,,,,,,1.0,2.0,2.0,,,,,2.0,,,,,,,,3.0,3.0,4.0,2.0,,2.0,2.0,2.0,2.0,,,,,,,,,,2.0,1.0,2,,2.0,,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,1.0,2.0,4.0,20.0,2.0,,1.0,55.0,2.0,,,2.0,,1.0,2.0,,,,1.0,,,,,,,0.0,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,3,2,1.0,2.0,2.0,2016-01-15,2016-01-15,,,,,34.0,3.0,29.0,258.0,3,,,2.0,1.0,2.0,2.0,2.0,162.0,2.0,,,,,,2.0,123.0,4.0,1.0,1.0,2.0,1.0,,2.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,2.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,198.0,1.6,,,2.0,,2.0,2.0,2.0,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,1.0,2016-02-01,998.0,,,,,0.026,0.086,0
29703,29704,68,,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2016-02-19,2016-02-23,177.0,94.0,2.0,,2.0,2.0,2,2.0,,5.0,3.0,1.0,2.0,2.0,2.0,2.0,4.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,,2.0,3.0,,3.0,2.0,14.2,41.5,0.91,3.7,5.5,3.0,1.0,6.4,,,,,,2.0,,,,,,,,,,,,,,,7.0,7.0,1.0,2.0,,2.0,2.0,2.0,2.0,,,,,,,,,,2.0,2.0,2,,2.0,,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,2.0,2.0,1.0,,2.0,,1.0,60.0,1.0,32.0,45.0,2.0,,1.0,1.0,1.0,0.4,61.0,5.0,2.0,,,,,,1.0,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,2,1,1.0,2.0,2.0,2016-02-19,2016-02-19,,,,,32.0,3.0,29.0,228.0,3,,,2.0,1.0,2.0,2.0,2.0,134.0,2.0,,,,,,2.0,109.0,4.0,1.0,2.0,2.0,1.0,,2.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,1.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,,1.0,2.0,,,,,,,,,2.0,,,,,2.0,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,132.0,1.0,,,2.0,,2.0,2.0,2.0,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,2.0,NaT,,,,,,,,0
29991,29992,54,,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2016-06-24,2016-06-28,182.8,75.1,2.0,,1.0,2.0,1,2.0,,2.0,1.0,,2.0,2.0,2.0,3.0,1.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,,2.0,2.0,3.0,,2.0,16.1,45.2,1.06,3.6,5.7,3.0,1.0,6.96,,,,,,1.0,2.0,2.0,,,,,2.0,,,,,,,,5.0,5.0,4.0,2.0,,2.0,2.0,2.0,2.0,,,,,,,,,,2.0,1.0,1,2.0,2.0,,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,1.0,2.0,4.0,,2.0,,1.0,55.0,1.0,32.0,43.0,1.0,27.8,2.0,,,,,,,,,,,,2.0,2.0,,,1.0,,,,1.0,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2.0,2.0,1.0,2.0,,3,2,1.0,2.0,2.0,2016-06-24,2016-06-24,,,,,35.0,3.0,29.0,151.0,3,,,2.0,1.0,2.0,2.0,2.0,113.0,2.0,,,,,,2.0,91.0,2.0,1.0,2.0,2.0,3.0,,,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,2.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,4.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,148.0,1.1,,,2.0,,2.0,2.0,2.0,1.0,1,1,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,2.0,NaT,,,,,,,,0


#### Implementing Row Deletion

In [74]:
# Drop every row that is missing any of the key features (gender, height, weight).
# The copy is taken AFTER filtering: copying first duplicates the whole raw frame
# for nothing, and the filtered result would still not be an explicit, independent
# copy. `raw_data` itself is left untouched.
working_data = raw_data.dropna(subset=['gender', 'heightcm', 'weightkg']).copy()

- confirming that the deletion was performed correctly

In [75]:
# Sanity check after the row deletion: print the frame's shape, then the NaN
# count of each checked column, one value per line.
print(working_data.shape)
print(*(working_data[name].isnull().sum()
        for name in ('gender', 'heightcm', 'weightkg', 'predstro')),
      sep='\n')

(42740, 409)
0
0
0
0


- these key `numerical` and `date` features should have no `NaN`s

In [76]:
# NaN counts for the key numerical/date columns, one value per line.
print(*(working_data[name].isnull().sum()
        for name in ('age', 'surgdt', 'dischdt')),
      sep='\n')

0
0
0


#### Exporting `working_data` to `Excel` - TAKES TOO LONG - DOES NOT SEEM TO BE WORKING

In [77]:
# Disabled on purpose: the Excel export of the full frame took too long and did
# not appear to complete (see the section heading). Kept for reference only.
# working_data.to_excel("capstone_data_key_variable_nulls_cleaned.xlsx")

#### Saving `working_data` as a `.pkl` File
- to reload the saved frame later: `pd.read_pickle("Filename.pkl")`

In [78]:
# NOTE(review): pickle export is left disabled — presumably so that re-running the
# notebook does not overwrite the existing reference (`_REF`) file; confirm before
# re-enabling.
# working_data.to_pickle('capstone_data_key_variable_nulls_cleaned_REF.pkl')

### Step 2. Cleaning and Recoding Variables
- selecting `pre-op` features from main dataset

In [79]:
# Preview the first row of the cleaned frame (all columns).
working_data.iloc[:1]

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStIn
n,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [80]:
working_data.shape

(42740, 409)

In [81]:
working_data['predstro'].describe()

count    42740.00000
mean         0.01455
std          0.01173
min          0.00100
25%          0.00700
50%          0.01100
75%          0.01900
max          0.21300
Name: predstro, dtype: float64

In [82]:
# Select the pre-operative columns first, then copy: copying `working_data`
# up front duplicates every column only to discard most of them.
pre_op_df = working_data[pre_op_features].copy()

In [83]:
pre_op_df.head()

Unnamed: 0,recordId,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,predstro,strokeBin,surgdt,dischdt,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,hmo2,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,hitanti,cigsmoker,cigsmokercurr,prcvint,prcab,prvalve,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen,arrhyafib,medasa,medaplt5days,medinotr,medlipid,numdisv,hdefd,vdaort,vdstena,vdinsufm,vdstenm,vdinsuft,incidenc,status
0,1,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0.014,0,2011-07-01,2011-07-06,1.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,3.0,1.0,1.0,2.0,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,1.0,,1.0,2,2.0,1.0,4.0,1.0,1.0,2.0,4.0,2.0,2.0,1.0,1.0
1,2,65,175.3,79.4,45.0,1.2,,,,55.0,40.0,0.017,0,2011-07-02,2011-07-09,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,,1.0,2.0,1,2.0,,,1.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,3.0,2.0,,1.0,2.0,2.0,,,5.0,2.0,,2.0,2.0,2.0,,,,,3.0,2.0,1.0,2,2.0,2.0,4.0,1.0,1.0,2.0,3.0,2.0,2.0,1.0,3.0
2,3,83,162.60001,102.1,29.0,1.2,3.3,6.2,8.6,60.0,36.0,0.045,0,2011-07-04,2011-07-12,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,,1.0,2.0,1,2.0,,,1.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,,1.0,2.0,1.0,1.0,2.0,1.0,1.0,,,2.0,3.0,2.0,,1.0,2.0,2.0,,,1.0,2.0,,2.0,2.0,2.0,,,,,3.0,2.0,1.0,1,1.0,1.0,4.0,1.0,1.0,1.0,3.0,2.0,3.0,1.0,2.0
3,4,59,160.0,127.5,35.0,0.9,3.5,7.4,6.4,60.0,35.0,0.013,0,2011-07-05,2011-07-09,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,4.0,2.0,2.0,1,2.0,,,1.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,1.0,,2.0,2.0,2.0,,,,,,,,3.0,2.0,,2.0,,,,,1.0,2.0,,2.0,2.0,2.0,,,,,1.0,,2.0,2,2.0,1.0,2.0,1.0,2.0,,4.0,2.0,2.0,1.0,1.0
4,5,72,160.0,64.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,0.016,0,2011-07-06,2011-07-10,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,,1.0,2.0,1,2.0,,,1.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,3.0,2.0,,2.0,,,,,5.0,2.0,,2.0,2.0,2.0,,,,,1.0,,1.0,2,2.0,1.0,4.0,1.0,2.0,,,,,1.0,2.0


In [84]:
pre_op_df.shape

(42740, 84)

- making a copy of `pre_op_df` to check later whether recoding was done correctly

In [85]:
orig_pre = pre_op_df.copy()

In [86]:
orig_pre.shape

(42740, 84)

### Step 2a. Splitting Data by 2.73 and 2.81 Dates and Checking Variables with All `NaN`s
- if feature has all `NaN`s in one of the splits, then you know that particular feature was applicable ONLY to that database

In [87]:
split_dates = pre_op_df.copy()

In [88]:
split_dates.head(1)

Unnamed: 0,recordId,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,predstro,strokeBin,surgdt,dischdt,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,hmo2,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,hitanti,cigsmoker,cigsmokercurr,prcvint,prcab,prvalve,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen,arrhyafib,medasa,medaplt5days,medinotr,medlipid,numdisv,hdefd,vdaort,vdstena,vdinsufm,vdstenm,vdinsuft,incidenc,status
0,1,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0.014,0,2011-07-01,2011-07-06,1.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,3.0,1.0,1.0,2.0,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,1.0,,1.0,2,2.0,1.0,4.0,1.0,1.0,2.0,4.0,2.0,2.0,1.0,1.0


In [89]:
split_dates.shape

(42740, 84)

In [90]:
# 2.73 split: rows whose `surgdt` is strictly before the 2014-01-01 cutoff
v273_df = split_dates[split_dates['surgdt'] < '2014-01-01']

In [91]:
v273_df.shape

(19756, 84)

In [92]:
# 2.81 split: rows whose `surgdt` is on or after the 2014-01-01 cutoff
v281_df = split_dates[split_dates['surgdt'] >= '2014-01-01']

In [93]:
v281_df.shape

(22984, 84)

#### Looking at the number of `NaN`s per feature in 2.73

In [94]:
# Column names of the 2.73 split, paired (by position) with each column's NaN count.
col_names273 = list(v273_df.columns)
num_nulls273 = [v273_df[name].isnull().sum() for name in col_names273]

In [95]:
print(len(col_names273))
print(len(num_nulls273))

84
84


In [96]:
# One row per feature with its NaN count in the 2.73 split,
# ranked from the most to the fewest missing values.
nulls273_df = (
    pd.DataFrame({'feature': col_names273, 'number_of_nulls': num_nulls273},
                 columns=['feature', 'number_of_nulls'])
    .sort_values(by=['number_of_nulls'], ascending=False)
)

#### `2.73` features with all `NaN`s - conclude that these features only apply to `2.81`

In [97]:
v273_df.shape[0]

19756

In [98]:
nulls273_df[nulls273_df['number_of_nulls'] == v273_df.shape[0]]

Unnamed: 0,feature,number_of_nulls
66,ArrhythAFlutter,19756
58,CardSympTimeOfSurg,19756
57,CardSympTimeOfAdm,19756
29,TobaccoUse,19756
67,ArrhythAFib,19756
40,ThAoDisease,19756
68,ArrhythAFibDur,19756
65,Arrhythmia,19756


#### Looking at the number of `NaN`s per feature in 2.81

In [99]:
# Column names of the 2.81 split, paired (by position) with each column's NaN count.
col_names281 = list(v281_df.columns)
num_nulls281 = [v281_df[name].isnull().sum() for name in col_names281]

In [100]:
print(len(col_names281))
print(len(num_nulls281))

84
84


In [101]:
# One row per feature with its NaN count in the 2.81 split,
# ranked from the most to the fewest missing values.
nulls281_df = (
    pd.DataFrame({'feature': col_names281, 'number_of_nulls': num_nulls281},
                 columns=['feature', 'number_of_nulls'])
    .sort_values(by=['number_of_nulls'], ascending=False)
)

#### `2.81` features with all `NaN`s - conclude that these features only apply to `2.73`

In [102]:
v281_df.shape[0]

22984

In [103]:
nulls281_df[nulls281_df['number_of_nulls'] == v281_df.shape[0]]

Unnamed: 0,feature,number_of_nulls


- there do not appear to be any features that are all `NaN` in `2.81` (i.e., none that apply only to `2.73`), although there are features with a large number of `NaN`s

In [104]:
nulls281_df.head(10)

Unnamed: 0,feature,number_of_nulls
70,arrhyafib,22607
28,infendty,22544
68,ArrhythAFibDur,22198
53,cigsmokercurr,21970
49,cvdstenlft,21290
48,cvdstenrt,21266
45,cvawhen,21180
66,ArrhythAFlutter,19588
67,ArrhythAFib,19585
52,cigsmoker,19141


- going to spot check whether same level of `NaN`s in `2.73`

In [105]:
v273_df.shape[0]

19756

In [106]:
list(nulls281_df.head(10)['feature'])

['arrhyafib',
 'infendty',
 'ArrhythAFibDur',
 'cigsmokercurr',
 'cvdstenlft',
 'cvdstenrt',
 'cvawhen',
 'ArrhythAFlutter',
 'ArrhythAFib',
 'cigsmoker']

In [107]:
# NaN count and NaN percentage in the 2.81 split for the 30 features
# with the most missing values in 2.81.
n_rows_281 = v281_df.shape[0]
for feature in nulls281_df.head(30)['feature']:
    n_missing = v281_df[feature].isnull().sum()
    print('{0:20} {1:10d} - {2:.1f}'.format(feature, n_missing, n_missing / n_rows_281 * 100))

arrhyafib                 22607 - 98.4
infendty                  22544 - 98.1
ArrhythAFibDur            22198 - 96.6
cigsmokercurr             21970 - 95.6
cvdstenlft                21290 - 92.6
cvdstenrt                 21266 - 92.5
cvawhen                   21180 - 92.2
ArrhythAFlutter           19588 - 85.2
ArrhythAFib               19585 - 85.2
cigsmoker                 19141 - 83.3
arrhythwhen               19140 - 83.3
classnyh                  18491 - 80.5
cvdcarsten                17915 - 77.9
cvdpcarsurg               17906 - 77.9
cvdtia                    17905 - 77.9
cva                       17904 - 77.9
vdstena                   15551 - 67.7
prvalve                   15303 - 66.6
prcab                     15302 - 66.6
vdstenm                   13336 - 58.0
diabctrl                  13191 - 57.4
pasys                     10716 - 46.6
Arrhythmia                 3849 - 16.7
CardSympTimeOfAdm          3845 - 16.7
TobaccoUse                 3845 - 16.7
ThAoDisease              

In [108]:
# Same 30 features (still ordered by their 2.81 NaN counts), but now reporting
# the NaN count and NaN percentage observed in the 2.73 split.
n_rows_273 = v273_df.shape[0]
for feature in nulls281_df.head(30)['feature']:
    n_missing = v273_df[feature].isnull().sum()
    print('{0:20} {1:10d} - {2:.1f}'.format(feature, n_missing, n_missing / n_rows_273 * 100))

arrhyafib                 17553 - 88.8
infendty                  19445 - 98.4
ArrhythAFibDur            19756 - 100.0
cigsmokercurr             14100 - 71.4
cvdstenlft                19502 - 98.7
cvdstenrt                 19466 - 98.5
cvawhen                   18273 - 92.5
ArrhythAFlutter           19756 - 100.0
ArrhythAFib               19756 - 100.0
cigsmoker                     0 - 0.0
arrhythwhen                   0 - 0.0
classnyh                  15929 - 80.6
cvdcarsten                16561 - 83.8
cvdpcarsurg               16515 - 83.6
cvdtia                    16514 - 83.6
cva                       16514 - 83.6
vdstena                   12547 - 63.5
prvalve                   13264 - 67.1
prcab                     13266 - 67.1
vdstenm                    9169 - 46.4
diabctrl                  11596 - 58.7
pasys                      8921 - 45.2
Arrhythmia                19756 - 100.0
CardSympTimeOfAdm         19756 - 100.0
TobaccoUse                19756 - 100.0
ThAoDisease          

- given the percentage differences between `2.81` and `2.73` the features where the definition may have changed are:
- `arrhyafib`, `cigsmokercurr`, `cigsmoker`, `arrhythwhen`
- rest of the features with most `NaN`s look similar, but this is only a spot check

#### Formalizing Above
- `2.81`

In [109]:
# For every feature (in nulls281_df order): its name, NaN count and NaN percentage in 2.81.
nan_col_name281 = list(nulls281_df['feature'])
num_nan281 = [v281_df[name].isnull().sum() for name in nan_col_name281]
pct_nan281 = [count / v281_df.shape[0] * 100 for count in num_nan281]

In [110]:
# Tabulate the 2.81 NaN statistics, one row per feature.
pct_null281_df = pd.DataFrame(
    {'feature': nan_col_name281,
     'number_of_nulls_281': num_nan281,
     'pct_null_281': pct_nan281},
    columns=['feature', 'number_of_nulls_281', 'pct_null_281'],
)

In [111]:
pct_null281_df.head()

Unnamed: 0,feature,number_of_nulls_281,pct_null_281
0,arrhyafib,22607,98.35973
1,infendty,22544,98.08562
2,ArrhythAFibDur,22198,96.58023
3,cigsmokercurr,21970,95.58824
4,cvdstenlft,21290,92.62966


In [112]:
pct_null281_df.shape

(84, 3)

- `2.73`

In [113]:
# For every feature (deliberately in the same nulls281_df order as the 2.81 lists):
# its name, NaN count and NaN percentage in 2.73.
nan_col_name273 = list(nulls281_df['feature'])
num_nan273 = [v273_df[name].isnull().sum() for name in nan_col_name273]
pct_nan273 = [count / v273_df.shape[0] * 100 for count in num_nan273]

In [114]:
# Tabulate the 2.73 NaN statistics, one row per feature.
pct_null273_df = pd.DataFrame(
    {'feature': nan_col_name273,
     'number_of_nulls_273': num_nan273,
     'pct_null_273': pct_nan273},
    columns=['feature', 'number_of_nulls_273', 'pct_null_273'],
)

In [115]:
pct_null273_df.head()

Unnamed: 0,feature,number_of_nulls_273,pct_null_273
0,arrhyafib,17553,88.84896
1,infendty,19445,98.42579
2,ArrhythAFibDur,19756,100.0
3,cigsmokercurr,14100,71.37072
4,cvdstenlft,19502,98.71431


In [116]:
pct_null273_df.shape

(84, 3)

- joining the 2 `dataframes` on `feature` then taking the delta in `pct_null` to identify problem features

In [117]:
nan_merge_df = pd.merge(pct_null281_df, pct_null273_df, on='feature')

In [118]:
nan_merge_df.head()

Unnamed: 0,feature,number_of_nulls_281,pct_null_281,number_of_nulls_273,pct_null_273
0,arrhyafib,22607,98.35973,17553,88.84896
1,infendty,22544,98.08562,19445,98.42579
2,ArrhythAFibDur,22198,96.58023,19756,100.0
3,cigsmokercurr,21970,95.58824,14100,71.37072
4,cvdstenlft,21290,92.62966,19502,98.71431


In [119]:
nan_merge_df['pct_diff'] = nan_merge_df['pct_null_281'] - nan_merge_df['pct_null_273']

In [120]:
nan_merge_df['abs_pct_diff'] = nan_merge_df['pct_diff'].abs()

In [121]:
nan_merge_df = nan_merge_df.sort_values(by=['abs_pct_diff'],
                                        ascending=False).round(1)

In [122]:
nan_merge_df.head(20)

Unnamed: 0,feature,number_of_nulls_281,pct_null_281,number_of_nulls_273,pct_null_273,pct_diff,abs_pct_diff
9,cigsmoker,19141,83.3,0,0.0,83.3,83.3
26,CardSympTimeOfSurg,3844,16.7,19756,100.0,-83.3,83.3
10,arrhythwhen,19140,83.3,0,0.0,83.3,83.3
23,CardSympTimeOfAdm,3845,16.7,19756,100.0,-83.3,83.3
24,TobaccoUse,3845,16.7,19756,100.0,-83.3,83.3
25,ThAoDisease,3845,16.7,19756,100.0,-83.3,83.3
22,Arrhythmia,3849,16.7,19756,100.0,-83.3,83.3
30,vdinsuft,1737,7.6,10080,51.0,-43.5,43.5
31,vdinsufm,1463,6.4,8704,44.1,-37.7,37.7
3,cigsmokercurr,21970,95.6,14100,71.4,24.2,24.2


In [123]:
nan_merge_df[nan_merge_df['feature'] == 'ArrhythAFibDur']

Unnamed: 0,feature,number_of_nulls_281,pct_null_281,number_of_nulls_273,pct_null_273,pct_diff,abs_pct_diff
2,ArrhythAFibDur,22198,96.6,19756,100.0,-3.4,3.4


- the `problem` columns are the ones where the number of `NaN`s is 100% in one database (in this case all were in `2.73`), but not the other - these features apply ONLY to one version of the database:

- `TobaccoUse`, `ThAoDisease`, `CardSympTimeOfAdm`, `CardSympTimeOfSurg`, `Arrhythmia`, `ArrhythAFlutter`, `ArrhythAFib`, `ArrhythAFibDur`

- in addition, where there is a large disparity in the percentage of `NaN`s between the two databases, this may indicate that the definition of that condition has changed between those two databases
- arbitrarily defined large as a 10% difference (excluding the 100% `NaN` case above) - see `.csv` file for more details
- potentially problematic features are:
- `cigsmoker`, `arrhythwhen`, `vdinsuft`, `vdinsufm`, `cigsmokercurr`, `vdstenm`, `arrhyafib`

- exporting to `.csv` file

In [124]:
nan_merge_df.to_csv('273_vs_281_null_count_by_feature.csv')

### Recoding `Y/N/U` Features

- `yes_no_unc` Feature List

In [125]:
yes_no_unc[0:5]

['gender', 'racecaucasian', 'raceblack', 'raceasian', 'racenativeam']

- going to iterate through the list to recode the columns

In [126]:
# One shared map for every Y/N/U feature: 1 stays 1; 2, 3 and NaN all become 0.
yes_no_unc_map = {1: 1, 2: 0, 3: 0, np.nan: 0}
for yn_feature in yes_no_unc:
    pre_op_df[yn_feature] = pre_op_df[yn_feature].replace(yes_no_unc_map)

- there should be no `NaN`s

In [127]:
pre_op_df[yes_no_unc].isnull().sum()

gender              0
racecaucasian       0
raceblack           0
raceasian           0
racenativeam        0
racnativepacific    0
ethnicity           0
diabetes            0
dyslip              0
dialysis            0
hypertn             0
infendo             0
slpapn              0
liverdis            0
immsupp             0
mediastrad          0
cancer              0
pvd                 0
ThAoDisease         0
syncope             0
unrespstat          0
cvd                 0
cva                 0
cvdtia              0
cvdpcarsurg         0
hitanti             0
cigsmoker           0
cigsmokercurr       0
prcvint             0
prcab               0
prvalve             0
chf                 0
priorhf             0
Arrhythmia          0
arrhyafib           0
medinotr            0
hdefd               0
vdaort              0
vdstena             0
vdstenm             0
dtype: int64

- checking recoding against original in `orig_pre`

In [128]:
# Audit the Y/N/U recode: collect each feature's distinct levels before
# (`orig_pre`) and after (`pre_op_df`) recoding.
# Plain `sorted()` cannot order NaN (every comparison against NaN is False), so
# level lists containing NaN came out in arbitrary order. `Series.sort_values()`
# orders the real values and places NaN last.
feature_name = []
orig_coding = []
new_coding = []

for column in yes_no_unc:
    feature_name.append(column)
    orig_coding.append(orig_pre[column].drop_duplicates().sort_values().tolist())
    new_coding.append(pre_op_df[column].drop_duplicates().sort_values().tolist())

In [129]:
pd.DataFrame(list(zip(feature_name, orig_coding, new_coding)),
             columns = ['feature', 'original_levels', 'new_levels'])

Unnamed: 0,feature,original_levels,new_levels
0,gender,"[1.0, 2.0]","[0.0, 1.0]"
1,racecaucasian,"[1.0, 2.0, nan]","[0.0, 1.0]"
2,raceblack,"[1.0, 2.0, nan]","[0.0, 1.0]"
3,raceasian,"[1.0, 2.0, nan]","[0.0, 1.0]"
4,racenativeam,"[1.0, 2.0, nan]","[0.0, 1.0]"
5,racnativepacific,"[2.0, nan, 1.0]","[0.0, 1.0]"
6,ethnicity,"[1.0, 2.0, nan, 3.0]","[0.0, 1.0]"
7,diabetes,"[1.0, 2.0, nan, 3.0]","[0.0, 1.0]"
8,dyslip,"[1.0, 2.0, nan, 3.0]","[0.0, 1.0]"
9,dialysis,"[1.0, 2.0, nan, 3.0]","[0.0, 1.0]"


In [130]:
len(yes_no_unc)

40

### Recoding `compress_to_two` Features

In [131]:
compress_to_two

['diabctrl',
 'infendty',
 'TobaccoUse',
 'chrlungd',
 'hmo2',
 'ivdrugab',
 'alcohol',
 'cvawhen',
 'carshock',
 'resusc',
 'medasa',
 'medaplt5days',
 'medlipid',
 'numdisv']

- creating `list` of `replacement_dicts`

In [132]:
def _binary_recode(n_levels, positive_levels):
    """Build a {level: 0/1} map for levels 1..n_levels, with NaN mapped to 0.

    Levels listed in `positive_levels` map to 1; every other level maps to 0.
    The NaN key is added last.
    """
    mapping = {level: int(level in positive_levels) for level in range(1, n_levels + 1)}
    mapping[np.nan] = 0
    return mapping


# One map per feature, in the same order as `compress_to_two`.
replacement_dicts = [
    _binary_recode(7, {4}),              # diabctrl
    _binary_recode(3, {2}),              # infendty
    _binary_recode(6, {2, 3, 4, 5, 6}),  # TobaccoUse
    _binary_recode(6, {3, 4}),           # chrlungd
    _binary_recode(5, {3, 4}),           # hmo2
    _binary_recode(5, {4}),              # ivdrugab
    _binary_recode(5, {3}),              # alcohol
    _binary_recode(4, {2, 3, 4}),        # cvawhen
    _binary_recode(4, {3, 4}),           # carshock -- RENAME to `carshock24`
    _binary_recode(4, {3, 4}),           # resusc -- RENAME to `resusc24`
    _binary_recode(4, {1}),              # medasa
    _binary_recode(4, {1}),              # medaplt5days
    _binary_recode(4, {1}),              # medlipid
    _binary_recode(4, {2, 3, 4}),        # numdisv
]

In [133]:
print (len(compress_to_two))
print (len(replacement_dicts))

14
14


- since we need to rename columns, before recoding, want to keep original coding for auditing purposes

In [134]:
pre_op_df.shape

(42740, 84)

In [135]:
# Keep the raw codes for auditing. Read them from the untouched `orig_pre` copy
# rather than from `pre_op_df`, so re-running this cell after the recode below
# cannot silently store the already recoded 0/1 values as the "original".
pre_op_df['carshock_orig'] = orig_pre['carshock']
pre_op_df['resusc_orig'] = orig_pre['resusc']

In [136]:
# added two columns
pre_op_df.shape

(42740, 86)

- now recoding the features in `compress_to_two`

In [137]:
name_replacement_zip = list(zip(compress_to_two, replacement_dicts))

In [138]:
name_replacement_zip

[('diabctrl', {1: 0, 2: 0, 3: 0, 4: 1, 5: 0, 6: 0, 7: 0, nan: 0}),
 ('infendty', {1: 0, 2: 1, 3: 0, nan: 0}),
 ('TobaccoUse', {1: 0, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, nan: 0}),
 ('chrlungd', {1: 0, 2: 0, 3: 1, 4: 1, 5: 0, 6: 0, nan: 0}),
 ('hmo2', {1: 0, 2: 0, 3: 1, 4: 1, 5: 0, nan: 0}),
 ('ivdrugab', {1: 0, 2: 0, 3: 0, 4: 1, 5: 0, nan: 0}),
 ('alcohol', {1: 0, 2: 0, 3: 1, 4: 0, 5: 0, nan: 0}),
 ('cvawhen', {1: 0, 2: 1, 3: 1, 4: 1, nan: 0}),
 ('carshock', {1: 0, 2: 0, 3: 1, 4: 1, nan: 0}),
 ('resusc', {1: 0, 2: 0, 3: 1, 4: 1, nan: 0}),
 ('medasa', {1: 1, 2: 0, 3: 0, 4: 0, nan: 0}),
 ('medaplt5days', {1: 1, 2: 0, 3: 0, 4: 0, nan: 0}),
 ('medlipid', {1: 1, 2: 0, 3: 0, 4: 0, nan: 0}),
 ('numdisv', {1: 0, 2: 1, 3: 1, 4: 1, nan: 0})]

- iterate through `name_replacement_zip` and apply `replacement_dicts` to features in `compress_to_two`

In [139]:
# Recode from the untouched `orig_pre` copy instead of from `pre_op_df` itself.
# Several of these maps send 1 -> 0 (e.g. diabctrl), so re-applying them to the
# already recoded 0/1 values would flip every positive case to 0 if this cell
# were run a second time. Reading from `orig_pre` makes the cell idempotent.
for column, dictionary in name_replacement_zip:
    pre_op_df[column] = orig_pre[column].replace(dictionary)

- checking recoding against original in `orig_pre`

In [140]:
# Audit the compress_to_two recode: collect each feature's distinct levels
# before (`orig_pre`) and after (`pre_op_df`) recoding.
# Plain `sorted()` cannot order NaN (every comparison against NaN is False), so
# level lists containing NaN came out in arbitrary order. `Series.sort_values()`
# orders the real values and places NaN last.
feature_name = []
orig_coding = []
new_coding = []

for column in compress_to_two:
    feature_name.append(column)
    orig_coding.append(orig_pre[column].drop_duplicates().sort_values().tolist())
    new_coding.append(pre_op_df[column].drop_duplicates().sort_values().tolist())

In [141]:
pd.DataFrame(list(zip(feature_name, orig_coding, new_coding)),
             columns = ['feature', 'original_levels', 'new_levels'])

Unnamed: 0,feature,original_levels,new_levels
0,diabctrl,"[1.0, nan, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]","[0.0, 1.0]"
1,infendty,"[nan, 1.0, 2.0]","[0.0, 1.0]"
2,TobaccoUse,"[nan, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]","[0.0, 1.0]"
3,chrlungd,"[1.0, 2.0, 3.0, 4.0, nan, 5.0, 6.0]","[0.0, 1.0]"
4,hmo2,"[1.0, 2.0, 3.0, 4.0, nan, 5.0]","[0.0, 1.0]"
5,ivdrugab,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","[0.0, 1.0]"
6,alcohol,"[1.0, 2.0, 3.0, nan, 4.0, 5.0]","[0.0, 1.0]"
7,cvawhen,"[nan, 1.0, 2.0, 3.0, 4.0]","[0.0, 1.0]"
8,carshock,"[1.0, 2.0, 3.0, 4.0, nan]","[0.0, 1.0]"
9,resusc,"[1.0, 2.0, 3.0, 4.0, nan]","[0.0, 1.0]"


### Recoding `recode_D` Features - Will Need to Specify a Reference Class when Create Dummies

In [142]:
recode_D

['CardSympTimeOfAdm',
 'CardSympTimeOfSurg',
 'anginalclass',
 'classnyh',
 'vdinsufm',
 'vdinsuft',
 'incidenc',
 'status']

In [143]:
# Label maps shared by feature pairs that use the same coding.
_card_symp_labels = {1: 'NONE', 2: 'ANGINA', 3: 'ANGINA', 4: 'STEMI',
                     5: 'STEMI', 6: 'NONE', 7: 'NONE', np.nan: 'NONE'}
_vdinsuf_labels = {0: 'NONE', 1: 'TRIVIAL', 2: 'MILD', 3: 'MODERATE',
                   4: 'SEVERE', 5: 'NONE', np.nan: 'NONE'}

# One numeric-level -> text-label map per feature, in the same order as `recode_D`.
# Each entry is its own dict object (the shared maps are copied, not aliased).
replacement_dicts_alpha = [
    dict(_card_symp_labels),  # CardSympTimeOfAdm
    dict(_card_symp_labels),  # CardSympTimeOfSurg
    {1: 'NONE', 2: 'SLIGHT', 3: 'SLIGHT', 4: 'REST', 5: 'REST', np.nan: 'NONE'},  # anginalclass
    {1: 'NONE', 2: 'SLIGHT', 3: 'SLIGHT', 4: 'REST', np.nan: 'NONE'},  # classnyh
    dict(_vdinsuf_labels),  # vdinsufm
    dict(_vdinsuf_labels),  # vdinsuft
    {1: 'NONE', 2: 'FIRST', 3: 'SECOND', 4: 'THIRD', 5: 'FOURTH', np.nan: 'NONE'},  # incidenc -- to be renamed `incidencREOP`
    {1: 'NONE', 2: 'URGENT', 3: 'EMERGENCY', 4: 'SALVAGE', np.nan: 'NONE'},  # status
]

- don't need to create a numeric analog to the `dictionary` above
- if you use `H2O`'s `Decision Trees` versus `Scikit-learn`, you can run categorical features with text levels without having to convert them to numeric
- use the `include_c=True` parameter per the article `Are Categorical Variables Getting Lost in Your Random Forests`
<p>&nbsp;</p>
- going to work on a subset of `pre_op_df`

In [144]:
pre_op_df.head(1)

Unnamed: 0,recordId,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,predstro,strokeBin,surgdt,dischdt,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,hmo2,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,hitanti,cigsmoker,cigsmokercurr,prcvint,prcab,prvalve,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen,arrhyafib,medasa,medaplt5days,medinotr,medlipid,numdisv,hdefd,vdaort,vdstena,vdinsufm,vdstenm,vdinsuft,incidenc,status,carshock_orig,resusc_orig
0,1,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0.014,0,2011-07-01,2011-07-06,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,1.0,1.0,0.0,0.0,0.0,,,4.0,0.0,,0.0,0.0,0.0,0.0,,,,1.0,0.0,1.0,0,0.0,1.0,1.0,1.0,1.0,0.0,4.0,0.0,2.0,1.0,1.0,2.0,2.0


In [145]:
pre_op_df.shape

(42740, 86)

In [146]:
# Select the `recode_D` columns first, then copy: copying all of `pre_op_df`
# up front duplicates every column only to discard most of them.
recode_D_df = pre_op_df[recode_D].copy()

In [147]:
recode_D_df.head()

Unnamed: 0,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,classnyh,vdinsufm,vdinsuft,incidenc,status
0,,,4.0,,4.0,2.0,1.0,1.0
1,,,5.0,,3.0,2.0,1.0,3.0
2,,,1.0,,3.0,3.0,1.0,2.0
3,,,1.0,,4.0,2.0,1.0,1.0
4,,,5.0,,,,1.0,2.0


In [148]:
recode_D_df.shape

(42740, 8)

In [149]:
name_replacement_zip = list(zip(recode_D, replacement_dicts_alpha))

In [150]:
name_replacement_zip

[('CardSympTimeOfAdm',
  {1: 'NONE',
   2: 'ANGINA',
   3: 'ANGINA',
   4: 'STEMI',
   5: 'STEMI',
   6: 'NONE',
   7: 'NONE',
   nan: 'NONE'}),
 ('CardSympTimeOfSurg',
  {1: 'NONE',
   2: 'ANGINA',
   3: 'ANGINA',
   4: 'STEMI',
   5: 'STEMI',
   6: 'NONE',
   7: 'NONE',
   nan: 'NONE'}),
 ('anginalclass',
  {1: 'NONE', 2: 'SLIGHT', 3: 'SLIGHT', 4: 'REST', 5: 'REST', nan: 'NONE'}),
 ('classnyh', {1: 'NONE', 2: 'SLIGHT', 3: 'SLIGHT', 4: 'REST', nan: 'NONE'}),
 ('vdinsufm',
  {0: 'NONE',
   1: 'TRIVIAL',
   2: 'MILD',
   3: 'MODERATE',
   4: 'SEVERE',
   5: 'NONE',
   nan: 'NONE'}),
 ('vdinsuft',
  {0: 'NONE',
   1: 'TRIVIAL',
   2: 'MILD',
   3: 'MODERATE',
   4: 'SEVERE',
   5: 'NONE',
   nan: 'NONE'}),
 ('incidenc',
  {1: 'NONE', 2: 'FIRST', 3: 'SECOND', 4: 'THIRD', 5: 'FOURTH', nan: 'NONE'}),
 ('status',
  {1: 'NONE', 2: 'URGENT', 3: 'EMERGENCY', 4: 'SALVAGE', nan: 'NONE'})]

- iterate through `name_replacement_zip` and apply `replacement_dicts_alpha` to features in `recode_D`

In [151]:
# Translate each `recode_D` feature's numeric levels (and NaN) into text labels.
for feature, label_map in name_replacement_zip:
    recode_D_df[feature] = recode_D_df[feature].replace(label_map)

- checking recoding against original in `orig_pre`

In [152]:
# Audit the recode_D recode: collect each feature's distinct levels before
# (`orig_pre`) and after (`recode_D_df`) recoding.
# Plain `sorted()` cannot order NaN (every comparison against NaN is False), so
# level lists containing NaN came out in arbitrary order. `Series.sort_values()`
# orders the real values and places NaN last.
feature_name = []
orig_coding = []
new_coding = []

for column in recode_D:
    feature_name.append(column)
    orig_coding.append(orig_pre[column].drop_duplicates().sort_values().tolist())
    new_coding.append(recode_D_df[column].drop_duplicates().sort_values().tolist())

In [153]:
pd.DataFrame(list(zip(feature_name, orig_coding, new_coding)),
             columns = ['feature', 'original_levels', 'new_levels'])

Unnamed: 0,feature,original_levels,new_levels
0,CardSympTimeOfAdm,"[nan, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]","[ANGINA, NONE, STEMI]"
1,CardSympTimeOfSurg,"[nan, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]","[ANGINA, NONE, STEMI]"
2,anginalclass,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","[NONE, REST, SLIGHT]"
3,classnyh,"[nan, 1.0, 2.0, 3.0, 4.0]","[NONE, REST, SLIGHT]"
4,vdinsufm,"[0.0, 1.0, 2.0, 3.0, 4.0, nan, 5.0]","[MILD, MODERATE, NONE, SEVERE, TRIVIAL]"
5,vdinsuft,"[2.0, 3.0, nan, 0.0, 1.0, 4.0, 5.0]","[MILD, MODERATE, NONE, SEVERE, TRIVIAL]"
6,incidenc,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","[FIRST, FOURTH, NONE, SECOND, THIRD]"
7,status,"[1.0, 2.0, 3.0, nan, 4.0]","[EMERGENCY, NONE, SALVAGE, URGENT]"


#### Creating Dummy Variables from Recoded Features
- going to use a copy to keep the recoded features for `Decision Trees`
- recoded features: `recode_D_df`
- recoded features with `dummy variables`: `recode_D_Dummies`

In [154]:
recode_D_df.head()

Unnamed: 0,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,classnyh,vdinsufm,vdinsuft,incidenc,status
0,NONE,NONE,REST,NONE,SEVERE,MILD,NONE,NONE
1,NONE,NONE,REST,NONE,MODERATE,MILD,NONE,EMERGENCY
2,NONE,NONE,NONE,NONE,MODERATE,MODERATE,NONE,URGENT
3,NONE,NONE,NONE,NONE,SEVERE,MILD,NONE,NONE
4,NONE,NONE,REST,NONE,NONE,NONE,NONE,URGENT


In [155]:
print (len(recode_D))
print (recode_D_df.shape)
print (pre_op_df.shape)

8
(42740, 8)
(42740, 86)


- renaming `incidenc` to `incidencREOP` for the `recode_D_df` for use in `Decision Trees`

In [156]:
recode_D_df = recode_D_df.rename(columns={'incidenc': 'incidencREOP'})

In [157]:
recode_D_df.head(1)

Unnamed: 0,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status
0,NONE,NONE,REST,NONE,SEVERE,MILD,NONE,NONE


#### Now creating `dummies`

- applying `pd.get_dummies()`

In [158]:
recode_D_Dummies = pd.get_dummies(recode_D_df.copy())

In [159]:
recode_D_Dummies.head()

Unnamed: 0,CardSympTimeOfAdm_ANGINA,CardSympTimeOfAdm_NONE,CardSympTimeOfAdm_STEMI,CardSympTimeOfSurg_ANGINA,CardSympTimeOfSurg_NONE,CardSympTimeOfSurg_STEMI,anginalclass_NONE,anginalclass_REST,anginalclass_SLIGHT,classnyh_NONE,classnyh_REST,classnyh_SLIGHT,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_NONE,vdinsufm_SEVERE,vdinsufm_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_NONE,vdinsuft_SEVERE,vdinsuft_TRIVIAL,incidencREOP_FIRST,incidencREOP_FOURTH,incidencREOP_NONE,incidencREOP_SECOND,incidencREOP_THIRD,status_EMERGENCY,status_NONE,status_SALVAGE,status_URGENT
0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0
1,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0
2,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1
3,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0
4,0,1,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1


In [160]:
print (recode_D_df.shape)
print (recode_D_Dummies.shape)

(42740, 8)
(42740, 31)


- now need to eliminate reference classes

In [161]:
# collect the '_NONE' dummy columns -- the reference classes that will be dropped
drop_cols = [name for name in recode_D_Dummies.columns if name.endswith('_NONE')]

In [162]:
drop_cols

['CardSympTimeOfAdm_NONE',
 'CardSympTimeOfSurg_NONE',
 'anginalclass_NONE',
 'classnyh_NONE',
 'vdinsufm_NONE',
 'vdinsuft_NONE',
 'incidencREOP_NONE',
 'status_NONE']

In [163]:
len(drop_cols)

8

- dropping the columns

In [164]:
recode_D_Dummies = recode_D_Dummies.drop(drop_cols, axis=1)

In [165]:
recode_D_Dummies.head()

Unnamed: 0,CardSympTimeOfAdm_ANGINA,CardSympTimeOfAdm_STEMI,CardSympTimeOfSurg_ANGINA,CardSympTimeOfSurg_STEMI,anginalclass_REST,anginalclass_SLIGHT,classnyh_REST,classnyh_SLIGHT,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsufm_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,vdinsuft_TRIVIAL,incidencREOP_FIRST,incidencREOP_FOURTH,incidencREOP_SECOND,incidencREOP_THIRD,status_EMERGENCY,status_SALVAGE,status_URGENT
0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [166]:
recode_D_Dummies.shape

(42740, 23)

- reordering the columns for readability

In [167]:
list(recode_D_Dummies.columns.values)

['CardSympTimeOfAdm_ANGINA',
 'CardSympTimeOfAdm_STEMI',
 'CardSympTimeOfSurg_ANGINA',
 'CardSympTimeOfSurg_STEMI',
 'anginalclass_REST',
 'anginalclass_SLIGHT',
 'classnyh_REST',
 'classnyh_SLIGHT',
 'vdinsufm_MILD',
 'vdinsufm_MODERATE',
 'vdinsufm_SEVERE',
 'vdinsufm_TRIVIAL',
 'vdinsuft_MILD',
 'vdinsuft_MODERATE',
 'vdinsuft_SEVERE',
 'vdinsuft_TRIVIAL',
 'incidencREOP_FIRST',
 'incidencREOP_FOURTH',
 'incidencREOP_SECOND',
 'incidencREOP_THIRD',
 'status_EMERGENCY',
 'status_SALVAGE',
 'status_URGENT']

In [168]:
new_col_order = ['CardSympTimeOfAdm_ANGINA',
                 'CardSympTimeOfAdm_STEMI',
                 
                 'CardSympTimeOfSurg_ANGINA',
                 'CardSympTimeOfSurg_STEMI',

                 'anginalclass_SLIGHT',
                 'anginalclass_REST',

                 'classnyh_SLIGHT',
                 'classnyh_REST',

                 'vdinsufm_TRIVIAL',
                 'vdinsufm_MILD',
                 'vdinsufm_MODERATE',
                 'vdinsufm_SEVERE',

                 'vdinsuft_TRIVIAL',
                 'vdinsuft_MILD',
                 'vdinsuft_MODERATE',
                 'vdinsuft_SEVERE',

                 'incidencREOP_FIRST',
                 'incidencREOP_SECOND',
                 'incidencREOP_THIRD',
                 'incidencREOP_FOURTH',

                 'status_URGENT',
                 'status_EMERGENCY',
                 'status_SALVAGE']

In [169]:
len(new_col_order)

23

- reordering columns
- syntax tip: to specify the column order inline instead of passing a named list, use `df[['col_a', 'col_c', 'col_b']]`

In [170]:
# `new_col_order` is typed by hand: indexing with it would silently lose any
# column that was left out, so check it lists every column exactly once first
assert sorted(new_col_order) == sorted(recode_D_Dummies.columns), \
    'new_col_order must list every recode_D_Dummies column exactly once'
recode_D_Dummies = recode_D_Dummies[new_col_order]

In [171]:
recode_D_Dummies.head()

Unnamed: 0,CardSympTimeOfAdm_ANGINA,CardSympTimeOfAdm_STEMI,CardSympTimeOfSurg_ANGINA,CardSympTimeOfSurg_STEMI,anginalclass_SLIGHT,anginalclass_REST,classnyh_SLIGHT,classnyh_REST,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE
0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [172]:
recode_D_Dummies.shape

(42740, 23)

### Recoding `recode_D_P` Features - Dropping the Reference Class when Creating Dummies

In [173]:
recode_D_P

['cvdcarsten',
 'cvdstenrt',
 'cvdstenlft',
 'ArrhythAFlutter',
 'ArrhythAFib',
 'ArrhythAFibDur',
 'arrhythwhen']

In [174]:
# One level -> label mapping per feature, listed in the same order as
# `recode_D_P` (the two lists are zipped together positionally afterwards).
# Missing values (np.nan) are always labelled 'NONE'; 'NONE' is the level
# whose dummy column (`*_NONE`) is dropped later as the reference class.
# The label strings become dummy-column names, so they must stay in sync with
# the hand-typed `new_col_order` list further down.
replacement_dicts_alpha = [{1: 'NONE', 
                            2: 'RIGHT', 
                            3: 'LEFT', 
                            4: 'BOTH',
                            np.nan: 'NONE'}, #cvdcarsten
                           
                           {1: '80-99%', 
                            2: '100%', 
                            3: '50%-79%', 
                            4: 'NONE',
                            np.nan: 'NONE'}, #cvdstenrt
                           
                           {1: '80-99%', 
                            2: '100%', 
                            3: '50%-79%', 
                            4: 'NONE',
                            np.nan: 'NONE'}, #cvdstenlft
                           
                           {1: 'NONE', 
                            2: 'REMOTE', 
                            3: 'RECENT', 
                            np.nan: 'NONE'}, #ArrhythAFlutter
                           
                           # 'CONTINOUS' (sic): the spelling is kept because the
                           # resulting column name `ArrhythAFib_CONTINOUS` is
                           # referenced verbatim when the dummies are reordered
                           {1: 'NONE', 
                            2: 'PAROXYSMAL', 
                            3: 'CONTINOUS', 
                            np.nan: 'NONE'}, #ArrhythAFib
                           
                           {1: 'SHORT', 
                            2: 'LONG', 
                            3: 'NONE', 
                            np.nan: 'NONE'}, #ArrhythAFibDur
                           
                           # NOTE(review): identical to the ArrhythAFibDur mapping.
                           # In `pre_op_df.head(1)` the first record has
                           # Arrhythmia == 0 yet arrhythwhen == 1, which this maps
                           # to 'SHORT' -- confirm these level labels against the
                           # data dictionary.
                           {1: 'SHORT', 
                            2: 'LONG', 
                            3: 'NONE', 
                            np.nan: 'NONE'}] #arrhythwhen

In [175]:
pre_op_df.head(1)

Unnamed: 0,recordId,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,predstro,strokeBin,surgdt,dischdt,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,hmo2,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,hitanti,cigsmoker,cigsmokercurr,prcvint,prcab,prvalve,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen,arrhyafib,medasa,medaplt5days,medinotr,medlipid,numdisv,hdefd,vdaort,vdstena,vdinsufm,vdstenm,vdinsuft,incidenc,status,carshock_orig,resusc_orig
0,1,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0.014,0,2011-07-01,2011-07-06,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,1.0,1.0,0.0,0.0,0.0,,,4.0,0.0,,0.0,0.0,0.0,0.0,,,,1.0,0.0,1.0,0,0.0,1.0,1.0,1.0,1.0,0.0,4.0,0.0,2.0,1.0,1.0,2.0,2.0


In [176]:
pre_op_df.shape

(42740, 86)

In [177]:
# select the `recode_D_P` features first, then copy: this duplicates only the
# selected columns (not the whole frame) and guarantees an independent frame
recode_D_P_df = pre_op_df[recode_D_P].copy()

In [178]:
recode_D_P_df.head()

Unnamed: 0,cvdcarsten,cvdstenrt,cvdstenlft,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen
0,,,,,,,1.0
1,,,,,,,3.0
2,1.0,,,,,,3.0
3,,,,,,,1.0
4,,,,,,,1.0


In [179]:
recode_D_P_df.shape

(42740, 7)

In [180]:
# pairing each feature name with its level -> label mapping (positional match)
name_replacement_zip = [(name, mapping)
                        for name, mapping in zip(recode_D_P, replacement_dicts_alpha)]

In [181]:
name_replacement_zip

[('cvdcarsten', {1: 'NONE', 2: 'RIGHT', 3: 'LEFT', 4: 'BOTH', nan: 'NONE'}),
 ('cvdstenrt', {1: '80-99%', 2: '100%', 3: '50%-79%', 4: 'NONE', nan: 'NONE'}),
 ('cvdstenlft',
  {1: '80-99%', 2: '100%', 3: '50%-79%', 4: 'NONE', nan: 'NONE'}),
 ('ArrhythAFlutter', {1: 'NONE', 2: 'REMOTE', 3: 'RECENT', nan: 'NONE'}),
 ('ArrhythAFib', {1: 'NONE', 2: 'PAROXYSMAL', 3: 'CONTINOUS', nan: 'NONE'}),
 ('ArrhythAFibDur', {1: 'SHORT', 2: 'LONG', 3: 'NONE', nan: 'NONE'}),
 ('arrhythwhen', {1: 'SHORT', 2: 'LONG', 3: 'NONE', nan: 'NONE'})]

- iterate through `name_replacement_zip` and apply `replacement_dicts_alpha` to features in `recode_D_P`

In [182]:
# recode every feature with its own mapping; levels absent from the mapping
# are left unchanged by `replace`
for column, dictionary in name_replacement_zip:
    recoded_levels = recode_D_P_df[column].replace(to_replace=dictionary)
    recode_D_P_df[column] = recoded_levels

- checking recoding against original in `orig_pre`

In [183]:
def _nan_first(level):
    '''Sort key that places NaN ahead of every other level.

    NaN does not order against other values, so a plain `sorted()` leaves it
    at an arbitrary position and the level lists are not comparable between
    features. Keying on "is not NaN" first pins NaN to the front.
    '''
    return (not pd.isna(level), level)


# one entry per recoded feature: its name, its levels in the original data
# and its levels after recoding
feature_name = list(recode_D_P)
orig_coding = [sorted(orig_pre[column].unique(), key=_nan_first)
               for column in recode_D_P]
new_coding = [sorted(recode_D_P_df[column].unique(), key=_nan_first)
              for column in recode_D_P]

In [184]:
# side-by-side view of the original and recoded levels of each feature
pd.DataFrame({'feature': feature_name,
              'original_levels': orig_coding,
              'new_levels': new_coding},
             columns=['feature', 'original_levels', 'new_levels'])

Unnamed: 0,feature,original_levels,new_levels
0,cvdcarsten,"[nan, 1.0, 2.0, 3.0, 4.0]","[BOTH, LEFT, NONE, RIGHT]"
1,cvdstenrt,"[nan, 1.0, 2.0, 3.0, 4.0]","[100%, 50%-79%, 80-99%, NONE]"
2,cvdstenlft,"[nan, 1.0, 2.0, 3.0, 4.0]","[100%, 50%-79%, 80-99%, NONE]"
3,ArrhythAFlutter,"[nan, 1.0, 2.0, 3.0]","[NONE, RECENT, REMOTE]"
4,ArrhythAFib,"[nan, 1.0, 2.0, 3.0]","[CONTINOUS, NONE, PAROXYSMAL]"
5,ArrhythAFibDur,"[nan, 1.0, 2.0, 3.0]","[LONG, NONE, SHORT]"
6,arrhythwhen,"[1.0, 2.0, 3.0, nan]","[LONG, NONE, SHORT]"


#### Creating Dummy Variables from Recoded Features
- going to use a copy to keep the recoded features for `Decision Trees`
- recoded features: `recode_D_P_df`
- recoded features with `dummy variables`: `recode_D_P_Dummies`

In [185]:
recode_D_P_df.head()

Unnamed: 0,cvdcarsten,cvdstenrt,cvdstenlft,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen
0,NONE,NONE,NONE,NONE,NONE,NONE,SHORT
1,NONE,NONE,NONE,NONE,NONE,NONE,NONE
2,NONE,NONE,NONE,NONE,NONE,NONE,NONE
3,NONE,NONE,NONE,NONE,NONE,NONE,SHORT
4,NONE,NONE,NONE,NONE,NONE,NONE,SHORT


In [186]:
print (len(recode_D_P))
print (recode_D_P_df.shape)
print (pre_op_df.shape)

7
(42740, 7)
(42740, 86)


#### Now creating `dummies`
- applying `pd.get_dummies()`

In [187]:
recode_D_P_Dummies = pd.get_dummies(recode_D_P_df.copy())

In [188]:
recode_D_P_Dummies.head()

Unnamed: 0,cvdcarsten_BOTH,cvdcarsten_LEFT,cvdcarsten_NONE,cvdcarsten_RIGHT,cvdstenrt_100%,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenrt_NONE,cvdstenlft_100%,cvdstenlft_50%-79%,cvdstenlft_80-99%,cvdstenlft_NONE,ArrhythAFlutter_NONE,ArrhythAFlutter_RECENT,ArrhythAFlutter_REMOTE,ArrhythAFib_CONTINOUS,ArrhythAFib_NONE,ArrhythAFib_PAROXYSMAL,ArrhythAFibDur_LONG,ArrhythAFibDur_NONE,ArrhythAFibDur_SHORT,arrhythwhen_LONG,arrhythwhen_NONE,arrhythwhen_SHORT
0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,1
1,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,1,0,0,1,0
2,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,1,0,0,1,0
3,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,1
4,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,1


In [189]:
recode_D_P_Dummies.shape

(42740, 24)

#### Getting Rid of Reference Classes

In [190]:
# collecting the `_NONE` (reference-class) dummy columns that will be dropped
drop_cols = list(filter(lambda name: name.endswith('_NONE'),
                        recode_D_P_Dummies.columns))

In [191]:
drop_cols

['cvdcarsten_NONE',
 'cvdstenrt_NONE',
 'cvdstenlft_NONE',
 'ArrhythAFlutter_NONE',
 'ArrhythAFib_NONE',
 'ArrhythAFibDur_NONE',
 'arrhythwhen_NONE']

In [192]:
len(drop_cols)

7

In [193]:
recode_D_P_Dummies.shape

(42740, 24)

- dropping the columns

In [194]:
# removing the reference-class columns collected in `drop_cols`
recode_D_P_Dummies = recode_D_P_Dummies.drop(columns=drop_cols)

In [195]:
recode_D_P_Dummies.head()

Unnamed: 0,cvdcarsten_BOTH,cvdcarsten_LEFT,cvdcarsten_RIGHT,cvdstenrt_100%,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenlft_100%,cvdstenlft_50%-79%,cvdstenlft_80-99%,ArrhythAFlutter_RECENT,ArrhythAFlutter_REMOTE,ArrhythAFib_CONTINOUS,ArrhythAFib_PAROXYSMAL,ArrhythAFibDur_LONG,ArrhythAFibDur_SHORT,arrhythwhen_LONG,arrhythwhen_SHORT
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [196]:
recode_D_P_Dummies.shape

(42740, 17)

- reordering columns for readability

In [197]:
recode_D_P_Dummies.columns.tolist()

['cvdcarsten_BOTH',
 'cvdcarsten_LEFT',
 'cvdcarsten_RIGHT',
 'cvdstenrt_100%',
 'cvdstenrt_50%-79%',
 'cvdstenrt_80-99%',
 'cvdstenlft_100%',
 'cvdstenlft_50%-79%',
 'cvdstenlft_80-99%',
 'ArrhythAFlutter_RECENT',
 'ArrhythAFlutter_REMOTE',
 'ArrhythAFib_CONTINOUS',
 'ArrhythAFib_PAROXYSMAL',
 'ArrhythAFibDur_LONG',
 'ArrhythAFibDur_SHORT',
 'arrhythwhen_LONG',
 'arrhythwhen_SHORT']

In [198]:
new_col_order = ['cvdcarsten_RIGHT',
                 'cvdcarsten_LEFT', 
                 'cvdcarsten_BOTH',

                 'cvdstenrt_50%-79%',
                 'cvdstenrt_80-99%',
                 'cvdstenrt_100%',

                 'cvdstenlft_50%-79%',
                 'cvdstenlft_80-99%',
                 'cvdstenlft_100%',

                 'ArrhythAFlutter_REMOTE',
                 'ArrhythAFlutter_RECENT',

                 'ArrhythAFib_PAROXYSMAL',
                 'ArrhythAFib_CONTINOUS',

                 'ArrhythAFibDur_SHORT',
                 'ArrhythAFibDur_LONG',

                 'arrhythwhen_SHORT',
                 'arrhythwhen_LONG']

In [199]:
len(new_col_order)

17

- reordering columns

In [200]:
# `new_col_order` is typed by hand: indexing with it would silently lose any
# column that was left out, so check it lists every column exactly once first
assert sorted(new_col_order) == sorted(recode_D_P_Dummies.columns), \
    'new_col_order must list every recode_D_P_Dummies column exactly once'
recode_D_P_Dummies = recode_D_P_Dummies[new_col_order]

In [201]:
recode_D_P_Dummies.head()

Unnamed: 0,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_50%-79%,cvdstenlft_80-99%,cvdstenlft_100%,ArrhythAFlutter_REMOTE,ArrhythAFlutter_RECENT,ArrhythAFib_PAROXYSMAL,ArrhythAFib_CONTINOUS,ArrhythAFibDur_SHORT,ArrhythAFibDur_LONG,arrhythwhen_SHORT,arrhythwhen_LONG
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [202]:
recode_D_P_Dummies.shape

(42740, 17)

### `datetime` Features - Decided Not to Code `year` because the dataset is being split by the `2.73` and `2.81` dates

In [203]:
date_features

['surgdt', 'dischdt']

In [204]:
# select the date columns first, then copy: this duplicates only the selected
# columns (not the whole frame) and guarantees an independent frame
dates_df = pre_op_df[date_features].copy()

In [205]:
dates_df.head()

Unnamed: 0,surgdt,dischdt
0,2011-07-01,2011-07-06
1,2011-07-02,2011-07-09
2,2011-07-04,2011-07-12
3,2011-07-05,2011-07-09
4,2011-07-06,2011-07-10


In [206]:
dates_df.shape

(42740, 2)

#### Extracting additional features from `surgdt`

In [207]:
def date_components(data, col_labels):
    '''Extract the month, day of month and weekday from a series of dates.

    Parameters
    ----------
    data : pd.Series
        Dates to decompose (datetime64 values, or anything that
        `pd.to_datetime` can convert).
    col_labels : list of str
        Three labels for the output columns, in the order
        month, day of month, weekday.

    Returns
    -------
    pd.DataFrame
        One row per input date (index preserved) holding the month (1-12),
        the day of the month (1-31) and the weekday (0 = Monday ... 6 = Sunday).
    '''
    # the vectorized `.dt` accessor avoids building one Series per row and
    # also copes with an empty input
    stamps = pd.to_datetime(data)
    dates_frame = pd.concat([stamps.dt.month, stamps.dt.day, stamps.dt.weekday],
                            axis=1)
    dates_frame.columns = col_labels

    return dates_frame

In [208]:
surgdt_col_labels = ['surgdt_month',
                     'surgdt_DayOfMonth',
                     'surgdt_DayOfWeek']

In [209]:
surgdt_features = date_components(dates_df['surgdt'], surgdt_col_labels)

In [210]:
surgdt_features.head()

Unnamed: 0,surgdt_month,surgdt_DayOfMonth,surgdt_DayOfWeek
0,7,1,4
1,7,2,5
2,7,4,0
3,7,5,1
4,7,6,2


In [211]:
surgdt_features.shape

(42740, 3)

In [212]:
weekday_dict = {0: "Mon",
                1: "Tues",
                2: "Wed",
                3: "Thurs",
                4: "Fri",
                5: "Sat",
                6: "Sun"}

- going to `dummy` code `DayOfWeek`
- replacing numerical values with text using `df.replace({'col_name': replacement_dict})`

In [213]:
surgdt_features = surgdt_features.replace({'surgdt_DayOfWeek': weekday_dict})

In [214]:
surgdt_features.head()

Unnamed: 0,surgdt_month,surgdt_DayOfMonth,surgdt_DayOfWeek
0,7,1,Fri
1,7,2,Sat
2,7,4,Mon
3,7,5,Tues
4,7,6,Wed


In [215]:
sorted(surgdt_features['surgdt_month'].unique())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [216]:
month_dict = {1: "Jan",
              2: "Feb",
              3: "Mar",
              4: "Apr",
              5: "May",
              6: "Jun",
              7: "Jul",
              8: "Aug",
              9: "Sep",
              10: "Oct",
              11: "Nov",
              12: "Dec"}

- going to `dummy` code `surgdt_month`
- replacing numerical values with text using `df.replace({'col_name': replacement_dict})`

In [217]:
surgdt_features = surgdt_features.replace({'surgdt_month': month_dict})

In [218]:
surgdt_features.head()

Unnamed: 0,surgdt_month,surgdt_DayOfMonth,surgdt_DayOfWeek
0,Jul,1,Fri
1,Jul,2,Sat
2,Jul,4,Mon
3,Jul,5,Tues
4,Jul,6,Wed


In [219]:
print (surgdt_features['surgdt_DayOfWeek'].unique())
print (surgdt_features['surgdt_month'].unique())

['Fri' 'Sat' 'Mon' 'Tues' 'Wed' 'Thurs' 'Sun']
['Jul' 'Aug' 'Sep' 'Oct' 'Nov' 'Dec' 'Jan' 'Feb' 'Mar' 'Apr' 'May' 'Jun']


- going to `bin` `surgdt_DayOfMonth`

In [220]:
# day-of-month bin edges; with the default right-closed intervals of `pd.cut`
# these give (0, 10], (10, 20] and (20, inf)
bins = [0, 10, 20, np.inf]
# labels for the three intervals, in the same order as the bins
names = ['Beg', 'Mid', 'End']

In [221]:
# binning the day of the month into the labelled parts of the month
surgdt_features['surgdt_PartOfMonth'] = pd.cut(
    x=surgdt_features['surgdt_DayOfMonth'],
    bins=bins,
    labels=names,
)

In [222]:
surgdt_features.head()

Unnamed: 0,surgdt_month,surgdt_DayOfMonth,surgdt_DayOfWeek,surgdt_PartOfMonth
0,Jul,1,Fri,Beg
1,Jul,2,Sat,Beg
2,Jul,4,Mon,Beg
3,Jul,5,Tues,Beg
4,Jul,6,Wed,Beg


In [223]:
# testing binning
surgdt_features[surgdt_features['surgdt_DayOfMonth'] == 31].head()

Unnamed: 0,surgdt_month,surgdt_DayOfMonth,surgdt_DayOfWeek,surgdt_PartOfMonth
69,Aug,31,Wed,End
135,Oct,31,Mon,End
136,Oct,31,Mon,End
210,Jan,31,Tues,End
211,Jan,31,Tues,End


In [224]:
# the raw day of the month is no longer needed once it has been binned
surgdt_features = surgdt_features.drop(columns=['surgdt_DayOfMonth'])

In [225]:
surgdt_features.head()

Unnamed: 0,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth
0,Jul,Fri,Beg
1,Jul,Sat,Beg
2,Jul,Mon,Beg
3,Jul,Tues,Beg
4,Jul,Wed,Beg


In [226]:
surgdt_features.shape

(42740, 3)

- keep `surgdt_features` for `decision trees`

In [227]:
surgdt_dummies = pd.get_dummies(surgdt_features.copy())

In [228]:
surgdt_dummies.head()

Unnamed: 0,surgdt_month_Apr,surgdt_month_Aug,surgdt_month_Dec,surgdt_month_Feb,surgdt_month_Jan,surgdt_month_Jul,surgdt_month_Jun,surgdt_month_Mar,surgdt_month_May,surgdt_month_Nov,surgdt_month_Oct,surgdt_month_Sep,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Wed,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_Mid,surgdt_PartOfMonth_End
0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0


In [229]:
surgdt_dummies.shape

(42740, 22)

- need to drop reference classes for `surgdt_month`, `surgdt_DayOfWeek` and `surgdt_PartOfMonth`
- can pick any month, day of week or Part of Month as the reference class
- chose the middle month (June), the middle day of the week (Wed) and the middle part of the month (Mid), given that most action is around the beginning and end of time periods

In [230]:
drop_cols = ['surgdt_month_Jun', 'surgdt_DayOfWeek_Wed', 'surgdt_PartOfMonth_Mid']

In [231]:
len(drop_cols)

3

In [232]:
# removing the chosen reference-class columns listed in `drop_cols`
surgdt_dummies = surgdt_dummies.drop(columns=drop_cols)

In [233]:
surgdt_dummies.head()

Unnamed: 0,surgdt_month_Apr,surgdt_month_Aug,surgdt_month_Dec,surgdt_month_Feb,surgdt_month_Jan,surgdt_month_Jul,surgdt_month_Mar,surgdt_month_May,surgdt_month_Nov,surgdt_month_Oct,surgdt_month_Sep,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Tues,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End
0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
2,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0


In [234]:
surgdt_dummies.shape

(42740, 19)

- reordering columns for readability

In [235]:
surgdt_dummies.columns.tolist()

['surgdt_month_Apr',
 'surgdt_month_Aug',
 'surgdt_month_Dec',
 'surgdt_month_Feb',
 'surgdt_month_Jan',
 'surgdt_month_Jul',
 'surgdt_month_Mar',
 'surgdt_month_May',
 'surgdt_month_Nov',
 'surgdt_month_Oct',
 'surgdt_month_Sep',
 'surgdt_DayOfWeek_Fri',
 'surgdt_DayOfWeek_Mon',
 'surgdt_DayOfWeek_Sat',
 'surgdt_DayOfWeek_Sun',
 'surgdt_DayOfWeek_Thurs',
 'surgdt_DayOfWeek_Tues',
 'surgdt_PartOfMonth_Beg',
 'surgdt_PartOfMonth_End']

In [236]:
new_col_order = ['surgdt_month_Jan',
                 'surgdt_month_Feb',
                 'surgdt_month_Mar',
                 'surgdt_month_Apr',
                 'surgdt_month_May',
                 'surgdt_month_Jul',
                 'surgdt_month_Aug',
                 'surgdt_month_Sep',
                 'surgdt_month_Oct',
                 'surgdt_month_Nov',
                 'surgdt_month_Dec',

                 'surgdt_DayOfWeek_Mon',
                 'surgdt_DayOfWeek_Tues',
                 'surgdt_DayOfWeek_Thurs',
                 'surgdt_DayOfWeek_Fri',
                 'surgdt_DayOfWeek_Sat',
                 'surgdt_DayOfWeek_Sun',

                 'surgdt_PartOfMonth_Beg',
                 'surgdt_PartOfMonth_End']

In [237]:
# `new_col_order` is typed by hand: indexing with it would silently lose any
# column that was left out, so check it lists every column exactly once first
assert sorted(new_col_order) == sorted(surgdt_dummies.columns), \
    'new_col_order must list every surgdt_dummies column exactly once'
surgdt_dummies = surgdt_dummies[new_col_order]

In [238]:
surgdt_dummies.head()

Unnamed: 0,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0
2,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0
3,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0


In [239]:
surgdt_dummies.shape

(42740, 19)

#### Need to decide if want to model excluding reference classes
- can use `df = df.drop(drop_cols, axis=1)` (pass the list itself; wrapping it in another list, `[drop_cols]`, would not work)

### Additional `datetime` features from `dischdt`

In [240]:
dates_df.head()

Unnamed: 0,surgdt,dischdt
0,2011-07-01,2011-07-06
1,2011-07-02,2011-07-09
2,2011-07-04,2011-07-12
3,2011-07-05,2011-07-09
4,2011-07-06,2011-07-10


In [241]:
dischdt_features = dates_df.copy()

In [242]:
# weekday of discharge as an integer (0 = Monday ... 6 = Sunday); the
# vectorized `.dt` accessor returns a plain Series, so no per-row Series or
# one-column DataFrame has to be built just to fill a single column
dischdt_features['dischdt_DayOfWeek'] = pd.to_datetime(dischdt_features['dischdt']).dt.weekday

In [243]:
dischdt_features.head()

Unnamed: 0,surgdt,dischdt,dischdt_DayOfWeek
0,2011-07-01,2011-07-06,2
1,2011-07-02,2011-07-09,5
2,2011-07-04,2011-07-12,1
3,2011-07-05,2011-07-09,5
4,2011-07-06,2011-07-10,6


- going to `dummy` code `DayOfWeek`
- replacing numerical values with text using `df.replace({'col_name': replacement_dict})`

In [244]:
dischdt_features = dischdt_features.replace({'dischdt_DayOfWeek': weekday_dict})

In [245]:
dischdt_features.head()

Unnamed: 0,surgdt,dischdt,dischdt_DayOfWeek
0,2011-07-01,2011-07-06,Wed
1,2011-07-02,2011-07-09,Sat
2,2011-07-04,2011-07-12,Tues
3,2011-07-05,2011-07-09,Sat
4,2011-07-06,2011-07-10,Sun


In [246]:
# time elapsed between surgery and discharge (a timedelta column)
dischdt_features['length_stay'] = dischdt_features['dischdt'].sub(dischdt_features['surgdt'])

In [247]:
dischdt_features.head()

Unnamed: 0,surgdt,dischdt,dischdt_DayOfWeek,length_stay
0,2011-07-01,2011-07-06,Wed,5 days
1,2011-07-02,2011-07-09,Sat,7 days
2,2011-07-04,2011-07-12,Tues,8 days
3,2011-07-05,2011-07-09,Sat,4 days
4,2011-07-06,2011-07-10,Sun,4 days


In [248]:
dischdt_features['length_stay'].dtypes

dtype('<m8[ns]')

In [249]:
# the raw dates are dropped now that the derived features exist
dischdt_features = dischdt_features.drop(columns=['surgdt', 'dischdt'])

In [250]:
dischdt_features.head()

Unnamed: 0,dischdt_DayOfWeek,length_stay
0,Wed,5 days
1,Sat,7 days
2,Tues,8 days
3,Sat,4 days
4,Sun,4 days


In [251]:
dischdt_features.shape

(42740, 2)

- keep `dischdt_features` for `decision trees`


#### Creating `dischdt_dummies`
- `dischdt_DayOfWeek` should be the only feature binarized by `pd.get_dummies`

In [252]:
dischdt_dummies = pd.get_dummies(dischdt_features.copy())

In [253]:
dischdt_dummies.head()

Unnamed: 0,length_stay,dischdt_DayOfWeek_Fri,dischdt_DayOfWeek_Mon,dischdt_DayOfWeek_Sat,dischdt_DayOfWeek_Sun,dischdt_DayOfWeek_Thurs,dischdt_DayOfWeek_Tues,dischdt_DayOfWeek_Wed
0,5 days,0,0,0,0,0,0,1
1,7 days,0,0,1,0,0,0,0
2,8 days,0,0,0,0,0,1,0
3,4 days,0,0,1,0,0,0,0
4,4 days,0,0,0,1,0,0,0


In [254]:
dischdt_dummies.shape

(42740, 8)

- dropping `dischdt_DayOfWeek_Wed` as the reference class

In [255]:
# Wednesday is the reference class for the discharge day of the week
dischdt_dummies = dischdt_dummies.drop(columns=['dischdt_DayOfWeek_Wed'])

In [256]:
dischdt_dummies.head()

Unnamed: 0,length_stay,dischdt_DayOfWeek_Fri,dischdt_DayOfWeek_Mon,dischdt_DayOfWeek_Sat,dischdt_DayOfWeek_Sun,dischdt_DayOfWeek_Thurs,dischdt_DayOfWeek_Tues
0,5 days,0,0,0,0,0,0
1,7 days,0,0,1,0,0,0
2,8 days,0,0,0,0,0,1
3,4 days,0,0,1,0,0,0
4,4 days,0,0,0,1,0,0


In [257]:
dischdt_dummies.shape

(42740, 7)

- reordering columns

In [258]:
dischdt_dummies.columns.tolist()

['length_stay',
 'dischdt_DayOfWeek_Fri',
 'dischdt_DayOfWeek_Mon',
 'dischdt_DayOfWeek_Sat',
 'dischdt_DayOfWeek_Sun',
 'dischdt_DayOfWeek_Thurs',
 'dischdt_DayOfWeek_Tues']

In [259]:
new_col_order = ['length_stay',
 
                 'dischdt_DayOfWeek_Mon',
                 'dischdt_DayOfWeek_Tues',
                 'dischdt_DayOfWeek_Thurs',
                 'dischdt_DayOfWeek_Fri',
                 'dischdt_DayOfWeek_Sat',
                 'dischdt_DayOfWeek_Sun']

In [260]:
# `new_col_order` is typed by hand: indexing with it would silently lose any
# column that was left out, so check it lists every column exactly once first
assert sorted(new_col_order) == sorted(dischdt_dummies.columns), \
    'new_col_order must list every dischdt_dummies column exactly once'
dischdt_dummies = dischdt_dummies[new_col_order]

In [261]:
dischdt_dummies.head(1)

Unnamed: 0,length_stay,dischdt_DayOfWeek_Mon,dischdt_DayOfWeek_Tues,dischdt_DayOfWeek_Thurs,dischdt_DayOfWeek_Fri,dischdt_DayOfWeek_Sat,dischdt_DayOfWeek_Sun
0,5 days,0,0,0,0,0,0


In [262]:
dischdt_dummies.shape

(42740, 7)

### `numerical_features`

In [263]:
numerical_features

['recordId',
 'age',
 'heightcm',
 'weightkg',
 'hct',
 'creatlst',
 'totalbumin',
 'a1clvl',
 'meldscr',
 'hdef',
 'pasys',
 'predstro',
 'strokeBin']

- going to insert `surgdt` -- purpose will be clear when we have to divide the dataset by dates

In [264]:
pre_op_df.head(1)

Unnamed: 0,recordId,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,predstro,strokeBin,surgdt,dischdt,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,hmo2,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,hitanti,cigsmoker,cigsmokercurr,prcvint,prcab,prvalve,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen,arrhyafib,medasa,medaplt5days,medinotr,medlipid,numdisv,hdefd,vdaort,vdstena,vdinsufm,vdstenm,vdinsuft,incidenc,status,carshock_orig,resusc_orig
0,1,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0.014,0,2011-07-01,2011-07-06,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,1.0,1.0,0.0,0.0,0.0,,,4.0,0.0,,0.0,0.0,0.0,0.0,,,,1.0,0.0,1.0,0,0.0,1.0,1.0,1.0,1.0,0.0,4.0,0.0,2.0,1.0,1.0,2.0,2.0


In [265]:
pre_op_df.shape

(42740, 86)

In [266]:
# `insert` mutates the list in place, so the guard keeps this cell idempotent:
# re-running it must not add 'surgdt' a second time
if 'surgdt' not in numerical_features:
    numerical_features.insert(1, 'surgdt')

In [267]:
numerical_features

['recordId',
 'surgdt',
 'age',
 'heightcm',
 'weightkg',
 'hct',
 'creatlst',
 'totalbumin',
 'a1clvl',
 'meldscr',
 'hdef',
 'pasys',
 'predstro',
 'strokeBin']

In [268]:
len(numerical_features)

14

- creating `numerical_features_df`

In [269]:
# select the numerical features first, then copy: this duplicates only the
# selected columns (not the whole frame) and guarantees an independent frame
numerical_features_df = pre_op_df[numerical_features].copy()

In [270]:
numerical_features_df.head()

Unnamed: 0,recordId,surgdt,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,predstro,strokeBin
0,1,2011-07-01,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0.014,0
1,2,2011-07-02,65,175.3,79.4,45.0,1.2,,,,55.0,40.0,0.017,0
2,3,2011-07-04,83,162.60001,102.1,29.0,1.2,3.3,6.2,8.6,60.0,36.0,0.045,0
3,4,2011-07-05,59,160.0,127.5,35.0,0.9,3.5,7.4,6.4,60.0,35.0,0.013,0
4,5,2011-07-06,72,160.0,64.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,0.016,0


In [271]:
numerical_features_df.shape

(42740, 14)

- creating `bmi` numerical feature
- BMI is weight in kilograms (`weightkg`) divided by height in meters squared (`(heightcm/100)^2`)

In [272]:
# bmi = weight (kg) / height (m) squared; height is stored in centimeters
numerical_features_df['bmi'] = numerical_features_df['weightkg'].div(
    np.power(numerical_features_df['heightcm'].div(100), 2)
)

In [273]:
numerical_features_df.head()

Unnamed: 0,recordId,surgdt,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,predstro,strokeBin,bmi
0,1,2011-07-01,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0.014,0,36.11111
1,2,2011-07-02,65,175.3,79.4,45.0,1.2,,,,55.0,40.0,0.017,0,25.83787
2,3,2011-07-04,83,162.60001,102.1,29.0,1.2,3.3,6.2,8.6,60.0,36.0,0.045,0,38.61754
3,4,2011-07-05,59,160.0,127.5,35.0,0.9,3.5,7.4,6.4,60.0,35.0,0.013,0,49.80469
4,5,2011-07-06,72,160.0,64.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,0.016,0,25.0


In [274]:
numerical_features_df.shape

(42740, 15)

- reordering columns

In [275]:
numerical_features_df.columns.tolist()

['recordId',
 'surgdt',
 'age',
 'heightcm',
 'weightkg',
 'hct',
 'creatlst',
 'totalbumin',
 'a1clvl',
 'meldscr',
 'hdef',
 'pasys',
 'predstro',
 'strokeBin',
 'bmi']

In [276]:
new_col_order = ['recordId',
                 'surgdt',
                 'age',
                 'heightcm',
                 'weightkg',
                 'bmi',
                 'hct',
                 'creatlst',
                 'totalbumin',
                 'a1clvl',
                 'meldscr',
                 'hdef',
                 'pasys',
                 'predstro',
                 'strokeBin']

In [277]:
# `new_col_order` is typed by hand: indexing with it would silently lose any
# column that was left out, so check it lists every column exactly once first
assert sorted(new_col_order) == sorted(numerical_features_df.columns), \
    'new_col_order must list every numerical_features_df column exactly once'
numerical_features_df = numerical_features_df[new_col_order]

In [278]:
numerical_features_df.head()

Unnamed: 0,recordId,surgdt,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,predstro,strokeBin
0,1,2011-07-01,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0.014,0
1,2,2011-07-02,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,0.017,0
2,3,2011-07-04,83,162.60001,102.1,38.61754,29.0,1.2,3.3,6.2,8.6,60.0,36.0,0.045,0
3,4,2011-07-05,59,160.0,127.5,49.80469,35.0,0.9,3.5,7.4,6.4,60.0,35.0,0.013,0
4,5,2011-07-06,72,160.0,64.0,25.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,0.016,0


In [279]:
numerical_features_df.shape

(42740, 15)

## Assembling the Pre-Op Feature Matrices

- `yes_no_unc_df`

In [280]:
# select the `yes_no_unc` features first, then copy: this duplicates only the
# selected columns (not the whole frame) and guarantees an independent frame
yes_no_unc_df = pre_op_df[yes_no_unc].copy()

In [281]:
yes_no_unc_df.head()

Unnamed: 0,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,cigsmoker,cigsmokercurr,prcvint,prcab,prvalve,chf,priorhf,Arrhythmia,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [282]:
print (yes_no_unc_df.shape)
print (len(yes_no_unc))

(42740, 40)
40


In [283]:
len(yes_no_unc)

40

- `compress_to_two_df`

In [284]:
# select the `compress_to_two` features first, then copy: this duplicates only
# the selected columns (not the whole frame) and guarantees an independent frame
compress_to_two_df = pre_op_df[compress_to_two].copy()

In [285]:
compress_to_two_df.head()

Unnamed: 0,diabctrl,infendty,TobaccoUse,chrlungd,hmo2,ivdrugab,alcohol,cvawhen,carshock,resusc,medasa,medaplt5days,medlipid,numdisv
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0,1.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1,1.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0


In [286]:
compress_to_two_df.shape

(42740, 14)

- need to rename `carshock` and `resusc` to `carshock24` and `resusc24`

In [287]:
# renaming the two compressed features to `carshock24` and `resusc24`
compress_to_two_df = compress_to_two_df.rename(
    columns=dict(carshock='carshock24', resusc='resusc24')
)

In [288]:
compress_to_two_df.head()

Unnamed: 0,diabctrl,infendty,TobaccoUse,chrlungd,hmo2,ivdrugab,alcohol,cvawhen,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0,1.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1,1.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0


## `pre_op_X_tree`

In [289]:
# column-wise assembly of the (non-dummy) feature frames for the tree models;
# the frames are aligned on their shared row index
pre_op_X_tree = pd.concat(
    [
        numerical_features_df,
        surgdt_features,
        dischdt_features,
        yes_no_unc_df,
        compress_to_two_df,
        recode_D_df,
        recode_D_P_df,
    ],
    axis=1,
)

In [290]:
pre_op_X_tree.head()

Unnamed: 0,recordId,surgdt,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,predstro,strokeBin,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth,dischdt_DayOfWeek,length_stay,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,cigsmoker,cigsmokercurr,prcvint,prcab,prvalve,chf,priorhf,Arrhythmia,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,TobaccoUse,chrlungd,hmo2,ivdrugab,alcohol,cvawhen,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status,cvdcarsten,cvdstenrt,cvdstenlft,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen
0,1,2011-07-01,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0.014,0,Jul,Fri,Beg,Wed,5 days,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0,1.0,1.0,NONE,NONE,REST,NONE,SEVERE,MILD,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,SHORT
1,2,2011-07-02,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,0.017,0,Jul,Sat,Beg,Sat,7 days,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,1.0,NONE,NONE,REST,NONE,MODERATE,MILD,NONE,EMERGENCY,NONE,NONE,NONE,NONE,NONE,NONE,NONE
2,3,2011-07-04,83,162.60001,102.1,38.61754,29.0,1.2,3.3,6.2,8.6,60.0,36.0,0.045,0,Jul,Mon,Beg,Tues,8 days,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1,1.0,1.0,NONE,NONE,NONE,NONE,MODERATE,MODERATE,NONE,URGENT,NONE,NONE,NONE,NONE,NONE,NONE,NONE
3,4,2011-07-05,59,160.0,127.5,49.80469,35.0,0.9,3.5,7.4,6.4,60.0,35.0,0.013,0,Jul,Tues,Beg,Sat,4 days,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1.0,NONE,NONE,NONE,NONE,SEVERE,MILD,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,SHORT
4,5,2011-07-06,72,160.0,64.0,25.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,0.016,0,Jul,Wed,Beg,Sun,4 days,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0,NONE,NONE,REST,NONE,NONE,NONE,NONE,URGENT,NONE,NONE,NONE,NONE,NONE,NONE,SHORT


In [291]:
pre_op_X_tree.shape

(42740, 89)

- reordering columns

In [292]:
pre_op_X_tree.columns.tolist()

['recordId',
 'surgdt',
 'age',
 'heightcm',
 'weightkg',
 'bmi',
 'hct',
 'creatlst',
 'totalbumin',
 'a1clvl',
 'meldscr',
 'hdef',
 'pasys',
 'predstro',
 'strokeBin',
 'surgdt_month',
 'surgdt_DayOfWeek',
 'surgdt_PartOfMonth',
 'dischdt_DayOfWeek',
 'length_stay',
 'gender',
 'racecaucasian',
 'raceblack',
 'raceasian',
 'racenativeam',
 'racnativepacific',
 'ethnicity',
 'diabetes',
 'dyslip',
 'dialysis',
 'hypertn',
 'infendo',
 'slpapn',
 'liverdis',
 'immsupp',
 'mediastrad',
 'cancer',
 'pvd',
 'ThAoDisease',
 'syncope',
 'unrespstat',
 'cvd',
 'cva',
 'cvdtia',
 'cvdpcarsurg',
 'hitanti',
 'cigsmoker',
 'cigsmokercurr',
 'prcvint',
 'prcab',
 'prvalve',
 'chf',
 'priorhf',
 'Arrhythmia',
 'arrhyafib',
 'medinotr',
 'hdefd',
 'vdaort',
 'vdstena',
 'vdstenm',
 'diabctrl',
 'infendty',
 'TobaccoUse',
 'chrlungd',
 'hmo2',
 'ivdrugab',
 'alcohol',
 'cvawhen',
 'carshock24',
 'resusc24',
 'medasa',
 'medaplt5days',
 'medlipid',
 'numdisv',
 'CardSympTimeOfAdm',
 'CardSympTimeOf

In [293]:
# Target column order for the tree-model frame (89 names). Blank lines and
# comments only mark the visual grouping; the list itself is flat.
new_col_order = [
    # identifiers, numeric measurements, then `predstro` and `strokeBin`
    'recordId', 'surgdt',
    'age', 'heightcm', 'weightkg', 'bmi', 'hct', 'creatlst', 'totalbumin',
    'a1clvl', 'meldscr', 'hdef', 'pasys',
    'predstro', 'strokeBin',

    # surgery / discharge date columns and length of stay
    'surgdt_month', 'surgdt_DayOfWeek', 'surgdt_PartOfMonth',
    'dischdt_DayOfWeek', 'length_stay',

    # gender, race* and ethnicity
    'gender', 'racecaucasian', 'raceblack', 'raceasian', 'racenativeam',
    'racnativepacific', 'ethnicity',

    # `diabctrl` and `infendty` are kept next to `diabetes` and `infendo`
    'diabetes', 'diabctrl',
    'dyslip', 'dialysis', 'hypertn',
    'infendo', 'infendty',
    'slpapn', 'liverdis', 'immsupp', 'mediastrad', 'cancer', 'pvd',
    'ThAoDisease', 'syncope', 'unrespstat', 'hitanti',

    # tobacco / smoking columns and `chrlungd`
    'TobaccoUse', 'cigsmoker', 'cigsmokercurr', 'chrlungd',

    'prcvint', 'prcab', 'prvalve', 'chf', 'priorhf',
    'medinotr', 'hdefd', 'vdaort', 'vdstena', 'vdstenm',

    'hmo2', 'ivdrugab', 'alcohol', 'carshock24', 'resusc24',
    'medasa', 'medaplt5days', 'medlipid', 'numdisv',
    'CardSympTimeOfAdm', 'CardSympTimeOfSurg', 'anginalclass', 'classnyh',
    'vdinsufm', 'vdinsuft', 'incidencREOP', 'status',

    # cvd* / cva* columns
    'cvd', 'cva', 'cvawhen', 'cvdtia', 'cvdpcarsurg',
    'cvdcarsten', 'cvdstenrt', 'cvdstenlft',

    # arrhythmia columns
    'Arrhythmia', 'arrhyafib', 'ArrhythAFlutter', 'ArrhythAFib',
    'ArrhythAFibDur', 'arrhythwhen',
]

In [294]:
len(new_col_order)

89

In [295]:
# Reorder the tree-model frame's columns to the layout in `new_col_order`.
# Selecting with a list silently drops any column the list omits and duplicates
# any column it repeats; a length check alone cannot catch one omission paired
# with one repeat. Require an exact permutation of the current columns first.
# This also fails loudly if `new_col_order` currently holds a different frame's list.
assert sorted(new_col_order) == sorted(pre_op_X_tree.columns), (
    "new_col_order must list every pre_op_X_tree column exactly once"
)
pre_op_X_tree = pre_op_X_tree[new_col_order]

In [296]:
pre_op_X_tree.head()

Unnamed: 0,recordId,surgdt,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,predstro,strokeBin,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth,dischdt_DayOfWeek,length_stay,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,hitanti,TobaccoUse,cigsmoker,cigsmokercurr,chrlungd,prcvint,prcab,prvalve,chf,priorhf,medinotr,hdefd,vdaort,vdstena,vdstenm,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status,cvd,cva,cvawhen,cvdtia,cvdpcarsurg,cvdcarsten,cvdstenrt,cvdstenlft,Arrhythmia,arrhyafib,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen
0,1,2011-07-01,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0.014,0,Jul,Fri,Beg,Wed,5 days,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1.0,1.0,NONE,NONE,REST,NONE,SEVERE,MILD,NONE,NONE,0.0,0.0,0.0,0.0,0.0,NONE,NONE,NONE,0.0,0.0,NONE,NONE,NONE,SHORT
1,2,2011-07-02,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,0.017,0,Jul,Sat,Beg,Sat,7 days,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,1.0,NONE,NONE,REST,NONE,MODERATE,MILD,NONE,EMERGENCY,0.0,0.0,0.0,0.0,0.0,NONE,NONE,NONE,0.0,0.0,NONE,NONE,NONE,NONE
2,3,2011-07-04,83,162.60001,102.1,38.61754,29.0,1.2,3.3,6.2,8.6,60.0,36.0,0.045,0,Jul,Mon,Beg,Tues,8 days,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1.0,1.0,NONE,NONE,NONE,NONE,MODERATE,MODERATE,NONE,URGENT,1.0,1.0,1.0,1.0,0.0,NONE,NONE,NONE,0.0,0.0,NONE,NONE,NONE,NONE
3,4,2011-07-05,59,160.0,127.5,49.80469,35.0,0.9,3.5,7.4,6.4,60.0,35.0,0.013,0,Jul,Tues,Beg,Sat,4 days,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1.0,NONE,NONE,NONE,NONE,SEVERE,MILD,NONE,NONE,0.0,0.0,0.0,0.0,0.0,NONE,NONE,NONE,0.0,0.0,NONE,NONE,NONE,SHORT
4,5,2011-07-06,72,160.0,64.0,25.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,0.016,0,Jul,Wed,Beg,Sun,4 days,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0,NONE,NONE,REST,NONE,NONE,NONE,NONE,URGENT,0.0,0.0,0.0,0.0,0.0,NONE,NONE,NONE,0.0,0.0,NONE,NONE,NONE,SHORT


In [297]:
pre_op_X_tree.shape

(42740, 89)

## `pre_op_X`

In [298]:
# Build the feature frame for the non-tree models: same numeric and yes/no
# groups as the tree frame, but with the dummy-encoded date and recoded frames
# (`*_dummies`, `recode_D_Dummies`, `recode_D_P_Dummies`).
# Concatenating along columns aligns rows on the index.
pre_op_X = pd.concat(
    [
        numerical_features_df,
        surgdt_dummies,
        dischdt_dummies,
        yes_no_unc_df,
        compress_to_two_df,
        recode_D_Dummies,
        recode_D_P_Dummies,
    ],
    axis='columns',
)

In [299]:
pre_op_X.head()

Unnamed: 0,recordId,surgdt,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,predstro,strokeBin,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,length_stay,dischdt_DayOfWeek_Mon,dischdt_DayOfWeek_Tues,dischdt_DayOfWeek_Thurs,dischdt_DayOfWeek_Fri,dischdt_DayOfWeek_Sat,dischdt_DayOfWeek_Sun,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,cigsmoker,cigsmokercurr,prcvint,prcab,prvalve,chf,priorhf,Arrhythmia,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,TobaccoUse,chrlungd,hmo2,ivdrugab,alcohol,cvawhen,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,CardSympTimeOfAdm_ANGINA,CardSympTimeOfAdm_STEMI,CardSympTimeOfSurg_ANGINA,CardSympTimeOfSurg_STEMI,anginalclass_SLIGHT,anginalclass_REST,classnyh_SLIGHT,classnyh_REST,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_50%-79%,cvdstenlft_80-99%,cvdstenlft_100%,ArrhythAFlutter_REMOTE,ArrhythAFlutter_RECENT,ArrhythAFib_PAROXYSMAL,ArrhythAFib_CONTINOUS,ArrhythAFibDur_SHORT,ArrhythAFibDur_LONG,arrhythwhen_SHORT,arrhythwhen_LONG
0,1,2011-07-01,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0.014,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,5 days,0,0,0,0,0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0,1.0,1.0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,2,2011-07-02,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,0.017,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,7 days,0,0,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,1.0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,2011-07-04,83,162.60001,102.1,38.61754,29.0,1.2,3.3,6.2,8.6,60.0,36.0,0.045,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,8 days,0,1,0,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,2011-07-05,59,160.0,127.5,49.80469,35.0,0.9,3.5,7.4,6.4,60.0,35.0,0.013,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,4 days,0,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,5,2011-07-06,72,160.0,64.0,25.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,0.016,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,4 days,0,0,0,0,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [300]:
pre_op_X.shape

(42740, 135)

- reordering columns

In [301]:
pre_op_X.columns.tolist()

['recordId',
 'surgdt',
 'age',
 'heightcm',
 'weightkg',
 'bmi',
 'hct',
 'creatlst',
 'totalbumin',
 'a1clvl',
 'meldscr',
 'hdef',
 'pasys',
 'predstro',
 'strokeBin',
 'surgdt_month_Jan',
 'surgdt_month_Feb',
 'surgdt_month_Mar',
 'surgdt_month_Apr',
 'surgdt_month_May',
 'surgdt_month_Jul',
 'surgdt_month_Aug',
 'surgdt_month_Sep',
 'surgdt_month_Oct',
 'surgdt_month_Nov',
 'surgdt_month_Dec',
 'surgdt_DayOfWeek_Mon',
 'surgdt_DayOfWeek_Tues',
 'surgdt_DayOfWeek_Thurs',
 'surgdt_DayOfWeek_Fri',
 'surgdt_DayOfWeek_Sat',
 'surgdt_DayOfWeek_Sun',
 'surgdt_PartOfMonth_Beg',
 'surgdt_PartOfMonth_End',
 'length_stay',
 'dischdt_DayOfWeek_Mon',
 'dischdt_DayOfWeek_Tues',
 'dischdt_DayOfWeek_Thurs',
 'dischdt_DayOfWeek_Fri',
 'dischdt_DayOfWeek_Sat',
 'dischdt_DayOfWeek_Sun',
 'gender',
 'racecaucasian',
 'raceblack',
 'raceasian',
 'racenativeam',
 'racnativepacific',
 'ethnicity',
 'diabetes',
 'dyslip',
 'dialysis',
 'hypertn',
 'infendo',
 'slpapn',
 'liverdis',
 'immsupp',
 'mediastr

In [302]:
# Target column order for the dummy-encoded frame (135 names). Blank lines and
# comments only mark the visual grouping; the list itself is flat.
new_col_order = [
    # identifiers, numeric measurements, then `predstro` and `strokeBin`
    'recordId', 'surgdt',
    'age', 'heightcm', 'weightkg', 'bmi', 'hct', 'creatlst', 'totalbumin',
    'a1clvl', 'meldscr', 'hdef', 'pasys',
    'predstro', 'strokeBin',

    # surgery-date dummies: month, day of week, part of month
    'surgdt_month_Jan', 'surgdt_month_Feb', 'surgdt_month_Mar',
    'surgdt_month_Apr', 'surgdt_month_May', 'surgdt_month_Jul',
    'surgdt_month_Aug', 'surgdt_month_Sep', 'surgdt_month_Oct',
    'surgdt_month_Nov', 'surgdt_month_Dec',
    'surgdt_DayOfWeek_Mon', 'surgdt_DayOfWeek_Tues', 'surgdt_DayOfWeek_Thurs',
    'surgdt_DayOfWeek_Fri', 'surgdt_DayOfWeek_Sat', 'surgdt_DayOfWeek_Sun',
    'surgdt_PartOfMonth_Beg', 'surgdt_PartOfMonth_End',

    # length of stay, then discharge day-of-week dummies
    'length_stay',
    'dischdt_DayOfWeek_Mon', 'dischdt_DayOfWeek_Tues', 'dischdt_DayOfWeek_Thurs',
    'dischdt_DayOfWeek_Fri', 'dischdt_DayOfWeek_Sat', 'dischdt_DayOfWeek_Sun',

    # gender, race* and ethnicity
    'gender', 'racecaucasian', 'raceblack', 'raceasian', 'racenativeam',
    'racnativepacific', 'ethnicity',

    # `diabctrl` and `infendty` are kept next to `diabetes` and `infendo`
    'diabetes', 'diabctrl',
    'dyslip', 'dialysis', 'hypertn',
    'infendo', 'infendty',
    'slpapn', 'liverdis', 'immsupp', 'mediastrad', 'cancer', 'pvd',
    'ThAoDisease', 'syncope', 'unrespstat', 'hitanti',

    # tobacco / smoking columns and `chrlungd`
    'TobaccoUse', 'cigsmoker', 'cigsmokercurr', 'chrlungd',

    'prcvint', 'prcab', 'prvalve', 'chf', 'priorhf',
    'medinotr', 'hdefd', 'vdaort', 'vdstena', 'vdstenm',

    # NOTE(review): the tree-frame ordering places 'cvawhen' right after 'cva';
    # here it stays in this group - confirm whether the difference is intended.
    'hmo2', 'ivdrugab', 'alcohol', 'cvawhen', 'carshock24', 'resusc24',
    'medasa', 'medaplt5days', 'medlipid', 'numdisv',

    # dummies of the recoded categorical columns
    'CardSympTimeOfAdm_ANGINA', 'CardSympTimeOfAdm_STEMI',
    'CardSympTimeOfSurg_ANGINA', 'CardSympTimeOfSurg_STEMI',
    'anginalclass_SLIGHT', 'anginalclass_REST',
    'classnyh_SLIGHT', 'classnyh_REST',
    'vdinsufm_TRIVIAL', 'vdinsufm_MILD', 'vdinsufm_MODERATE', 'vdinsufm_SEVERE',
    'vdinsuft_TRIVIAL', 'vdinsuft_MILD', 'vdinsuft_MODERATE', 'vdinsuft_SEVERE',
    'incidencREOP_FIRST', 'incidencREOP_SECOND', 'incidencREOP_THIRD',
    'incidencREOP_FOURTH',
    'status_URGENT', 'status_EMERGENCY', 'status_SALVAGE',

    # cvd* / cva columns and their dummies
    'cvd', 'cva', 'cvdtia', 'cvdpcarsurg',
    'cvdcarsten_RIGHT', 'cvdcarsten_LEFT', 'cvdcarsten_BOTH',
    'cvdstenrt_50%-79%', 'cvdstenrt_80-99%', 'cvdstenrt_100%',
    'cvdstenlft_50%-79%', 'cvdstenlft_80-99%', 'cvdstenlft_100%',

    # arrhythmia columns and their dummies
    'Arrhythmia', 'arrhyafib',
    'ArrhythAFlutter_REMOTE', 'ArrhythAFlutter_RECENT',
    'ArrhythAFib_PAROXYSMAL', 'ArrhythAFib_CONTINOUS',
    'ArrhythAFibDur_SHORT', 'ArrhythAFibDur_LONG',
    'arrhythwhen_SHORT', 'arrhythwhen_LONG',
]

In [303]:
len(new_col_order)

135

In [304]:
# Reorder the dummy-encoded frame's columns to the layout in `new_col_order`.
# Selecting with a list silently drops any column the list omits and duplicates
# any column it repeats; a length check alone cannot catch one omission paired
# with one repeat. Require an exact permutation of the current columns first.
# This also fails loudly if `new_col_order` currently holds a different frame's list.
assert sorted(new_col_order) == sorted(pre_op_X.columns), (
    "new_col_order must list every pre_op_X column exactly once"
)
pre_op_X = pre_op_X[new_col_order]

In [305]:
# Show the first row of `pre_op_df`.
# NOTE(review): this previews `pre_op_df`, not the `pre_op_X` frame that was
# reordered in the preceding cell - confirm that this is the intended frame.
pre_op_df.head(1)

Unnamed: 0,recordId,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,predstro,strokeBin,surgdt,dischdt,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,hmo2,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,hitanti,cigsmoker,cigsmokercurr,prcvint,prcab,prvalve,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen,arrhyafib,medasa,medaplt5days,medinotr,medlipid,numdisv,hdefd,vdaort,vdstena,vdinsufm,vdstenm,vdinsuft,incidenc,status,carshock_orig,resusc_orig
0,1,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0.014,0,2011-07-01,2011-07-06,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,1.0,1.0,0.0,0.0,0.0,,,4.0,0.0,,0.0,0.0,0.0,0.0,,,,1.0,0.0,1.0,0,0.0,1.0,1.0,1.0,1.0,0.0,4.0,0.0,2.0,1.0,1.0,2.0,2.0


### Open Items
- need to split `pre_op_X` and `pre_op_X_tree` by dates in databases 2.73 (2011 - 2014) and 2.81 (2014 - 2016)
- confirm the cut-over date between the 2.73 and 2.81 databases (the split below uses `2014-01-01`)
- drop `surgdt` and `recordId` from all subsets before modelling
- drop any columns that only apply to one of the databases - confirm features
- code to drop columns from a dataframe `df = df.drop(['colA', 'colB'], axis=1)`
- pickle the resulting dataframes for fast reuse
- outcome variable `y`

### Splitting `pre_op_X` and `pre_op_X_tree` by Date

#### `pre_op_X`

In [306]:
pre_op_X.shape

(42740, 135)

- `A` corresponds to the 2.73 database and `B` corresponds to the 2.81 database

In [307]:
# Subset A: rows whose surgery date falls strictly before the 2014-01-01 cutoff.
pre_op_X_A = pre_op_X.loc[pre_op_X['surgdt'] < '2014-01-01']

In [308]:
pre_op_X_A.shape

(19756, 135)

In [309]:
# Earliest and latest surgery date in subset A, one per line.
print(pre_op_X_A['surgdt'].min(), pre_op_X_A['surgdt'].max(), sep='\n')

2011-07-01 00:00:00
2013-12-31 00:00:00


In [310]:
# Subset B: rows whose surgery date is on or after the 2014-01-01 cutoff
# (the complement of subset A's strict `<` condition).
pre_op_X_B = pre_op_X.loc[pre_op_X['surgdt'] >= '2014-01-01']

In [311]:
pre_op_X_B.shape

(22984, 135)

In [312]:
# Earliest and latest surgery date in subset B, one per line.
print(pre_op_X_B['surgdt'].min(), pre_op_X_B['surgdt'].max(), sep='\n')

2014-01-01 00:00:00
2016-12-31 00:00:00


In [313]:
# Row counts of A and B combined, then of the unsplit frame; the two printed
# numbers should match if the split lost no rows.
print(len(pre_op_X_A.index) + len(pre_op_X_B.index), len(pre_op_X.index), sep='\n')

42740
42740


#### `pre_op_X_tree`

In [314]:
pre_op_X_tree.shape

(42740, 89)

- `A` corresponds to the 2.73 database and `B` corresponds to the 2.81 database

In [315]:
# Subset A of the tree frame: surgery date strictly before the 2014-01-01 cutoff.
pre_op_X_tree_A = pre_op_X_tree.loc[pre_op_X_tree['surgdt'] < '2014-01-01']

In [316]:
pre_op_X_tree_A.shape

(19756, 89)

In [317]:
# Earliest and latest surgery date in tree subset A, one per line.
print(pre_op_X_tree_A['surgdt'].min(), pre_op_X_tree_A['surgdt'].max(), sep='\n')

2011-07-01 00:00:00
2013-12-31 00:00:00


In [318]:
# Subset B of the tree frame: surgery date on or after the 2014-01-01 cutoff
# (the complement of subset A's strict `<` condition).
pre_op_X_tree_B = pre_op_X_tree.loc[pre_op_X_tree['surgdt'] >= '2014-01-01']

In [319]:
pre_op_X_tree_B.shape

(22984, 89)

In [320]:
# Earliest and latest surgery date in tree subset B, one per line.
print(pre_op_X_tree_B['surgdt'].min(), pre_op_X_tree_B['surgdt'].max(), sep='\n')

2014-01-01 00:00:00
2016-12-31 00:00:00


In [321]:
# Row counts of tree subsets A and B combined, then of the unsplit tree frame;
# the two printed numbers should match if the split lost no rows.
# The reference count is taken from pre_op_X_tree, the frame that was actually
# split, rather than from the separate pre_op_X frame.
print (pre_op_X_tree_A.shape[0] + pre_op_X_tree_B.shape[0])
print (pre_op_X_tree.shape[0])

42740
42740


### Carving Out STS Model Predictions for Evaluation

In [322]:
pre_op_X_A.head(1)

Unnamed: 0,recordId,surgdt,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,predstro,strokeBin,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,length_stay,dischdt_DayOfWeek_Mon,dischdt_DayOfWeek_Tues,dischdt_DayOfWeek_Thurs,dischdt_DayOfWeek_Fri,dischdt_DayOfWeek_Sat,dischdt_DayOfWeek_Sun,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,hitanti,TobaccoUse,cigsmoker,cigsmokercurr,chrlungd,prcvint,prcab,prvalve,chf,priorhf,medinotr,hdefd,vdaort,vdstena,vdstenm,hmo2,ivdrugab,alcohol,cvawhen,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,CardSympTimeOfAdm_ANGINA,CardSympTimeOfAdm_STEMI,CardSympTimeOfSurg_ANGINA,CardSympTimeOfSurg_STEMI,anginalclass_SLIGHT,anginalclass_REST,classnyh_SLIGHT,classnyh_REST,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvd,cva,cvdtia,cvdpcarsurg,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_50%-79%,cvdstenlft_80-99%,cvdstenlft_100%,Arrhythmia,arrhyafib,ArrhythAFlutter_REMOTE,ArrhythAFlutter_RECENT,ArrhythAFib_PAROXYSMAL,ArrhythAFib_CONTINOUS,ArrhythAFibDur_SHORT,ArrhythAFibDur_LONG,arrhythwhen_SHORT,arrhythwhen_LONG
0,1,2011-07-01,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0.014,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,5 days,0,0,0,0,0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0,1.0,1.0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,1,0


In [323]:
pre_op_X_B.head(1)

Unnamed: 0,recordId,surgdt,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,predstro,strokeBin,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,length_stay,dischdt_DayOfWeek_Mon,dischdt_DayOfWeek_Tues,dischdt_DayOfWeek_Thurs,dischdt_DayOfWeek_Fri,dischdt_DayOfWeek_Sat,dischdt_DayOfWeek_Sun,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,hitanti,TobaccoUse,cigsmoker,cigsmokercurr,chrlungd,prcvint,prcab,prvalve,chf,priorhf,medinotr,hdefd,vdaort,vdstena,vdstenm,hmo2,ivdrugab,alcohol,cvawhen,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,CardSympTimeOfAdm_ANGINA,CardSympTimeOfAdm_STEMI,CardSympTimeOfSurg_ANGINA,CardSympTimeOfSurg_STEMI,anginalclass_SLIGHT,anginalclass_REST,classnyh_SLIGHT,classnyh_REST,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvd,cva,cvdtia,cvdpcarsurg,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_50%-79%,cvdstenlft_80-99%,cvdstenlft_100%,Arrhythmia,arrhyafib,ArrhythAFlutter_REMOTE,ArrhythAFlutter_RECENT,ArrhythAFib_PAROXYSMAL,ArrhythAFib_CONTINOUS,ArrhythAFibDur_SHORT,ArrhythAFibDur_LONG,arrhythwhen_SHORT,arrhythwhen_LONG
945,946,2014-01-02,66,175.3,85.0,27.66019,31.0,0.7,4.0,5.3,7.9,55.0,19.2,0.016,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,5 days,0,1,0,0,0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,1,0


In [324]:
# Columns carved out for evaluating the STS model.
# NOTE(review): presumably `predstro` is the STS-predicted stroke risk and
# `strokeBin` the observed binary outcome - confirm against the data dictionary.
sts_eval_features = [
    'predstro',
    'strokeBin',
]

#### STS for `2.73`

In [325]:
# STS evaluation columns for subset A, as an independent frame.
# Select the two columns first and copy the result: same values and index as
# before, without deep-copying the entire wide frame only to keep two columns.
sts_model_preds_A = pre_op_X_A[sts_eval_features].copy()

In [326]:
sts_model_preds_A.head(2)

Unnamed: 0,predstro,strokeBin
0,0.014,0
1,0.017,0


In [327]:
# Shapes of the carved-out evaluation frame and of its source frame, one per
# line; the row counts should agree.
print(sts_model_preds_A.shape, pre_op_X_A.shape, sep='\n')

(19756, 2)
(19756, 135)


#### STS for `2.81`

In [328]:
# STS evaluation columns for subset B, as an independent frame.
# Select the two columns first and copy the result: same values and index as
# before, without deep-copying the entire wide frame only to keep two columns.
sts_model_preds_B = pre_op_X_B[sts_eval_features].copy()

In [329]:
sts_model_preds_B.head(2)

Unnamed: 0,predstro,strokeBin
945,0.016,0
946,0.024,0


In [330]:
# Shapes of the carved-out evaluation frame and of its source frame, one per
# line; the row counts should agree.
print(sts_model_preds_B.shape, pre_op_X_B.shape, sep='\n')

(22984, 2)
(22984, 135)


### Pickling Final Files - `DREF` denotes Drop Reference Class

#### For `decision trees`

`pre_op_X_tree`, `pre_op_X_tree_A` and  `pre_op_X_tree_B`

In [331]:
# pre_op_X_tree.to_pickle('pre_op_features_tree_DREF.pkl')

In [332]:
# pre_op_X_tree_A.to_pickle('pre_op_features_tree_A_DREF.pkl')

In [333]:
# pre_op_X_tree_B.to_pickle('pre_op_features_tree_B_DREF.pkl')

#### For All Other Models

`pre_op_X`, `pre_op_X_A` and  `pre_op_X_B`

In [334]:
# pre_op_X.to_pickle('pre_op_features_DREF.pkl')

In [335]:
# pre_op_X_A.to_pickle('pre_op_features_A_DREF.pkl')

In [336]:
# pre_op_X_B.to_pickle('pre_op_features_B_DREF.pkl')

#### For STS Model Evaluation

In [337]:
# sts_model_preds_A.to_pickle('sts_model_evaldata_A.pkl')

In [338]:
# sts_model_preds_B.to_pickle('sts_model_evaldata_B.pkl')