## Capstone Project

### Pre-Operating Features Cleaning and Encoding

#### Importing Libraries

In [12]:
%matplotlib inline

# general libraries
import re
import string
import sys
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing date libraries
import datetime as dt
import dateutil.parser as dparser

# scikit-learn libraries for preprocessing
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# scikit-learn libraries for constructing pipelines
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# scikit-learn libraries for clustering and dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.mixture import GaussianMixture

# scikit-learn libraries for evaluation
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# saving models
import pickle
from sklearn.externals import joblib

# setting pandas display options
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 10000)
pd.set_option('display.max_colwidth', 100)
pd.set_option('precision', 5)
pd.options.mode.chained_assignment = None

#for train-dev-test splitting
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler 
#https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.RandomOverSampler.html

#### Directory/File Structure

In [2]:
sys.version

'3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) \n[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]'

In [3]:
print ('Running pandas version:', pd.__version__)
print ('Running numpy version:', np.__version__)
print ('Running sklearn version:', sklearn.__version__)

Running pandas version: 0.23.4
Running numpy version: 1.14.2
Running sklearn version: 0.20.2


In [4]:
os.getcwd()

'/Users/nate_velarde/Documents/UC_Berkeley/Courses/W210_Capstone/stroke_project/sandbox/notebooks'

In [5]:
os.chdir('../data')

In [6]:
os.getcwd()

'/Users/nate_velarde/Documents/UC_Berkeley/Courses/W210_Capstone/stroke_project/sandbox/data'

In [7]:
sorted(os.listdir())

['.DS_Store',
 'Capstone - Complication list - complete.xlsx',
 'Capstone - STS risk factor list.xlsx',
 'Capstone_Fall_Shannon_Sept2019_request.csv',
 'capstone_STS_risk_factor_features.xlsx',
 'capstone_cleaned_data.csv',
 'capstone_data-version-2.xlsx',
 'capstone_data.xlsx',
 'capstone_data_binarized_outcome.pkl',
 'capstone_data_binarized_outcome.xlsx',
 'capstone_data_filled_in_complication_data.xlsx',
 'capstone_data_key_variable_nulls_cleaned.pkl',
 'capstone_data_key_variable_nulls_cleaned.xlsx',
 'pre_op_features.pkl',
 'pre_op_features_A.pkl',
 'pre_op_features_B.pkl',
 'pre_op_features_tree.pkl',
 'pre_op_features_tree_A.pkl',
 'pre_op_features_tree_B.pkl']

#### Loading Dataset
- takes about 90 seconds when using `pd.read_excel`

In [438]:
raw_data = pd.read_excel('capstone_data_binarized_outcome.xlsx')

FileNotFoundError: [Errno 2] No such file or directory: 'capstone_data_binarized_outcome.xlsx'

#### Pickling the file for faster access - saving `raw_data` as a `.pkl` File
- `pd.read_pickle("Filename.pkl")`

In [4]:
raw_data.to_pickle('../data/capstone_data_binarized_outcome.pkl')

In [439]:
raw_data = pd.read_pickle('../data/capstone_data_binarized_outcome.pkl')

In [440]:
raw_data.head()

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStInn,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0
1,2,65,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-02,2011-07-09,175.3,79.4,2.0,,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,45.0,1.2,,,3.0,1.0,,2.0,,,,,1.0,2.0,2.0,,,,,,,,,,,,2.0,,,5.0,2.0,,2.0,2.0,2.0,,,,,,,,3.0,2.0,,2.0,1.0,2,,1.0,2.0,2,2.0,,,,,,,,,,1.0,2.0,2.0,2.0,4.0,,,,1.0,55.0,,44.0,32.0,1.0,40.0,1.0,2.0,,,,,,,,,,,3.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,10.0,,,,,,,10.0,,,,,,,1.0,3.0,,1.0,2.0,,1,2,1.0,2.0,2.0,2011-07-02,2011-07-03,,,,,32.0,,29.0,,3,,,2.0,1.0,2.0,,2.0,70.0,2.0,,,,,,2.0,40.0,2.0,1.0,2.0,2.0,2.0,,,,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,4.0,1.0,,,,,,,,,,,,,,,,,,,1.0,1.0,3.0,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,1.0,2.0,,,,,1.1,,,2.0,,2.0,2.0,2.0,1.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2.0,,NaT,,,,,,0.017,0.069,0
2,3,83,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-04,2011-07-12,162.60001,102.1,2.0,,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,,1.0,2.0,1.0,1.0,2.0,1.0,1.0,,,2.0,,29.0,1.2,3.3,6.2,3.0,1.0,8.6,2.0,,,,,1.0,2.0,2.0,,,,,,,,,,,,2.0,,,1.0,2.0,,2.0,2.0,2.0,,,,,,,,3.0,2.0,,1.0,1.0,2,,2.0,,1,2.0,,,,,,,,,,2.0,1.0,1.0,2.0,4.0,,,,1.0,60.0,,31.0,50.0,1.0,36.0,1.0,1.0,,1.5,16.0,,,,,,,,3.0,2.0,,,,,,,,,,,3.0,,,,,,,,,,,,,,,,,,,5.0,,,,,,,,,,,,,,,1.0,2.0,,1.0,2.0,,1,2,1.0,2.0,2.0,2011-07-04,2011-07-04,,,,,,,,,1,,,,,,,,,2.0,,,,,,5.0,,1.0,,2.0,2.0,2.0,,,,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,1.0,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,2.0,1.0,,2.0,2.0,,,,,1.4,,,2.0,,2.0,2.0,2.0,1.0,2,2,2,2,2,2,2,2,1,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2.0,,NaT,,,,,,0.045,0.148,0
3,4,59,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-05,2011-07-09,160.0,127.5,1.0,4.0,2.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,1.0,,2.0,2.0,2.0,,,,,,,,,35.0,0.9,3.5,7.4,3.0,1.0,6.4,2.0,,,,,2.0,,,,,,,,,,,,,,,,,1.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,1.0,2.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,2.0,,,,1.0,60.0,,33.0,51.0,1.0,35.0,2.0,,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-05,2011-07-05,,,,,34.8,,19.0,,3,,,2.0,1.0,2.0,,2.0,73.0,2.0,,,,,,2.0,47.0,3.0,1.0,2.0,2.0,2.0,,,,,1.0,2.0,0.0,0.0,0.0,,,2.0,2.0,1.0,1.0,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,0.0,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,2.0,1.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.013,0.074,0
4,5,72,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-06,2011-07-10,160.0,64.0,2.0,,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,37.0,0.9,3.8,5.7,3.0,1.0,6.4,2.0,,,,,2.0,,,,,,,,,,,,,,,,,5.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,1.0,1.0,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,60.0,,21.0,40.0,1.0,40.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,1.0,2.0,,1.0,2.0,,1,2,1.0,2.0,2.0,2011-07-06,2011-07-06,,,,,34.6,,19.0,,3,,,2.0,1.0,2.0,,2.0,70.0,2.0,,,,,,2.0,40.0,2.0,1.0,2.0,2.0,2.0,,,,,1.0,1.0,0.0,0.0,0.0,,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,1.0,,1.0,,,,,,,,,,,0.0,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,0.8,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.016,0.019,0


### Categorical Variable Levels

In [441]:
print(raw_data['gender'].unique())
print(raw_data['racecaucasian'].unique())
print(raw_data['raceblack'].unique())
print(raw_data['raceasian'].unique())
print(raw_data['racenativeam'].unique())
print(raw_data['racnativepacific'].unique())
print(raw_data['raceother'].unique())
print(raw_data['ethnicity'].unique())

[  1.   2.  nan]
[  1.   2.  nan]
[  2.   1.  nan]
[  2.   1.  nan]
[  2.   1.  nan]
[  2.  nan   1.]
[  2.  nan   1.]
[  2.   1.  nan   3.]


#### Testing Replacement Code

- `df['col_name'].replace({replacement_dict})`
- replacement_dict `{old_value_1: new_value_1, old_value_2: new_value_2, np.nan: new_value_3}`

In [442]:
test_df = raw_data.copy()

In [443]:
test_df['ethnicity'] = test_df['ethnicity'].replace({1: 1, 2: 0, np.nan: 0})

In [444]:
# confirming replacement code worked
print(raw_data['ethnicity'].unique())
print(test_df['ethnicity'].unique())

[  2.   1.  nan   3.]
[ 0.  1.  3.]


In [445]:
# confirming that original data types retained after re-coding
print(raw_data['ethnicity'].dtype)
print(test_df['ethnicity'].dtype)

float64
float64


### More Categorical Levels

In [446]:
raw_data.head(1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStInn,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [447]:
print(sorted(raw_data['diabetes'].unique()))
print(sorted(raw_data['diabctrl'].unique()))
print(sorted(raw_data['dyslip'].unique()))
print(sorted(raw_data['dialysis'].unique()))
print(sorted(raw_data['hypertn'].unique()))

[1.0, 2.0, nan, 3.0]
[1.0, nan, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]
[1.0, 2.0, nan, 3.0]
[1.0, 2.0, nan, 3.0]
[1, 2, 3]


- `hypertn` had no `NaN`s

### `infendo` and `infendty`

In [448]:
print(sorted(raw_data['infendo'].unique()))
print(sorted(raw_data['infendty'].unique()))

[1.0, 2.0, nan]
[nan, 1.0, 2.0]


In [449]:
raw_data.head(1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStInn,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [450]:
print(sorted(raw_data['TobaccoUse'].unique()))
print(sorted(raw_data['chrlungd'].unique()))
print(sorted(raw_data['hmo2'].unique()))
print(sorted(raw_data['slpapn'].unique()))
print(sorted(raw_data['ivdrugab'].unique()))
print(sorted(raw_data['alcohol'].unique()))

[nan, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
[1.0, 2.0, 3.0, 4.0, nan, 5.0, 6.0]
[1.0, 2.0, 3.0, 4.0, nan, 5.0]
[1.0, 2.0, 3.0, nan]
[1.0, 2.0, 3.0, 4.0, 5.0, nan]
[1.0, 2.0, 3.0, nan, 4.0, 5.0]


- should we put the text in the `replacement_dict` where there there are a lot (>3) levels that will be converted to dummies such that column names will be human readable??

In [451]:
raw_data.head(1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStInn,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [452]:
print(sorted(raw_data['liverdis'].unique()))
print(sorted(raw_data['immsupp'].unique()))
print(sorted(raw_data['mediastrad'].unique()))
print(sorted(raw_data['cancer'].unique()))
print(sorted(raw_data['pvd'].unique()))
print(sorted(raw_data['ThAoDisease'].unique()))
print(sorted(raw_data['syncope'].unique()))
print(sorted(raw_data['unrespstat'].unique()))
print(sorted(raw_data['cvd'].unique()))

[1.0, 2.0, nan, 3.0]
[1, 2, 3]
[1.0, 2.0, 3.0, nan]
[1.0, 2.0, 3.0, nan]
[1.0, 2.0, nan, 3.0]
[nan, 1.0, 2.0, 3.0]
[1.0, 2.0, 3.0, nan]
[1.0, 2.0, 3.0, nan]
[1.0, 2.0, 3.0, nan]


- `immsupp` has no `NaN`s

In [453]:
print(sorted(raw_data['cva'].unique()))
print(sorted(raw_data['cvawhen'].unique()))

[nan, 1.0, 2.0, 3.0]
[nan, 1.0, 2.0, 3.0, 4.0]


In [454]:
raw_data.head(1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStInn,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [455]:
print(sorted(raw_data['cvdtia'].unique()))
print(sorted(raw_data['cvdcarsten'].unique()))
print(sorted(raw_data['cvdstenrt'].unique()))
print(sorted(raw_data['cvdstenlft'].unique()))
print(sorted(raw_data['cvdpcarsurg'].unique()))

[nan, 1.0, 2.0, 3.0]
[nan, 1.0, 2.0, 3.0, 4.0]
[nan, 1.0, 2.0, 3.0, 4.0]
[nan, 1.0, 2.0, 3.0, 4.0]
[nan, 1.0, 2.0]


In [456]:
raw_data.head(1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStInn,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [457]:
print(sorted(raw_data['hitanti'].unique()))
print(sorted(raw_data['cigsmoker'].unique()))
print(sorted(raw_data['cigsmokercurr'].unique()))
print(sorted(raw_data['prcvint'].unique()))
print(sorted(raw_data['prcab'].unique()))
print(sorted(raw_data['prvalve'].unique()))

[1.0, 2.0, 3.0, nan]
[1.0, 2.0, nan]
[1.0, nan, 2.0]
[1.0, 2.0, nan, 3.0]
[nan, 1.0, 2.0]
[nan, 1.0, 2.0]


- testing to see if you can use a `replacement_dict` with keys `1, 2, 3, np.nan` for columns that are only `1,2, np.nan`

In [458]:
# `cigsmoker` only has 1=YES, 2=NO and NaN
test_df['cigsmoker'] = test_df['cigsmoker'].replace({1: 1, 2: 0, 3: 0, np.nan: 0})

In [459]:
# confirming replacement code worked
print(raw_data['cigsmoker'].unique())
print(test_df['cigsmoker'].unique())

[  1.   2.  nan]
[ 1.  0.]


In [460]:
# confirming that original data types retained after re-coding
print(raw_data['cigsmoker'].dtype)
print(test_df['cigsmoker'].dtype)

float64
float64


#### Good - can use the same `replacement_dict` for all `yes, no` and `yes, no, unknown` categoricals

In [461]:
raw_data.head(1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStInn,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [462]:
print(sorted(raw_data['CardSympTimeOfAdm'].unique()))
print(sorted(raw_data['CardSympTimeOfSurg'].unique()))
print(sorted(raw_data['anginalclass'].unique()))
print(sorted(raw_data['chf'].unique()))
print(sorted(raw_data['classnyh'].unique()))
print(sorted(raw_data['priorhf'].unique()))
print(sorted(raw_data['carshock'].unique()))
print(sorted(raw_data['resusc'].unique()))

[nan, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]
[nan, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]
[1.0, 2.0, 3.0, 4.0, 5.0, nan]
[1.0, 2.0, nan, 3.0]
[nan, 1.0, 2.0, 3.0, 4.0]
[1.0, 2.0, nan, 3.0]
[1.0, 2.0, 3.0, 4.0, nan]
[1.0, 2.0, 3.0, 4.0, nan]


In [463]:
raw_data.head(1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStInn,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [464]:
print(sorted(raw_data['Arrhythmia'].unique()))
print(sorted(raw_data['ArrhythAFlutter'].unique()))
print(sorted(raw_data['ArrhythAFib'].unique()))
print(sorted(raw_data['ArrhythAFibDur'].unique()))
print(sorted(raw_data['arrhythwhen'].unique()))
print(sorted(raw_data['arrhyafib'].unique()))
print(sorted(raw_data['arrhyafibty'].unique()))

[nan, 1.0, 2.0, 3.0]
[nan, 1.0, 2.0, 3.0]
[nan, 1.0, 2.0, 3.0]
[nan, 1.0, 2.0, 3.0]
[1.0, 2.0, 3.0, nan]
[nan, 1.0, 2.0]
[nan, 1.0, 2.0]


In [465]:
raw_data.head(1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStInn,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [466]:
print(sorted(raw_data['medasa'].unique()))
print(sorted(raw_data['medaplt5days'].unique()))
print(sorted(raw_data['medinotr'].unique()))
print(sorted(raw_data['medlipid'].unique()))
print(sorted(raw_data['numdisv'].unique()))
print(sorted(raw_data['hdefd'].unique()))

[1.0, 2.0, 3.0, 4.0, nan]
[1, 2, 3, 4]
[1.0, 2.0, nan]
[1.0, 2.0, nan, 3.0, 4.0]
[1.0, 2.0, 3.0, 4.0, nan]
[1.0, 2.0, nan]


In [467]:
raw_data.head(1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStInn,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [468]:
print(sorted(raw_data['vdaort'].unique()))
print(sorted(raw_data['vdstena'].unique()))
print(sorted(raw_data['vdinsufm'].unique()))
print(sorted(raw_data['vdstenm'].unique()))
print(sorted(raw_data['vdinsuft'].unique()))
print(sorted(raw_data['incidenc'].unique()))
print(sorted(raw_data['status'].unique()))

[1.0, 2.0, nan]
[1.0, 2.0, nan]
[0.0, 1.0, 2.0, 3.0, 4.0, nan, 5.0]
[2.0, nan, 1.0]
[2.0, 3.0, nan, 0.0, 1.0, 4.0, 5.0]
[1.0, 2.0, 3.0, 4.0, 5.0, nan]
[1.0, 2.0, 3.0, nan, 4.0]


### `Outcome` Variables ????

In [469]:
raw_data.head(1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStInn,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [470]:
sorted(raw_data.columns.values.tolist())

['ADEt1',
 'ADEt2',
 'ADEt3',
 'ADLesTAneur',
 'ADLesTCoarcNar',
 'ADLesTDis',
 'ADLesTDisTmg',
 'ADLesTDisTy',
 'ADLesTIntraHema',
 'ADLesTPenUlcer',
 'ADLesTPseudo',
 'ADLesTRup',
 'ADLocArch',
 'ADLocAsc',
 'ADLocDesThor',
 'ADLocRoot',
 'ADLocThora',
 'ADPres',
 'AoHemoDatAvail',
 'AortProcAsc',
 'AortProcDesProx',
 'AortProcHemi',
 'AortProcRoot',
 'AortProcTotArch',
 'ArrhythAFib',
 'ArrhythAFibDur',
 'ArrhythAFlutter',
 'ArrhythAtrFib',
 'ArrhythPPaced',
 'ArrhythVV',
 'Arrhythmia',
 'AsmtAoDxMeth',
 'BldRBC',
 'CABHybrPCI',
 'CAortReint',
 'CAortReintTy',
 'CNEnceph',
 'CNParesisTy',
 'CNStrokT',
 'COtLiver',
 'CReintMI',
 'CReintMIIntTy',
 'CReintMIVes',
 'CVaAoDisTy',
 'CanArtStInn',
 'CardSympTimeOfAdm',
 'CardSympTimeOfSurg',
 'CathBasAssistTy',
 'ChrLungDType',
 'CombCardPCI',
 'CombProcs',
 'CombProcsPCI',
 'CombProcsStatus',
 'CombProcsStentTy',
 'CompMAD',
 'CompMAD1',
 'CompMAD2',
 'CompMAD3',
 'DCFactorXa',
 'DCNovOrAnti',
 'DCOthAnticoag',
 'DCOthAntiplat',
 'DialSta

In [471]:
# when you sort a list of strings, python first alphabetizes strings that start with capital letters
# then goes to strings that start with lower case letters

print(sorted(raw_data['cnstrokp'].unique())) # no NaNs
print(sorted(raw_data['cnstrokttia'].unique())) # no NaNs

[1, 2, 3, 4, 5]
[1, 2]


### Cleaning and Recoding Functions Pseudocode

#### Recoding `2-3` Level Categoricals `(YES=1, NO=0, np.nan = 0)`

- create a list of columns you want to recode
- use to mask main dataframe - arguments to the function will be column list (`col_list`) and main dataset `dataframe`
- iterate through the list of columns and apply replacement dictionary
- for `column` in `col_list`
- `masked_dataframe[column] = masked_dataframe[column].replace({1: 1, 2: 0, np.nan=0})`
- `return masked_dataframe`

#### Recoding Multi-Level Categoricals ( > 3) and Binarizing
- more complicated, may need to do one at a time
- through the function, need to pass the column name, replacement dictionary(ies) and main dataframe
- have to deal with the `NaN`s -- or incorporate through the numeric codes to text step - directly
- for human readable column headings, need to convert numeric codes to text headings
- then apply `pd.get_dummies()` or use functions come up with last week
- how are you going to avoid the dummy variable trap `k-1`
- can use the `drop_first=True` parameter in `pd.get_dummies()` to get `k-1` dummies out of` k` categorical levels by removing the first level
- can you specify which column is the reference category??
- I think the best way is to run `pd.get_dummies()` without the `drop_first` parameter and then before you return the `dataframe` you specify the reference column to `drop`
- `k-1_dummies_df = full_dummies_df.drop(drop_col, axis=1)` where `drop_col` is a `string` passed as a parameter in your function

### `categorical_to_numeric` Function for reference

### Remember - need two datasets - `cleaned only` and `cleaned AND binarized`
- can keep the categorical levels - just clean it up for `decision trees` as it is better for them that features are not binarized

### Combined Categorical Variable List

#### Raw List

#### Edited

#### Creating `cat_features` List

In [472]:
cat_features = ['gender',
                'racecaucasian',
                'raceblack',
                'raceasian',
                'racenativeam',
                'racnativepacific',
                'ethnicity',
                'diabetes',
                'diabctrl',
                'dyslip',
                'dialysis',
                'hypertn',
                'infendo',
                'infendty',
                'TobaccoUse',
                'chrlungd',
                'hmo2',
                'slpapn',
                'ivdrugab',
                'alcohol',
                'liverdis',
                'immsupp',
                'mediastrad',
                'cancer',
                'pvd',
                'ThAoDisease',
                'syncope',
                'unrespstat',
                'cvd',
                'cva',
                'cvawhen',
                'cvdtia',
                'cvdcarsten',
                'cvdstenrt',
                'cvdstenlft',
                'cvdpcarsurg',
                'hitanti',
                'cigsmoker',
                'cigsmokercurr',
                'prcvint',
                'prcab',
                'prvalve',
                'CardSympTimeOfAdm',
                'CardSympTimeOfSurg',
                'anginalclass',
                'chf',
                'classnyh',
                'priorhf',
                'carshock',
                'resusc',
                'Arrhythmia',
                'ArrhythAFlutter',
                'ArrhythAFib',
                'ArrhythAFibDur',
                'arrhythwhen',
                'arrhyafib',
                'medasa',
                'medaplt5days',
                'medinotr',
                'medlipid',
                'numdisv',
                'hdefd',
                'vdaort',
                'vdstena',
                'vdinsufm',
                'vdstenm',
                'vdinsuft',
                'incidenc',
                'status']

In [473]:
len(cat_features)

69

#### `yes_no_unc` Feature List

In [474]:
yes_no_unc = ['gender',
              'racecaucasian',
              'raceblack',
              'raceasian',
              'racenativeam',
              'racnativepacific',
              'ethnicity',
              'diabetes',
              'dyslip',
              'dialysis',
              'hypertn',
              'infendo',
              'slpapn',
              'liverdis',
              'immsupp',
              'mediastrad',
              'cancer',
              'pvd',
              'ThAoDisease',
              'syncope',
              'unrespstat',
              'cvd',
              'cva',
              'cvdtia',
              'cvdpcarsurg',
              'hitanti',
              'cigsmoker',
              'cigsmokercurr',
              'prcvint',
              'prcab',
              'prvalve',
              'chf',
              'priorhf',
              'Arrhythmia',
              'arrhyafib',
              'medinotr',
              'hdefd',
              'vdaort',
              'vdstena',
              'vdstenm']

In [475]:
len(yes_no_unc)

40

#### `compress_to_two` Feature List

In [476]:
compress_to_two = ['diabctrl',
                   'infendty',
                   'TobaccoUse',
                   'chrlungd',
                   'hmo2',
                   'ivdrugab',
                   'alcohol',
                   'cvawhen',
                   'carshock', # rename`carshock24`
                   'resusc',   # rename `reusc24`
                   'medasa',
                   'medaplt5days',
                   'medlipid',
                   'numdisv']

In [477]:
len(compress_to_two)

14

#### `recode_D` Feature List - Will Be Recoding and Creating Dummies w/Reference Class - No Parent Variable

In [478]:
recode_D = ['CardSympTimeOfAdm',
            'CardSympTimeOfSurg',
            'anginalclass',
            'classnyh',
            'vdinsufm',
            'vdinsuft',
            'incidenc', # rename to 'incidenc_REOP'
            'status']

In [479]:
len(recode_D)

8

#### `recode_D_P` Feature List - Will Be Recoding and Creating Dummies w/Reference Class and w/o Reference Class Due to Parent-Child Relationship

In [480]:
recode_D_P = ['cvdcarsten',      # parent is `cvd`
              'cvdstenrt',       # parent is 'cvd'
              'cvdstenlft',      # parent is 'cvd'
              'ArrhythAFlutter', # parent is 'Arrhythmia'
              'ArrhythAFib',     # parent is 'Arrhythmia'
              'ArrhythAFibDur',  # parent is 'Arrhythmia'
              'arrhythwhen']     # parent is 'Arrhythmia'
                

In [481]:
len(recode_D_P)

7

In [482]:
40 + 14 + 8 + 7

69

#### Numerical Features

In [483]:
raw_data.head(1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStInn,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [722]:
numerical_features = ['recordId', # need to keep even though will not be included in final matrix
                      'age',
                      'heightcm',
                      'weightkg',
                      'hct',
                      'creatlst',
                      'totalbumin',
                      'a1clvl',
                      'meldscr',
                      'hdef',
                      'pasys']

In [723]:
original_numerical_features = numerical_features

In [485]:
len(numerical_features)

11

#### `datetime` Features

In [486]:
date_features = ['surgdt']#,
                # 'dischdt']

In [487]:
len(date_features)

1

In [488]:
# Ties to excel sheet -- all features accounted for
69 + 10 + 1 # use 10 instead of 11 because of `recordId`

80

### Creating `pre_op_features`

In [489]:
pre_op_features = numerical_features + date_features + cat_features

In [490]:
pre_op_features

['recordId',
 'age',
 'heightcm',
 'weightkg',
 'hct',
 'creatlst',
 'totalbumin',
 'a1clvl',
 'meldscr',
 'hdef',
 'pasys',
 'surgdt',
 'gender',
 'racecaucasian',
 'raceblack',
 'raceasian',
 'racenativeam',
 'racnativepacific',
 'ethnicity',
 'diabetes',
 'diabctrl',
 'dyslip',
 'dialysis',
 'hypertn',
 'infendo',
 'infendty',
 'TobaccoUse',
 'chrlungd',
 'hmo2',
 'slpapn',
 'ivdrugab',
 'alcohol',
 'liverdis',
 'immsupp',
 'mediastrad',
 'cancer',
 'pvd',
 'ThAoDisease',
 'syncope',
 'unrespstat',
 'cvd',
 'cva',
 'cvawhen',
 'cvdtia',
 'cvdcarsten',
 'cvdstenrt',
 'cvdstenlft',
 'cvdpcarsurg',
 'hitanti',
 'cigsmoker',
 'cigsmokercurr',
 'prcvint',
 'prcab',
 'prvalve',
 'CardSympTimeOfAdm',
 'CardSympTimeOfSurg',
 'anginalclass',
 'chf',
 'classnyh',
 'priorhf',
 'carshock',
 'resusc',
 'Arrhythmia',
 'ArrhythAFlutter',
 'ArrhythAFib',
 'ArrhythAFibDur',
 'arrhythwhen',
 'arrhyafib',
 'medasa',
 'medaplt5days',
 'medinotr',
 'medlipid',
 'numdisv',
 'hdefd',
 'vdaort',
 'vdstena'

In [491]:
len(pre_op_features)

81

### Plan for Saturday
- start a new file - save as
- mask `raw_data.copy()` by `pre_op_features`
- recode features - new levels, replacing `NaN`s
- before create `dummies` - recode to new numeric levels for `decision tree` models
- create `dummies`
- rename columns
- put together `feature matrix`
- split by dates in terms of the databases

## Saturday - October 5, 2019

#### Step 1. Delete records where `age` and `gender` are `NaN`

In [492]:
raw_data.head(1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStInn,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [493]:
raw_data.shape

(42746, 409)

#### Checking Key Variables for `NaN`s

In [494]:
print (raw_data['age'].isnull().sum())
print (raw_data['gender'].isnull().sum())
print (raw_data['surgdt'].isnull().sum())
#print (raw_data['dischdt'].isnull().sum())
print (raw_data['heightcm'].isnull().sum())
print (raw_data['weightkg'].isnull().sum())

0
3
0
2
3


- going to delete rows where there are `NaN`s in these key features - `gender`, `heightcm` and `weightkg`

In [495]:
raw_data[raw_data['gender'].isnull()]

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStInn,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
29564,29565,71,,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2015-12-28,2016-01-16,185.0,82.0,2.0,,2.0,2.0,2,2.0,,5.0,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,14.1,40.9,1.02,4.0,4.8,3.0,1.1,7.66,,,,,,1.0,2.0,2.0,,,,,1.0,20.0,1.0,,,,,,7.0,7.0,1.0,2.0,,2.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,,,,,,2.0,1.0,2,,2.0,,2,2.0,,,2.0,,,,2.0,,,,2.0,2.0,,1.0,,2.0,,1.0,45.0,1.0,34.0,52.0,1.0,46.0,2.0,,,,,,,,,,,,4.0,2.0,,,1.0,,,,1.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,2,1,1.0,2.0,2.0,2015-12-28,2015-12-28,,,,,34.0,3.0,31.0,199.0,3,,,2.0,1.0,2.0,2.0,2.0,337.0,2.0,,,,,,2.0,312.0,4.0,1.0,2.0,2.0,1.0,,2.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,1.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2.0,,,,,,,,,,,,,,,,,3.0,2.0,2.0,1.0,261.0,2.0,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,1.0,2.0,,2.0,140.0,1.2,,,2.0,,1.0,1.0,2.0,1.0,1,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1.0,1.0,2016-01-21,999.0,,,,,,,0
29703,29704,68,,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2016-02-19,2016-02-23,177.0,94.0,2.0,,2.0,2.0,2,2.0,,5.0,3.0,1.0,2.0,2.0,2.0,2.0,4.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,,2.0,3.0,,3.0,2.0,14.2,41.5,0.91,3.7,5.5,3.0,1.0,6.4,,,,,,2.0,,,,,,,,,,,,,,,7.0,7.0,1.0,2.0,,2.0,2.0,2.0,2.0,,,,,,,,,,2.0,2.0,2,,2.0,,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,2.0,2.0,1.0,,2.0,,1.0,60.0,1.0,32.0,45.0,2.0,,1.0,1.0,1.0,0.4,61.0,5.0,2.0,,,,,,1.0,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,2,1,1.0,2.0,2.0,2016-02-19,2016-02-19,,,,,32.0,3.0,29.0,228.0,3,,,2.0,1.0,2.0,2.0,2.0,134.0,2.0,,,,,,2.0,109.0,4.0,1.0,2.0,2.0,1.0,,2.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,1.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,,1.0,2.0,,,,,,,,,2.0,,,,,2.0,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,132.0,1.0,,,2.0,,2.0,2.0,2.0,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,2.0,NaT,,,,,,,,0
29991,29992,54,,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2016-06-24,2016-06-28,182.8,75.1,2.0,,1.0,2.0,1,2.0,,2.0,1.0,,2.0,2.0,2.0,3.0,1.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,,2.0,2.0,3.0,,2.0,16.1,45.2,1.06,3.6,5.7,3.0,1.0,6.96,,,,,,1.0,2.0,2.0,,,,,2.0,,,,,,,,5.0,5.0,4.0,2.0,,2.0,2.0,2.0,2.0,,,,,,,,,,2.0,1.0,1,2.0,2.0,,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,1.0,2.0,4.0,,2.0,,1.0,55.0,1.0,32.0,43.0,1.0,27.8,2.0,,,,,,,,,,,,2.0,2.0,,,1.0,,,,1.0,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2.0,2.0,1.0,2.0,,3,2,1.0,2.0,2.0,2016-06-24,2016-06-24,,,,,35.0,3.0,29.0,151.0,3,,,2.0,1.0,2.0,2.0,2.0,113.0,2.0,,,,,,2.0,91.0,2.0,1.0,2.0,2.0,3.0,,,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,2.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,4.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,148.0,1.1,,,2.0,,2.0,2.0,2.0,1.0,1,1,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,2.0,NaT,,,,,,,,0


In [496]:
raw_data[raw_data['gender'].notnull()].shape

(42743, 409)

In [497]:
raw_data[raw_data['heightcm'].isnull()]

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStInn,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
22182,22183,65,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2015-04-06,2015-04-08,,,1.0,7.0,3.0,2.0,3,2.0,,6.0,6.0,,5.0,3.0,3.0,3.0,5.0,3.0,3,3.0,3.0,3.0,3.0,3.0,1.0,3.0,,,,,,,,,,,,,3.0,,,,,,,,1.0,2.0,2.0,,,,,2.0,,,,,,,,5.0,5.0,4.0,3.0,,3.0,3.0,4.0,1.0,2.0,3.0,1.0,1.0,,,,,,2.0,2.0,2,,1.0,1.0,2,2.0,,,2.0,,,,2.0,,,2.0,1.0,2.0,2.0,4.0,70.0,2.0,,2.0,,2.0,,,2.0,,2.0,,,,,,,,,,,,2.0,2.0,,,1.0,,,,1.0,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,1.0,1.0,2.0,,3,2,1.0,2.0,2.0,2015-04-06,2015-04-07,,,,,33.7,3.0,22.0,509.0,3,,,2.0,1.0,2.0,2.0,2.0,78.0,2.0,,,,,,2.0,63.0,2.0,1.0,2.0,2.0,1.0,,2.0,2.0,2.0,1.0,4.0,4.0,1.0,0.0,4.0,,1.0,2.0,1.0,1.0,2.0,,,1.0,1.0,2.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,3.0,2015-04-08,1.0,1.0,2,1.0,2.0,,,,,,,,,192.0,2.5,,,2.0,,2.0,2.0,2.0,1.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1.0,,NaT,,,,,,0.031,0.107,0
29600,29601,63,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2016-01-15,2016-01-21,,,1.0,4.0,1.0,2.0,1,2.0,,1.0,1.0,,2.0,2.0,2.0,2.0,4.0,2.0,1,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,4.0,2.0,1.0,,,2.0,14.4,43.7,1.45,4.0,8.1,3.0,1.0,9.97,,,,,,1.0,2.0,2.0,,,,,2.0,,,,,,,,3.0,3.0,4.0,2.0,,2.0,2.0,2.0,2.0,,,,,,,,,,2.0,1.0,2,,2.0,,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,1.0,2.0,4.0,20.0,2.0,,1.0,55.0,2.0,,,2.0,,1.0,2.0,,,,1.0,,,,,,,0.0,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,3,2,1.0,2.0,2.0,2016-01-15,2016-01-15,,,,,34.0,3.0,29.0,258.0,3,,,2.0,1.0,2.0,2.0,2.0,162.0,2.0,,,,,,2.0,123.0,4.0,1.0,1.0,2.0,1.0,,2.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,2.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,198.0,1.6,,,2.0,,2.0,2.0,2.0,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,1.0,2016-02-01,998.0,,,,,0.026,0.086,0


In [498]:
raw_data[raw_data['weightkg'].isnull()]

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStInn,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
8710,8711,62,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2016-12-14,2016-12-22,170.2,,2.0,,1.0,2.0,1,2.0,,5.0,1.0,,2.0,2.0,2.0,5.0,4.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,4.0,2.0,1.0,,,2.0,21.1,35.8,1.16,4.7,,3.0,1.08,8.69,,,,,,2.0,,,,,,,,,,,,,,,7.0,7.0,2.0,1.0,3.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,,,,,,2.0,1.0,2,,2.0,,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,1.0,2.0,1.0,,2.0,,1.0,57.5,1.0,40.1,55.0,1.0,37.0,1.0,1.0,1.0,0.5,48.0,5.0,2.0,,,,,,2.0,2.0,,,1.0,,,,1.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,2,1,1.0,1.0,2.0,2016-12-14,2016-12-14,,,,,36.0,3.0,24.0,147.0,3,,,2.0,1.0,2.0,2.0,2.0,100.0,2.0,,,,,,2.0,68.0,4.0,1.0,2.0,2.0,1.0,,1.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,1.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,,1.0,2.0,,,,,,,,,2.0,,,,,2.0,,,2.0,,,,2.0,,,1.0,,,,2.0,2.0,2.0,,,,1.0,2.0,2.0,2.0,,,,,,,,,,,,,1.0,1.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,1.0,,2.0,108.0,1.1,,,2.0,,1.0,1.0,2.0,1.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2.0,2.0,NaT,,,,,,0.014,0.025,0
22182,22183,65,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2015-04-06,2015-04-08,,,1.0,7.0,3.0,2.0,3,2.0,,6.0,6.0,,5.0,3.0,3.0,3.0,5.0,3.0,3,3.0,3.0,3.0,3.0,3.0,1.0,3.0,,,,,,,,,,,,,3.0,,,,,,,,1.0,2.0,2.0,,,,,2.0,,,,,,,,5.0,5.0,4.0,3.0,,3.0,3.0,4.0,1.0,2.0,3.0,1.0,1.0,,,,,,2.0,2.0,2,,1.0,1.0,2,2.0,,,2.0,,,,2.0,,,2.0,1.0,2.0,2.0,4.0,70.0,2.0,,2.0,,2.0,,,2.0,,2.0,,,,,,,,,,,,2.0,2.0,,,1.0,,,,1.0,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,1.0,1.0,2.0,,3,2,1.0,2.0,2.0,2015-04-06,2015-04-07,,,,,33.7,3.0,22.0,509.0,3,,,2.0,1.0,2.0,2.0,2.0,78.0,2.0,,,,,,2.0,63.0,2.0,1.0,2.0,2.0,1.0,,2.0,2.0,2.0,1.0,4.0,4.0,1.0,0.0,4.0,,1.0,2.0,1.0,1.0,2.0,,,1.0,1.0,2.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,3.0,2015-04-08,1.0,1.0,2,1.0,2.0,,,,,,,,,192.0,2.5,,,2.0,,2.0,2.0,2.0,1.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1.0,,NaT,,,,,,0.031,0.107,0
29600,29601,63,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2016-01-15,2016-01-21,,,1.0,4.0,1.0,2.0,1,2.0,,1.0,1.0,,2.0,2.0,2.0,2.0,4.0,2.0,1,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,4.0,2.0,1.0,,,2.0,14.4,43.7,1.45,4.0,8.1,3.0,1.0,9.97,,,,,,1.0,2.0,2.0,,,,,2.0,,,,,,,,3.0,3.0,4.0,2.0,,2.0,2.0,2.0,2.0,,,,,,,,,,2.0,1.0,2,,2.0,,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,1.0,2.0,4.0,20.0,2.0,,1.0,55.0,2.0,,,2.0,,1.0,2.0,,,,1.0,,,,,,,0.0,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,3,2,1.0,2.0,2.0,2016-01-15,2016-01-15,,,,,34.0,3.0,29.0,258.0,3,,,2.0,1.0,2.0,2.0,2.0,162.0,2.0,,,,,,2.0,123.0,4.0,1.0,1.0,2.0,1.0,,2.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,2.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,198.0,1.6,,,2.0,,2.0,2.0,2.0,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,1.0,2016-02-01,998.0,,,,,0.026,0.086,0


- between the key features with `NaN`s - we will be deleting `6` rows

In [499]:
raw_data[(raw_data['gender'].isnull()) | 
         (raw_data['heightcm'].isnull()) | 
         (raw_data['weightkg'].isnull())]

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStInn,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
8710,8711,62,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2016-12-14,2016-12-22,170.2,,2.0,,1.0,2.0,1,2.0,,5.0,1.0,,2.0,2.0,2.0,5.0,4.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,4.0,2.0,1.0,,,2.0,21.1,35.8,1.16,4.7,,3.0,1.08,8.69,,,,,,2.0,,,,,,,,,,,,,,,7.0,7.0,2.0,1.0,3.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,,,,,,2.0,1.0,2,,2.0,,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,1.0,2.0,1.0,,2.0,,1.0,57.5,1.0,40.1,55.0,1.0,37.0,1.0,1.0,1.0,0.5,48.0,5.0,2.0,,,,,,2.0,2.0,,,1.0,,,,1.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,2,1,1.0,1.0,2.0,2016-12-14,2016-12-14,,,,,36.0,3.0,24.0,147.0,3,,,2.0,1.0,2.0,2.0,2.0,100.0,2.0,,,,,,2.0,68.0,4.0,1.0,2.0,2.0,1.0,,1.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,1.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,,1.0,2.0,,,,,,,,,2.0,,,,,2.0,,,2.0,,,,2.0,,,1.0,,,,2.0,2.0,2.0,,,,1.0,2.0,2.0,2.0,,,,,,,,,,,,,1.0,1.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,1.0,,2.0,108.0,1.1,,,2.0,,1.0,1.0,2.0,1.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2.0,2.0,NaT,,,,,,0.014,0.025,0
22182,22183,65,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2015-04-06,2015-04-08,,,1.0,7.0,3.0,2.0,3,2.0,,6.0,6.0,,5.0,3.0,3.0,3.0,5.0,3.0,3,3.0,3.0,3.0,3.0,3.0,1.0,3.0,,,,,,,,,,,,,3.0,,,,,,,,1.0,2.0,2.0,,,,,2.0,,,,,,,,5.0,5.0,4.0,3.0,,3.0,3.0,4.0,1.0,2.0,3.0,1.0,1.0,,,,,,2.0,2.0,2,,1.0,1.0,2,2.0,,,2.0,,,,2.0,,,2.0,1.0,2.0,2.0,4.0,70.0,2.0,,2.0,,2.0,,,2.0,,2.0,,,,,,,,,,,,2.0,2.0,,,1.0,,,,1.0,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,1.0,1.0,2.0,,3,2,1.0,2.0,2.0,2015-04-06,2015-04-07,,,,,33.7,3.0,22.0,509.0,3,,,2.0,1.0,2.0,2.0,2.0,78.0,2.0,,,,,,2.0,63.0,2.0,1.0,2.0,2.0,1.0,,2.0,2.0,2.0,1.0,4.0,4.0,1.0,0.0,4.0,,1.0,2.0,1.0,1.0,2.0,,,1.0,1.0,2.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,3.0,2015-04-08,1.0,1.0,2,1.0,2.0,,,,,,,,,192.0,2.5,,,2.0,,2.0,2.0,2.0,1.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1.0,,NaT,,,,,,0.031,0.107,0
29564,29565,71,,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2015-12-28,2016-01-16,185.0,82.0,2.0,,2.0,2.0,2,2.0,,5.0,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,14.1,40.9,1.02,4.0,4.8,3.0,1.1,7.66,,,,,,1.0,2.0,2.0,,,,,1.0,20.0,1.0,,,,,,7.0,7.0,1.0,2.0,,2.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,,,,,,2.0,1.0,2,,2.0,,2,2.0,,,2.0,,,,2.0,,,,2.0,2.0,,1.0,,2.0,,1.0,45.0,1.0,34.0,52.0,1.0,46.0,2.0,,,,,,,,,,,,4.0,2.0,,,1.0,,,,1.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,2,1,1.0,2.0,2.0,2015-12-28,2015-12-28,,,,,34.0,3.0,31.0,199.0,3,,,2.0,1.0,2.0,2.0,2.0,337.0,2.0,,,,,,2.0,312.0,4.0,1.0,2.0,2.0,1.0,,2.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,1.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2.0,,,,,,,,,,,,,,,,,3.0,2.0,2.0,1.0,261.0,2.0,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,1.0,2.0,,2.0,140.0,1.2,,,2.0,,1.0,1.0,2.0,1.0,1,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1.0,1.0,2016-01-21,999.0,,,,,,,0
29600,29601,63,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2016-01-15,2016-01-21,,,1.0,4.0,1.0,2.0,1,2.0,,1.0,1.0,,2.0,2.0,2.0,2.0,4.0,2.0,1,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,4.0,2.0,1.0,,,2.0,14.4,43.7,1.45,4.0,8.1,3.0,1.0,9.97,,,,,,1.0,2.0,2.0,,,,,2.0,,,,,,,,3.0,3.0,4.0,2.0,,2.0,2.0,2.0,2.0,,,,,,,,,,2.0,1.0,2,,2.0,,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,1.0,2.0,4.0,20.0,2.0,,1.0,55.0,2.0,,,2.0,,1.0,2.0,,,,1.0,,,,,,,0.0,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,3,2,1.0,2.0,2.0,2016-01-15,2016-01-15,,,,,34.0,3.0,29.0,258.0,3,,,2.0,1.0,2.0,2.0,2.0,162.0,2.0,,,,,,2.0,123.0,4.0,1.0,1.0,2.0,1.0,,2.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,2.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,198.0,1.6,,,2.0,,2.0,2.0,2.0,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,1.0,2016-02-01,998.0,,,,,0.026,0.086,0
29703,29704,68,,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2016-02-19,2016-02-23,177.0,94.0,2.0,,2.0,2.0,2,2.0,,5.0,3.0,1.0,2.0,2.0,2.0,2.0,4.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,,2.0,3.0,,3.0,2.0,14.2,41.5,0.91,3.7,5.5,3.0,1.0,6.4,,,,,,2.0,,,,,,,,,,,,,,,7.0,7.0,1.0,2.0,,2.0,2.0,2.0,2.0,,,,,,,,,,2.0,2.0,2,,2.0,,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,2.0,2.0,1.0,,2.0,,1.0,60.0,1.0,32.0,45.0,2.0,,1.0,1.0,1.0,0.4,61.0,5.0,2.0,,,,,,1.0,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,2,1,1.0,2.0,2.0,2016-02-19,2016-02-19,,,,,32.0,3.0,29.0,228.0,3,,,2.0,1.0,2.0,2.0,2.0,134.0,2.0,,,,,,2.0,109.0,4.0,1.0,2.0,2.0,1.0,,2.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,1.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,,1.0,2.0,,,,,,,,,2.0,,,,,2.0,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,132.0,1.0,,,2.0,,2.0,2.0,2.0,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,2.0,NaT,,,,,,,,0
29991,29992,54,,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2016-06-24,2016-06-28,182.8,75.1,2.0,,1.0,2.0,1,2.0,,2.0,1.0,,2.0,2.0,2.0,3.0,1.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,,2.0,2.0,3.0,,2.0,16.1,45.2,1.06,3.6,5.7,3.0,1.0,6.96,,,,,,1.0,2.0,2.0,,,,,2.0,,,,,,,,5.0,5.0,4.0,2.0,,2.0,2.0,2.0,2.0,,,,,,,,,,2.0,1.0,1,2.0,2.0,,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,1.0,2.0,4.0,,2.0,,1.0,55.0,1.0,32.0,43.0,1.0,27.8,2.0,,,,,,,,,,,,2.0,2.0,,,1.0,,,,1.0,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2.0,2.0,1.0,2.0,,3,2,1.0,2.0,2.0,2016-06-24,2016-06-24,,,,,35.0,3.0,29.0,151.0,3,,,2.0,1.0,2.0,2.0,2.0,113.0,2.0,,,,,,2.0,91.0,2.0,1.0,2.0,2.0,3.0,,,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,2.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,4.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,148.0,1.1,,,2.0,,2.0,2.0,2.0,1.0,1,1,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,2.0,NaT,,,,,,,,0


#### Implementing Row Deletion

In [500]:
working_data = raw_data.copy()[(raw_data['gender'].notnull()) & 
                               (raw_data['heightcm'].notnull()) & 
                               (raw_data['weightkg'].notnull())]

- confirming, deletion performed correctly

In [501]:
print (working_data.shape)
print (working_data['gender'].isnull().sum())
print (working_data['heightcm'].isnull().sum())
print (working_data['weightkg'].isnull().sum())

(42740, 409)
0
0
0


In [502]:
print (working_data['age'].isnull().sum())
print (working_data['surgdt'].isnull().sum())
print (working_data['dischdt'].isnull().sum())

0
0
0


#### Exporting `working_data` to `Excel` - TAKES TOO LONG - NOT SEEM TO BE WORKING

In [73]:
# working_data.to_excel("capstone_data_key_variable_nulls_cleaned.xlsx")

#### Saving `working_data` as a `.pkl` File
- `pd.read_pickle("Filename.pkl")`

In [75]:
# working_data.to_pickle("capstone_data_key_variable_nulls_cleaned.pkl")

### Step 2. Cleaning and Recoding Variables
- selecting `pre-op` features from main dataset

In [503]:
working_data.head(1) 

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStInn,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0


In [504]:
working_data.shape

(42740, 409)

In [505]:
pre_op_df = working_data.copy()[pre_op_features]

In [506]:
pre_op_df.head()

Unnamed: 0,recordId,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,hmo2,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,hitanti,cigsmoker,cigsmokercurr,prcvint,prcab,prvalve,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen,arrhyafib,medasa,medaplt5days,medinotr,medlipid,numdisv,hdefd,vdaort,vdstena,vdinsufm,vdstenm,vdinsuft,incidenc,status
0,1,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0,2011-07-01,1.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,3.0,1.0,1.0,2.0,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,1.0,,1.0,2,2.0,1.0,4.0,1.0,1.0,2.0,4.0,2.0,2.0,1.0,1.0
1,2,65,175.3,79.4,45.0,1.2,,,,55.0,40.0,2011-07-02,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,,1.0,2.0,1,2.0,,,1.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,3.0,2.0,,1.0,2.0,2.0,,,5.0,2.0,,2.0,2.0,2.0,,,,,3.0,2.0,1.0,2,2.0,2.0,4.0,1.0,1.0,2.0,3.0,2.0,2.0,1.0,3.0
2,3,83,162.60001,102.1,29.0,1.2,3.3,6.2,8.6,60.0,36.0,2011-07-04,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,,1.0,2.0,1,2.0,,,1.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,,1.0,2.0,1.0,1.0,2.0,1.0,1.0,,,2.0,3.0,2.0,,1.0,2.0,2.0,,,1.0,2.0,,2.0,2.0,2.0,,,,,3.0,2.0,1.0,1,1.0,1.0,4.0,1.0,1.0,1.0,3.0,2.0,3.0,1.0,2.0
3,4,59,160.0,127.5,35.0,0.9,3.5,7.4,6.4,60.0,35.0,2011-07-05,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,4.0,2.0,2.0,1,2.0,,,1.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,1.0,,2.0,2.0,2.0,,,,,,,,3.0,2.0,,2.0,,,,,1.0,2.0,,2.0,2.0,2.0,,,,,1.0,,2.0,2,2.0,1.0,2.0,1.0,2.0,,4.0,2.0,2.0,1.0,1.0
4,5,72,160.0,64.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,2011-07-06,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,,1.0,2.0,1,2.0,,,1.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,3.0,2.0,,2.0,,,,,5.0,2.0,,2.0,2.0,2.0,,,,,1.0,,1.0,2,2.0,1.0,4.0,1.0,2.0,,,,,1.0,2.0


In [507]:
pre_op_df.shape

(42740, 81)

- making a copy of `pre_op_df` for check if recoding was done correctly

In [508]:
orig_pre = pre_op_df.copy()

In [509]:
orig_pre.shape

(42740, 81)

### Recoding `Y/N/U` Features

- `yes_no_unc` Feature List

In [510]:
yes_no_unc[0:5]

['gender', 'racecaucasian', 'raceblack', 'raceasian', 'racenativeam']

- going to iterate through the list to recode the columns

In [511]:
for column in yes_no_unc:
    pre_op_df[column] = pre_op_df[column].replace({1: 1, 2: 0, 3: 0, np.nan: 0})

- there should be no `NaN`s

In [512]:
pre_op_df[yes_no_unc].isnull().sum()

gender              0
racecaucasian       0
raceblack           0
raceasian           0
racenativeam        0
racnativepacific    0
ethnicity           0
diabetes            0
dyslip              0
dialysis            0
hypertn             0
infendo             0
slpapn              0
liverdis            0
immsupp             0
mediastrad          0
cancer              0
pvd                 0
ThAoDisease         0
syncope             0
unrespstat          0
cvd                 0
cva                 0
cvdtia              0
cvdpcarsurg         0
hitanti             0
cigsmoker           0
cigsmokercurr       0
prcvint             0
prcab               0
prvalve             0
chf                 0
priorhf             0
Arrhythmia          0
arrhyafib           0
medinotr            0
hdefd               0
vdaort              0
vdstena             0
vdstenm             0
dtype: int64

- checking recoding against original in `orig_pre`

In [513]:
feature_name = []
orig_coding = []
new_coding = []

for column in yes_no_unc:
    feature_name.append(column)
    orig_coding.append(sorted(orig_pre[column].unique()))
    new_coding.append(sorted(pre_op_df[column].unique()))

In [514]:
pd.DataFrame(list(zip(feature_name, orig_coding, new_coding)),
             columns = ['feature', 'original_levels', 'new_levels'])

Unnamed: 0,feature,original_levels,new_levels
0,gender,"[1.0, 2.0]","[0.0, 1.0]"
1,racecaucasian,"[1.0, 2.0, nan]","[0.0, 1.0]"
2,raceblack,"[1.0, 2.0, nan]","[0.0, 1.0]"
3,raceasian,"[1.0, 2.0, nan]","[0.0, 1.0]"
4,racenativeam,"[1.0, 2.0, nan]","[0.0, 1.0]"
5,racnativepacific,"[2.0, nan, 1.0]","[0.0, 1.0]"
6,ethnicity,"[1.0, 2.0, nan, 3.0]","[0.0, 1.0]"
7,diabetes,"[1.0, 2.0, nan, 3.0]","[0.0, 1.0]"
8,dyslip,"[1.0, 2.0, nan, 3.0]","[0.0, 1.0]"
9,dialysis,"[1.0, 2.0, nan, 3.0]","[0.0, 1.0]"


In [515]:
len(yes_no_unc)

40

### Recoding `compress_to_two` Features

In [516]:
compress_to_two

['diabctrl',
 'infendty',
 'TobaccoUse',
 'chrlungd',
 'hmo2',
 'ivdrugab',
 'alcohol',
 'cvawhen',
 'carshock',
 'resusc',
 'medasa',
 'medaplt5days',
 'medlipid',
 'numdisv']

- creating `list` of `replacement_dicts`

In [517]:
replacement_dicts = [{1: 0, 2: 0, 3: 0, 4: 1, 5: 0, 6: 0, 7: 0, np.nan: 0}, #diabctrl
                     {1: 0, 2: 1, 3: 0, np.nan: 0}, #infendty
                     {1: 0, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, np.nan: 0}, #TobaccoUse
                     {1: 0, 2: 0, 3: 1, 4: 1, 5: 0, 6: 0, np.nan: 0}, #chrlungd
                     {1: 0, 2: 0, 3: 1, 4: 1, 5: 0, np.nan: 0}, #hmo2
                     {1: 0, 2: 0, 3: 0, 4: 1, 5: 0, np.nan: 0}, #ivdrugab
                     {1: 0, 2: 0, 3: 1, 4: 0, 5: 0, np.nan: 0}, #alcohol
                     {1: 0, 2: 1, 3: 1, 4: 1, np.nan: 0}, #cvawhen
                     {1: 0, 2: 0, 3: 1, 4: 1, np.nan: 0}, #carshock -- RENAME to `carshock24`
                     {1: 0, 2: 0, 3: 1, 4: 1, np.nan: 0}, #resusc -- RENAME to `resusc24`
                     {1: 1, 2: 0, 3: 0, 4: 0, np.nan: 0}, #medasa
                     {1: 1, 2: 0, 3: 0, 4: 0, np.nan: 0}, #medaplt5days
                     {1: 1, 2: 0, 3: 0, 4: 0, np.nan: 0}, #medlipid
                     {1: 0, 2: 1, 3: 1, 4: 1, np.nan: 0}] #numdisv

In [518]:
print (len(compress_to_two))
print (len(replacement_dicts))

14
14


- since we need to rename columns, before recoding, want to keep orignal coding for auditing purposes

In [519]:
pre_op_df.shape

(42740, 81)

In [520]:
pre_op_df['carshock_orig'] = pre_op_df['carshock']
pre_op_df['resusc_orig'] = pre_op_df['resusc']

In [521]:
# added two columns
pre_op_df.shape

(42740, 83)

- now recoding the features in `compress_to_two`

In [522]:
name_replacement_zip = list(zip(compress_to_two, replacement_dicts))

In [523]:
name_replacement_zip

[('diabctrl', {1: 0, 2: 0, 3: 0, 4: 1, 5: 0, 6: 0, 7: 0, nan: 0}),
 ('infendty', {1: 0, 2: 1, 3: 0, nan: 0}),
 ('TobaccoUse', {1: 0, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, nan: 0}),
 ('chrlungd', {1: 0, 2: 0, 3: 1, 4: 1, 5: 0, 6: 0, nan: 0}),
 ('hmo2', {1: 0, 2: 0, 3: 1, 4: 1, 5: 0, nan: 0}),
 ('ivdrugab', {1: 0, 2: 0, 3: 0, 4: 1, 5: 0, nan: 0}),
 ('alcohol', {1: 0, 2: 0, 3: 1, 4: 0, 5: 0, nan: 0}),
 ('cvawhen', {1: 0, 2: 1, 3: 1, 4: 1, nan: 0}),
 ('carshock', {1: 0, 2: 0, 3: 1, 4: 1, nan: 0}),
 ('resusc', {1: 0, 2: 0, 3: 1, 4: 1, nan: 0}),
 ('medasa', {1: 1, 2: 0, 3: 0, 4: 0, nan: 0}),
 ('medaplt5days', {1: 1, 2: 0, 3: 0, 4: 0, nan: 0}),
 ('medlipid', {1: 1, 2: 0, 3: 0, 4: 0, nan: 0}),
 ('numdisv', {1: 0, 2: 1, 3: 1, 4: 1, nan: 0})]

- iterate through `name_replacement_zip` and apply `replacement_dicts` to features in `compress_to_two`

In [524]:
for column, dictionary in name_replacement_zip:
    pre_op_df[column] = pre_op_df[column].replace(dictionary)

- checking recoding against original in `orig_pre`

In [525]:
feature_name = []
orig_coding = []
new_coding = []

for column in compress_to_two:
    feature_name.append(column)
    orig_coding.append(sorted(orig_pre[column].unique()))
    new_coding.append(sorted(pre_op_df[column].unique()))

In [526]:
pd.DataFrame(list(zip(feature_name, orig_coding, new_coding)),
             columns = ['feature', 'original_levels', 'new_levels'])

Unnamed: 0,feature,original_levels,new_levels
0,diabctrl,"[1.0, nan, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]","[0.0, 1.0]"
1,infendty,"[nan, 1.0, 2.0]","[0.0, 1.0]"
2,TobaccoUse,"[nan, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]","[0.0, 1.0]"
3,chrlungd,"[1.0, 2.0, 3.0, 4.0, nan, 5.0, 6.0]","[0.0, 1.0]"
4,hmo2,"[1.0, 2.0, 3.0, 4.0, nan, 5.0]","[0.0, 1.0]"
5,ivdrugab,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","[0.0, 1.0]"
6,alcohol,"[1.0, 2.0, 3.0, nan, 4.0, 5.0]","[0.0, 1.0]"
7,cvawhen,"[nan, 1.0, 2.0, 3.0, 4.0]","[0.0, 1.0]"
8,carshock,"[1.0, 2.0, 3.0, 4.0, nan]","[0.0, 1.0]"
9,resusc,"[1.0, 2.0, 3.0, 4.0, nan]","[0.0, 1.0]"


### Recoding `recode_D` Features - Will Need to Specify a Reference Class when Create Dummies

In [527]:
recode_D

['CardSympTimeOfAdm',
 'CardSympTimeOfSurg',
 'anginalclass',
 'classnyh',
 'vdinsufm',
 'vdinsuft',
 'incidenc',
 'status']

In [528]:
replacement_dicts_alpha = [{1: 'NONE', 
                            2: 'ANGINA', 
                            3: 'ANGINA', 
                            4: 'STEMI',
                            5: 'STEMI', 
                            6: 'NONE', 
                            7: 'NONE', 
                            np.nan: 'NONE'}, #CardSympTimeOfAdm
                           
                           {1: 'NONE', 
                            2: 'ANGINA', 
                            3: 'ANGINA', 
                            4: 'STEMI',
                            5: 'STEMI', 
                            6: 'NONE', 
                            7: 'NONE', 
                            np.nan: 'NONE'}, #CardSympTimeOfSurg
                           
                           {1: 'NONE', 
                            2: 'SLIGHT', 
                            3: 'SLIGHT', 
                            4: 'REST',
                            5: 'REST', 
                            np.nan: 'NONE'}, #anginalclass
                           
                           {1: 'NONE', 
                            2: 'SLIGHT', 
                            3: 'SLIGHT', 
                            4: 'REST',
                            np.nan: 'NONE'}, #classnyh
                           
                           {0: 'NONE', 
                            1: 'TRIVIAL', 
                            2: 'MILD', 
                            3: 'MODERATE',
                            4: 'SEVERE',
                            5: 'NONE',
                            np.nan: 'NONE'}, #vdinsufm
                           
                           {0: 'NONE', 
                            1: 'TRIVIAL', 
                            2: 'MILD', 
                            3: 'MODERATE',
                            4: 'SEVERE',
                            5: 'NONE',
                            np.nan: 'NONE'}, #vdinsuft
                           
                           {1: 'NONE', 
                            2: 'FIRST', 
                            3: 'SECOND', 
                            4: 'THIRD',
                            5: 'FOURTH', 
                            np.nan: 'NONE'}, #incidenc -- NEED TO RENAME incidence_REOP
                           
                           {1: 'NONE', 
                            2: 'URGENT', 
                            3: 'EMERGENCY', 
                            4: 'SALVAGE',
                            np.nan: 'NONE'}] #status

- don't need to create a numeric analog to the `dictionary` above
- if you use `H2O`'s `Decision Trees` versus `Scikit-learn`, you can run categorical features with text levels without having to convert them to numeric
- use the `include_c=True` parameter per the article `Are Categorical Variables Getting Lost in Your Random Forests`
<p>&nbsp;</p>
- going to work on a subset of `pre_op_df`

In [529]:
pre_op_df.head(1)

Unnamed: 0,recordId,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,hmo2,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,hitanti,cigsmoker,cigsmokercurr,prcvint,prcab,prvalve,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen,arrhyafib,medasa,medaplt5days,medinotr,medlipid,numdisv,hdefd,vdaort,vdstena,vdinsufm,vdstenm,vdinsuft,incidenc,status,carshock_orig,resusc_orig
0,1,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0,2011-07-01,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,1.0,1.0,0.0,0.0,0.0,,,4.0,0.0,,0.0,0.0,0.0,0.0,,,,1.0,0.0,1.0,0,0.0,1.0,1.0,1.0,1.0,0.0,4.0,0.0,2.0,1.0,1.0,2.0,2.0


In [530]:
pre_op_df.shape

(42740, 83)

In [531]:
recode_D_df = pre_op_df.copy()[recode_D]

In [532]:
recode_D_df.head()

Unnamed: 0,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,classnyh,vdinsufm,vdinsuft,incidenc,status
0,,,4.0,,4.0,2.0,1.0,1.0
1,,,5.0,,3.0,2.0,1.0,3.0
2,,,1.0,,3.0,3.0,1.0,2.0
3,,,1.0,,4.0,2.0,1.0,1.0
4,,,5.0,,,,1.0,2.0


In [533]:
recode_D_df.shape

(42740, 8)

In [534]:
name_replacement_zip = list(zip(recode_D, replacement_dicts_alpha))

In [535]:
name_replacement_zip

[('CardSympTimeOfAdm',
  {1: 'NONE',
   2: 'ANGINA',
   3: 'ANGINA',
   4: 'STEMI',
   5: 'STEMI',
   6: 'NONE',
   7: 'NONE',
   nan: 'NONE'}),
 ('CardSympTimeOfSurg',
  {1: 'NONE',
   2: 'ANGINA',
   3: 'ANGINA',
   4: 'STEMI',
   5: 'STEMI',
   6: 'NONE',
   7: 'NONE',
   nan: 'NONE'}),
 ('anginalclass',
  {1: 'NONE', 2: 'SLIGHT', 3: 'SLIGHT', 4: 'REST', 5: 'REST', nan: 'NONE'}),
 ('classnyh', {1: 'NONE', 2: 'SLIGHT', 3: 'SLIGHT', 4: 'REST', nan: 'NONE'}),
 ('vdinsufm',
  {0: 'NONE',
   1: 'TRIVIAL',
   2: 'MILD',
   3: 'MODERATE',
   4: 'SEVERE',
   5: 'NONE',
   nan: 'NONE'}),
 ('vdinsuft',
  {0: 'NONE',
   1: 'TRIVIAL',
   2: 'MILD',
   3: 'MODERATE',
   4: 'SEVERE',
   5: 'NONE',
   nan: 'NONE'}),
 ('incidenc',
  {1: 'NONE', 2: 'FIRST', 3: 'SECOND', 4: 'THIRD', 5: 'FOURTH', nan: 'NONE'}),
 ('status',
  {1: 'NONE', 2: 'URGENT', 3: 'EMERGENCY', 4: 'SALVAGE', nan: 'NONE'})]

- iterate through `name_replacement_zip` and apply `replacement_dicts_alpha` to features in `recode_D`

In [536]:
for column, dictionary in name_replacement_zip:
    recode_D_df[column] = recode_D_df[column].replace(dictionary)

- checking recoding against original in `orig_pre`

In [537]:
feature_name = []
orig_coding = []
new_coding = []

for column in recode_D:
    feature_name.append(column)
    orig_coding.append(sorted(orig_pre[column].unique()))
    new_coding.append(sorted(recode_D_df[column].unique()))

In [538]:
pd.DataFrame(list(zip(feature_name, orig_coding, new_coding)),
             columns = ['feature', 'original_levels', 'new_levels'])

Unnamed: 0,feature,original_levels,new_levels
0,CardSympTimeOfAdm,"[nan, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]","[ANGINA, NONE, STEMI]"
1,CardSympTimeOfSurg,"[nan, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]","[ANGINA, NONE, STEMI]"
2,anginalclass,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","[NONE, REST, SLIGHT]"
3,classnyh,"[nan, 1.0, 2.0, 3.0, 4.0]","[NONE, REST, SLIGHT]"
4,vdinsufm,"[0.0, 1.0, 2.0, 3.0, 4.0, nan, 5.0]","[MILD, MODERATE, NONE, SEVERE, TRIVIAL]"
5,vdinsuft,"[2.0, 3.0, nan, 0.0, 1.0, 4.0, 5.0]","[MILD, MODERATE, NONE, SEVERE, TRIVIAL]"
6,incidenc,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","[FIRST, FOURTH, NONE, SECOND, THIRD]"
7,status,"[1.0, 2.0, 3.0, nan, 4.0]","[EMERGENCY, NONE, SALVAGE, URGENT]"


#### Creating Dummy Variables from Recoded Features
- going to use a copy to keep the recoded features for `Decision Trees`
- recoded features: `recode_D_df`
- recoded features with `dummy variables`: `recode_D_Dummies`

In [539]:
recode_D_df.head()

Unnamed: 0,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,classnyh,vdinsufm,vdinsuft,incidenc,status
0,NONE,NONE,REST,NONE,SEVERE,MILD,NONE,NONE
1,NONE,NONE,REST,NONE,MODERATE,MILD,NONE,EMERGENCY
2,NONE,NONE,NONE,NONE,MODERATE,MODERATE,NONE,URGENT
3,NONE,NONE,NONE,NONE,SEVERE,MILD,NONE,NONE
4,NONE,NONE,REST,NONE,NONE,NONE,NONE,URGENT


In [540]:
print (len(recode_D))
print (recode_D_df.shape)
print (pre_op_df.shape)

8
(42740, 8)
(42740, 83)


- renaming `incidenc` to `incidencREOP` for the `recode_D_df` for use in `Decision Trees`

In [541]:
recode_D_df = recode_D_df.rename(columns={'incidenc': 'incidencREOP'})

In [542]:
recode_D_df.head(1)

Unnamed: 0,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status
0,NONE,NONE,REST,NONE,SEVERE,MILD,NONE,NONE


#### Now creating `dummies`

- applying `pd.get_dummies()`

In [543]:
recode_D_Dummies = pd.get_dummies(recode_D_df.copy())

In [544]:
recode_D_Dummies.head()

Unnamed: 0,CardSympTimeOfAdm_ANGINA,CardSympTimeOfAdm_NONE,CardSympTimeOfAdm_STEMI,CardSympTimeOfSurg_ANGINA,CardSympTimeOfSurg_NONE,CardSympTimeOfSurg_STEMI,anginalclass_NONE,anginalclass_REST,anginalclass_SLIGHT,classnyh_NONE,classnyh_REST,classnyh_SLIGHT,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_NONE,vdinsufm_SEVERE,vdinsufm_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_NONE,vdinsuft_SEVERE,vdinsuft_TRIVIAL,incidencREOP_FIRST,incidencREOP_FOURTH,incidencREOP_NONE,incidencREOP_SECOND,incidencREOP_THIRD,status_EMERGENCY,status_NONE,status_SALVAGE,status_URGENT
0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0
1,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0
2,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1
3,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0
4,0,1,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1


In [545]:
print (recode_D_df.shape)
print (recode_D_Dummies.shape)

(42740, 8)
(42740, 31)


- now need to eliminate reference classes

In [546]:
# identifying nan columns to drop
drop_cols = [col for col in recode_D_Dummies.columns if col.endswith('_NONE')]

In [547]:
drop_cols

['CardSympTimeOfAdm_NONE',
 'CardSympTimeOfSurg_NONE',
 'anginalclass_NONE',
 'classnyh_NONE',
 'vdinsufm_NONE',
 'vdinsuft_NONE',
 'incidencREOP_NONE',
 'status_NONE']

In [548]:
len(drop_cols)

8

- dropping the columns

In [549]:
recode_D_Dummies = recode_D_Dummies.drop(drop_cols, axis=1)

In [550]:
recode_D_Dummies.head()

Unnamed: 0,CardSympTimeOfAdm_ANGINA,CardSympTimeOfAdm_STEMI,CardSympTimeOfSurg_ANGINA,CardSympTimeOfSurg_STEMI,anginalclass_REST,anginalclass_SLIGHT,classnyh_REST,classnyh_SLIGHT,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsufm_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,vdinsuft_TRIVIAL,incidencREOP_FIRST,incidencREOP_FOURTH,incidencREOP_SECOND,incidencREOP_THIRD,status_EMERGENCY,status_SALVAGE,status_URGENT
0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [551]:
recode_D_Dummies.shape

(42740, 23)

- reordering the columns for readability

In [552]:
list(recode_D_Dummies.columns.values)

['CardSympTimeOfAdm_ANGINA',
 'CardSympTimeOfAdm_STEMI',
 'CardSympTimeOfSurg_ANGINA',
 'CardSympTimeOfSurg_STEMI',
 'anginalclass_REST',
 'anginalclass_SLIGHT',
 'classnyh_REST',
 'classnyh_SLIGHT',
 'vdinsufm_MILD',
 'vdinsufm_MODERATE',
 'vdinsufm_SEVERE',
 'vdinsufm_TRIVIAL',
 'vdinsuft_MILD',
 'vdinsuft_MODERATE',
 'vdinsuft_SEVERE',
 'vdinsuft_TRIVIAL',
 'incidencREOP_FIRST',
 'incidencREOP_FOURTH',
 'incidencREOP_SECOND',
 'incidencREOP_THIRD',
 'status_EMERGENCY',
 'status_SALVAGE',
 'status_URGENT']

In [553]:
new_col_order = ['CardSympTimeOfAdm_ANGINA',
                 'CardSympTimeOfAdm_STEMI',
                 
                 'CardSympTimeOfSurg_ANGINA',
                 'CardSympTimeOfSurg_STEMI',

                 'anginalclass_SLIGHT',
                 'anginalclass_REST',

                 'classnyh_SLIGHT',
                 'classnyh_REST',

                 'vdinsufm_TRIVIAL',
                 'vdinsufm_MILD',
                 'vdinsufm_MODERATE',
                 'vdinsufm_SEVERE',

                 'vdinsuft_TRIVIAL',
                 'vdinsuft_MILD',
                 'vdinsuft_MODERATE',
                 'vdinsuft_SEVERE',

                 'incidencREOP_FIRST',
                 'incidencREOP_SECOND',
                 'incidencREOP_THIRD',
                 'incidencREOP_FOURTH',

                 'status_URGENT',
                 'status_EMERGENCY',
                 'status_SALVAGE']

In [554]:
len(new_col_order)

23

- reordering columns
- syntax tip: if manually specifically column order instead of passing a list `df[['col_a', 'col_c', 'col_b']]`

In [555]:
recode_D_Dummies = recode_D_Dummies[new_col_order]

In [556]:
recode_D_Dummies.head()

Unnamed: 0,CardSympTimeOfAdm_ANGINA,CardSympTimeOfAdm_STEMI,CardSympTimeOfSurg_ANGINA,CardSympTimeOfSurg_STEMI,anginalclass_SLIGHT,anginalclass_REST,classnyh_SLIGHT,classnyh_REST,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE
0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [557]:
recode_D_Dummies.shape

(42740, 23)

### Recoding `recode_D_P` Features - Do Not Drop Reference Class when Create Dummies

In [558]:
recode_D_P

['cvdcarsten',
 'cvdstenrt',
 'cvdstenlft',
 'ArrhythAFlutter',
 'ArrhythAFib',
 'ArrhythAFibDur',
 'arrhythwhen']

In [559]:
replacement_dicts_alpha = [{1: 'NONE', 
                            2: 'RIGHT', 
                            3: 'LEFT', 
                            4: 'BOTH',
                            np.nan: 'NONE'}, #cvdcarsten
                           
                           {1: '80-99%', 
                            2: '100%', 
                            3: '50%-79%', 
                            4: 'NONE',
                            np.nan: 'NONE'}, #cvdstenrt
                           
                           {1: '80-99%', 
                            2: '100%', 
                            3: '50%-79%', 
                            4: 'NONE',
                            np.nan: 'NONE'}, #cvdstenlft
                           
                           {1: 'NONE', 
                            2: 'REMOTE', 
                            3: 'RECENT', 
                            np.nan: 'NONE'}, #ArrhythAFlutter
                           
                           {1: 'NONE', 
                            2: 'PAROXYSMAL', 
                            3: 'CONTINOUS', 
                            np.nan: 'NONE'}, #ArrhythAFib
                           
                           {1: 'SHORT', 
                            2: 'LONG', 
                            3: 'NONE', 
                            np.nan: 'NONE'}, #ArrhythAFibDur
                           
                           {1: 'SHORT', 
                            2: 'LONG', 
                            3: 'NONE', 
                            np.nan: 'NONE'}] #arrhythwhen

In [560]:
pre_op_df.head(1)

Unnamed: 0,recordId,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,hmo2,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,hitanti,cigsmoker,cigsmokercurr,prcvint,prcab,prvalve,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen,arrhyafib,medasa,medaplt5days,medinotr,medlipid,numdisv,hdefd,vdaort,vdstena,vdinsufm,vdstenm,vdinsuft,incidenc,status,carshock_orig,resusc_orig
0,1,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0,2011-07-01,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,1.0,1.0,0.0,0.0,0.0,,,4.0,0.0,,0.0,0.0,0.0,0.0,,,,1.0,0.0,1.0,0,0.0,1.0,1.0,1.0,1.0,0.0,4.0,0.0,2.0,1.0,1.0,2.0,2.0


In [561]:
pre_op_df.shape

(42740, 83)

In [562]:
recode_D_P_df = pre_op_df.copy()[recode_D_P]

In [563]:
recode_D_P_df.head()

Unnamed: 0,cvdcarsten,cvdstenrt,cvdstenlft,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen
0,,,,,,,1.0
1,,,,,,,3.0
2,1.0,,,,,,3.0
3,,,,,,,1.0
4,,,,,,,1.0


In [564]:
recode_D_P_df.shape

(42740, 7)

In [565]:
name_replacement_zip = list(zip(recode_D_P, replacement_dicts_alpha))

In [566]:
name_replacement_zip

[('cvdcarsten', {1: 'NONE', 2: 'RIGHT', 3: 'LEFT', 4: 'BOTH', nan: 'NONE'}),
 ('cvdstenrt', {1: '80-99%', 2: '100%', 3: '50%-79%', 4: 'NONE', nan: 'NONE'}),
 ('cvdstenlft',
  {1: '80-99%', 2: '100%', 3: '50%-79%', 4: 'NONE', nan: 'NONE'}),
 ('ArrhythAFlutter', {1: 'NONE', 2: 'REMOTE', 3: 'RECENT', nan: 'NONE'}),
 ('ArrhythAFib', {1: 'NONE', 2: 'PAROXYSMAL', 3: 'CONTINOUS', nan: 'NONE'}),
 ('ArrhythAFibDur', {1: 'SHORT', 2: 'LONG', 3: 'NONE', nan: 'NONE'}),
 ('arrhythwhen', {1: 'SHORT', 2: 'LONG', 3: 'NONE', nan: 'NONE'})]

- iterate through `name_replacement_zip` and apply `replacement_dicts_alpha` to features in `recode_D_P`

In [567]:
for column, dictionary in name_replacement_zip:
    recode_D_P_df[column] = recode_D_P_df[column].replace(dictionary)

- checking recoding against original in `orig_pre`

In [568]:
feature_name = []
orig_coding = []
new_coding = []

for column in recode_D_P:
    feature_name.append(column)
    orig_coding.append(sorted(orig_pre[column].unique()))
    new_coding.append(sorted(recode_D_P_df[column].unique()))

In [569]:
pd.DataFrame(list(zip(feature_name, orig_coding, new_coding)),
             columns = ['feature', 'original_levels', 'new_levels'])

Unnamed: 0,feature,original_levels,new_levels
0,cvdcarsten,"[nan, 1.0, 2.0, 3.0, 4.0]","[BOTH, LEFT, NONE, RIGHT]"
1,cvdstenrt,"[nan, 1.0, 2.0, 3.0, 4.0]","[100%, 50%-79%, 80-99%, NONE]"
2,cvdstenlft,"[nan, 1.0, 2.0, 3.0, 4.0]","[100%, 50%-79%, 80-99%, NONE]"
3,ArrhythAFlutter,"[nan, 1.0, 2.0, 3.0]","[NONE, RECENT, REMOTE]"
4,ArrhythAFib,"[nan, 1.0, 2.0, 3.0]","[CONTINOUS, NONE, PAROXYSMAL]"
5,ArrhythAFibDur,"[nan, 1.0, 2.0, 3.0]","[LONG, NONE, SHORT]"
6,arrhythwhen,"[1.0, 2.0, 3.0, nan]","[LONG, NONE, SHORT]"


#### Creating Dummy Variables from Recoded Features
- going to use a copy to keep the recoded features for `Decision Trees`
- recoded features: `recode_D_P_df`
- recoded features with `dummy variables`: `recode_D_P_Dummies`

In [570]:
recode_D_P_df.head()

Unnamed: 0,cvdcarsten,cvdstenrt,cvdstenlft,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen
0,NONE,NONE,NONE,NONE,NONE,NONE,SHORT
1,NONE,NONE,NONE,NONE,NONE,NONE,NONE
2,NONE,NONE,NONE,NONE,NONE,NONE,NONE
3,NONE,NONE,NONE,NONE,NONE,NONE,SHORT
4,NONE,NONE,NONE,NONE,NONE,NONE,SHORT


In [571]:
print (len(recode_D_P))
print (recode_D_P_df.shape)
print (pre_op_df.shape)

7
(42740, 7)
(42740, 83)


#### Now creating `dummies`
- applying `pd.get_dummies()`

In [572]:
recode_D_P_Dummies = pd.get_dummies(recode_D_P_df.copy())

In [573]:
recode_D_P_Dummies.head()

Unnamed: 0,cvdcarsten_BOTH,cvdcarsten_LEFT,cvdcarsten_NONE,cvdcarsten_RIGHT,cvdstenrt_100%,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenrt_NONE,cvdstenlft_100%,cvdstenlft_50%-79%,cvdstenlft_80-99%,cvdstenlft_NONE,ArrhythAFlutter_NONE,ArrhythAFlutter_RECENT,ArrhythAFlutter_REMOTE,ArrhythAFib_CONTINOUS,ArrhythAFib_NONE,ArrhythAFib_PAROXYSMAL,ArrhythAFibDur_LONG,ArrhythAFibDur_NONE,ArrhythAFibDur_SHORT,arrhythwhen_LONG,arrhythwhen_NONE,arrhythwhen_SHORT
0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,1
1,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,1,0,0,1,0
2,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,1,0,0,1,0
3,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,1
4,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,1


In [574]:
recode_D_P_Dummies.shape

(42740, 24)

- reordering columns for readability

In [575]:
recode_D_P_Dummies.columns.tolist()

['cvdcarsten_BOTH',
 'cvdcarsten_LEFT',
 'cvdcarsten_NONE',
 'cvdcarsten_RIGHT',
 'cvdstenrt_100%',
 'cvdstenrt_50%-79%',
 'cvdstenrt_80-99%',
 'cvdstenrt_NONE',
 'cvdstenlft_100%',
 'cvdstenlft_50%-79%',
 'cvdstenlft_80-99%',
 'cvdstenlft_NONE',
 'ArrhythAFlutter_NONE',
 'ArrhythAFlutter_RECENT',
 'ArrhythAFlutter_REMOTE',
 'ArrhythAFib_CONTINOUS',
 'ArrhythAFib_NONE',
 'ArrhythAFib_PAROXYSMAL',
 'ArrhythAFibDur_LONG',
 'ArrhythAFibDur_NONE',
 'ArrhythAFibDur_SHORT',
 'arrhythwhen_LONG',
 'arrhythwhen_NONE',
 'arrhythwhen_SHORT']

In [576]:
new_col_order = ['cvdcarsten_NONE',
                 'cvdcarsten_RIGHT',
                 'cvdcarsten_LEFT', 
                 'cvdcarsten_BOTH',

                 'cvdstenrt_NONE',
                 'cvdstenrt_50%-79%',
                 'cvdstenrt_80-99%',
                 'cvdstenrt_100%',

                 'cvdstenlft_NONE',
                 'cvdstenlft_50%-79%',
                 'cvdstenlft_80-99%',
                 'cvdstenlft_100%',

                 'ArrhythAFlutter_NONE',
                 'ArrhythAFlutter_REMOTE',
                 'ArrhythAFlutter_RECENT',

                 'ArrhythAFib_NONE',
                 'ArrhythAFib_PAROXYSMAL',
                 'ArrhythAFib_CONTINOUS',

                 'ArrhythAFibDur_NONE',
                 'ArrhythAFibDur_SHORT',
                 'ArrhythAFibDur_LONG',

                 'arrhythwhen_NONE',
                 'arrhythwhen_SHORT',
                 'arrhythwhen_LONG']

In [577]:
len(new_col_order)

24

- reordering columns

In [578]:
recode_D_P_Dummies = recode_D_P_Dummies[new_col_order]

In [579]:
recode_D_P_Dummies.head()

Unnamed: 0,cvdcarsten_NONE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_NONE,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_NONE,cvdstenlft_50%-79%,cvdstenlft_80-99%,cvdstenlft_100%,ArrhythAFlutter_NONE,ArrhythAFlutter_REMOTE,ArrhythAFlutter_RECENT,ArrhythAFib_NONE,ArrhythAFib_PAROXYSMAL,ArrhythAFib_CONTINOUS,ArrhythAFibDur_NONE,ArrhythAFibDur_SHORT,ArrhythAFibDur_LONG,arrhythwhen_NONE,arrhythwhen_SHORT,arrhythwhen_LONG
0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0
1,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0
2,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0
3,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0
4,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0


In [580]:
recode_D_P_Dummies.shape

(42740, 24)

### `datetime` Features

In [581]:
date_features

['surgdt']

In [582]:
dates_df = pre_op_df.copy()[date_features]

In [583]:
dates_df.head()

Unnamed: 0,surgdt
0,2011-07-01
1,2011-07-02
2,2011-07-04
3,2011-07-05
4,2011-07-06


In [584]:
dates_df.shape

(42740, 1)

#### Extracting additional features from `surgdt`

In [585]:
def date_components(data, col_labels):
    '''this function extracts date components from datetime objecr and recenters them
       where appropriate
    '''
    dates_frame = data.apply(lambda x: pd.Series([x.year, 
                                                  x.month,
                                                  x.day,
                                                  x.weekday()]))
    dates_frame.columns = col_labels
        
    return dates_frame

In [586]:
surgdt_col_labels = ['surgdt_year',
                     'surgdt_month',
                     'surgdt_DayOfMonth',
                     'surgdt_DayOfWeek']

In [587]:
surgdt_features = date_components(dates_df['surgdt'], surgdt_col_labels)

In [588]:
surgdt_features.head()

Unnamed: 0,surgdt_year,surgdt_month,surgdt_DayOfMonth,surgdt_DayOfWeek
0,2011,7,1,4
1,2011,7,2,5
2,2011,7,4,0
3,2011,7,5,1
4,2011,7,6,2


In [589]:
surgdt_features.shape

(42740, 4)

In [590]:
weekday_dict = {0: "Mon",
                1: "Tues",
                2: "Wed",
                3: "Thurs",
                4: "Fri",
                5: "Sat",
                6: "Sun"}

- going to `dummy` code `DayOfWeek`
- replacing numerical values with text using `df.replace({'col_name': replacement_dict})`

In [591]:
surgdt_features = surgdt_features.replace({'surgdt_DayOfWeek': weekday_dict})

In [592]:
surgdt_features.head()

Unnamed: 0,surgdt_year,surgdt_month,surgdt_DayOfMonth,surgdt_DayOfWeek
0,2011,7,1,Fri
1,2011,7,2,Sat
2,2011,7,4,Mon
3,2011,7,5,Tues
4,2011,7,6,Wed


In [593]:
sorted(surgdt_features['surgdt_month'].unique())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [594]:
month_dict = {1: "Jan",
              2: "Feb",
              3: "Mar",
              4: "Apr",
              5: "May",
              6: "Jun",
              7: "Jul",
              8: "Aug",
              9: "Sep",
              10: "Oct",
              11: "Nov",
              12: "Dec"}

- going to `dummy` code `surgdt_month`
- replacing numerical values with text using `df.replace({'col_name': replacement_dict})`

In [595]:
surgdt_features = surgdt_features.replace({'surgdt_month': month_dict})

In [596]:
surgdt_features.head()

Unnamed: 0,surgdt_year,surgdt_month,surgdt_DayOfMonth,surgdt_DayOfWeek
0,2011,Jul,1,Fri
1,2011,Jul,2,Sat
2,2011,Jul,4,Mon
3,2011,Jul,5,Tues
4,2011,Jul,6,Wed


In [597]:
print (surgdt_features['surgdt_DayOfWeek'].unique())
print (surgdt_features['surgdt_month'].unique())

['Fri' 'Sat' 'Mon' 'Tues' 'Wed' 'Thurs' 'Sun']
['Jul' 'Aug' 'Sep' 'Oct' 'Nov' 'Dec' 'Jan' 'Feb' 'Mar' 'Apr' 'May' 'Jun']


- going to `bin` `surgdt_DayOfMonth`

In [598]:
bins = [0, 10, 20, np.inf]
names = ['Beg', 'Mid', 'End']

In [599]:
surgdt_features['surgdt_PartOfMonth'] = pd.cut(surgdt_features['surgdt_DayOfMonth'],
                                               bins,
                                               labels=names)

In [600]:
surgdt_features.head()

Unnamed: 0,surgdt_year,surgdt_month,surgdt_DayOfMonth,surgdt_DayOfWeek,surgdt_PartOfMonth
0,2011,Jul,1,Fri,Beg
1,2011,Jul,2,Sat,Beg
2,2011,Jul,4,Mon,Beg
3,2011,Jul,5,Tues,Beg
4,2011,Jul,6,Wed,Beg


In [601]:
# testing binning
surgdt_features[surgdt_features['surgdt_DayOfMonth'] == 31].head()

Unnamed: 0,surgdt_year,surgdt_month,surgdt_DayOfMonth,surgdt_DayOfWeek,surgdt_PartOfMonth
69,2011,Aug,31,Wed,End
135,2011,Oct,31,Mon,End
136,2011,Oct,31,Mon,End
210,2012,Jan,31,Tues,End
211,2012,Jan,31,Tues,End


In [602]:
surgdt_features['surgdt_year'].dtypes

dtype('int64')

- to `binarize` new date features, need to convert `surgdt_year` to `category` and drop `surgdt_DayOfMonth`
- before apply `pd.get_dummies` save for `decision trees`

In [603]:
surgdt_features['surgdt_year'] = surgdt_features['surgdt_year'].astype('category')

In [604]:
surgdt_features['surgdt_year'].dtypes

category

In [605]:
surgdt_features = surgdt_features.drop('surgdt_DayOfMonth', axis=1)

In [606]:
surgdt_features.head()

Unnamed: 0,surgdt_year,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth
0,2011,Jul,Fri,Beg
1,2011,Jul,Sat,Beg
2,2011,Jul,Mon,Beg
3,2011,Jul,Tues,Beg
4,2011,Jul,Wed,Beg


In [607]:
surgdt_features.shape

(42740, 4)

- keep `surgdt_features` for `decision trees`

In [608]:
surgdt_dummies = pd.get_dummies(surgdt_features.copy())

In [609]:
surgdt_dummies.head()

Unnamed: 0,surgdt_year_2011,surgdt_year_2012,surgdt_year_2013,surgdt_year_2014,surgdt_year_2015,surgdt_year_2016,surgdt_month_Apr,surgdt_month_Aug,surgdt_month_Dec,surgdt_month_Feb,surgdt_month_Jan,surgdt_month_Jul,surgdt_month_Jun,surgdt_month_Mar,surgdt_month_May,surgdt_month_Nov,surgdt_month_Oct,surgdt_month_Sep,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Wed,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_Mid,surgdt_PartOfMonth_End
0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
2,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
3,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
4,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0


- reordering columns for readability

In [610]:
surgdt_dummies.columns.tolist()

['surgdt_year_2011',
 'surgdt_year_2012',
 'surgdt_year_2013',
 'surgdt_year_2014',
 'surgdt_year_2015',
 'surgdt_year_2016',
 'surgdt_month_Apr',
 'surgdt_month_Aug',
 'surgdt_month_Dec',
 'surgdt_month_Feb',
 'surgdt_month_Jan',
 'surgdt_month_Jul',
 'surgdt_month_Jun',
 'surgdt_month_Mar',
 'surgdt_month_May',
 'surgdt_month_Nov',
 'surgdt_month_Oct',
 'surgdt_month_Sep',
 'surgdt_DayOfWeek_Fri',
 'surgdt_DayOfWeek_Mon',
 'surgdt_DayOfWeek_Sat',
 'surgdt_DayOfWeek_Sun',
 'surgdt_DayOfWeek_Thurs',
 'surgdt_DayOfWeek_Tues',
 'surgdt_DayOfWeek_Wed',
 'surgdt_PartOfMonth_Beg',
 'surgdt_PartOfMonth_Mid',
 'surgdt_PartOfMonth_End']

In [611]:
new_col_order = ['surgdt_year_2011',
                 'surgdt_year_2012',
                 'surgdt_year_2013',
                 'surgdt_year_2014',
                 'surgdt_year_2015',
                 'surgdt_year_2016',

                 'surgdt_month_Jan',
                 'surgdt_month_Feb',
                 'surgdt_month_Mar',
                 'surgdt_month_Apr',
                 'surgdt_month_May',
                 'surgdt_month_Jun',
                 'surgdt_month_Jul',
                 'surgdt_month_Aug',
                 'surgdt_month_Sep',
                 'surgdt_month_Oct',
                 'surgdt_month_Nov',
                 'surgdt_month_Dec',

                 'surgdt_DayOfWeek_Mon',
                 'surgdt_DayOfWeek_Tues',
                 'surgdt_DayOfWeek_Wed',
                 'surgdt_DayOfWeek_Thurs',
                 'surgdt_DayOfWeek_Fri',
                 'surgdt_DayOfWeek_Sat',
                 'surgdt_DayOfWeek_Sun',

                 'surgdt_PartOfMonth_Beg',
                 'surgdt_PartOfMonth_Mid',
                 'surgdt_PartOfMonth_End']

In [612]:
surgdt_dummies = surgdt_dummies[new_col_order]

In [613]:
surgdt_dummies.head()

Unnamed: 0,surgdt_year_2011,surgdt_year_2012,surgdt_year_2013,surgdt_year_2014,surgdt_year_2015,surgdt_year_2016,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jun,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Wed,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_Mid,surgdt_PartOfMonth_End
0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0


#### Need to decide if want to model excluding reference classes
- can use `df = df.drop([drop_cols], axis=1)`

### Additional `datetime` features from `dischdt`

In [614]:
dates_df.head()

Unnamed: 0,surgdt
0,2011-07-01
1,2011-07-02
2,2011-07-04
3,2011-07-05
4,2011-07-06


In [615]:
dischdt_features = dates_df.copy()

In [616]:
dischdt_features['dischdt_DayOfWeek'] = dischdt_features['dischdt'].apply(lambda x: pd.Series(x.weekday()))

KeyError: 'dischdt'

In [617]:
dischdt_features.head()

Unnamed: 0,surgdt
0,2011-07-01
1,2011-07-02
2,2011-07-04
3,2011-07-05
4,2011-07-06


- going to `dummy` code `DayOfWeek`
- replacing numerical values with text using `df.replace({'col_name': replacement_dict})`

In [618]:
dischdt_features = dischdt_features.replace({'dischdt_DayOfWeek': weekday_dict})

In [619]:
dischdt_features.head()

Unnamed: 0,surgdt
0,2011-07-01
1,2011-07-02
2,2011-07-04
3,2011-07-05
4,2011-07-06


In [620]:
dischdt_features['length_stay'] = dischdt_features['dischdt'] - dischdt_features['surgdt']

KeyError: 'dischdt'

In [None]:
dischdt_features.head()

In [621]:
dischdt_features['length_stay'].dtypes

KeyError: 'length_stay'

In [622]:
dischdt_features = dischdt_features.drop(['surgdt', 'dischdt'], axis=1)

ValueError: labels ['dischdt'] not contained in axis

In [623]:
dischdt_features.head()

Unnamed: 0,surgdt
0,2011-07-01
1,2011-07-02
2,2011-07-04
3,2011-07-05
4,2011-07-06


In [198]:
dischdt_features.shape

(42740, 2)

- keep `dischdt_features` for `decision trees`


#### Creating `dischdt_dummies`
- `dischdt_DayOfWeek` should be the only feature binarized by `pd.get_dummies`

In [199]:
dischdt_dummies = pd.get_dummies(dischdt_features.copy())

In [200]:
dischdt_dummies.head()

Unnamed: 0,length_stay,dischdt_DayOfWeek_Fri,dischdt_DayOfWeek_Mon,dischdt_DayOfWeek_Sat,dischdt_DayOfWeek_Sun,dischdt_DayOfWeek_Thurs,dischdt_DayOfWeek_Tues,dischdt_DayOfWeek_Wed
0,5 days,0,0,0,0,0,0,1
1,7 days,0,0,1,0,0,0,0
2,8 days,0,0,0,0,0,1,0
3,4 days,0,0,1,0,0,0,0
4,4 days,0,0,0,1,0,0,0


- reordering columns

In [201]:
dischdt_dummies.columns.tolist()

['length_stay',
 'dischdt_DayOfWeek_Fri',
 'dischdt_DayOfWeek_Mon',
 'dischdt_DayOfWeek_Sat',
 'dischdt_DayOfWeek_Sun',
 'dischdt_DayOfWeek_Thurs',
 'dischdt_DayOfWeek_Tues',
 'dischdt_DayOfWeek_Wed']

In [202]:
new_col_order = ['length_stay',
 
                 'dischdt_DayOfWeek_Mon',
                 'dischdt_DayOfWeek_Tues',
                 'dischdt_DayOfWeek_Wed',
                 'dischdt_DayOfWeek_Thurs',
                 'dischdt_DayOfWeek_Fri',
                 'dischdt_DayOfWeek_Sat',
                 'dischdt_DayOfWeek_Sun']

In [203]:
dischdt_dummies = dischdt_dummies[new_col_order]

In [204]:
dischdt_dummies.head(1)

Unnamed: 0,length_stay,dischdt_DayOfWeek_Mon,dischdt_DayOfWeek_Tues,dischdt_DayOfWeek_Wed,dischdt_DayOfWeek_Thurs,dischdt_DayOfWeek_Fri,dischdt_DayOfWeek_Sat,dischdt_DayOfWeek_Sun
0,5 days,0,0,1,0,0,0,0


### `numerical_features`

In [624]:
numerical_features

['recordId',
 'age',
 'heightcm',
 'weightkg',
 'hct',
 'creatlst',
 'totalbumin',
 'a1clvl',
 'meldscr',
 'hdef',
 'pasys']

In [625]:
numerical_features_df = pre_op_df.copy()[numerical_features]


- going to insert `surgdt` -- purpose will be clear when we have to divide dataset by dates

In [626]:
pre_op_df.head(1)

Unnamed: 0,recordId,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,hmo2,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,hitanti,cigsmoker,cigsmokercurr,prcvint,prcab,prvalve,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen,arrhyafib,medasa,medaplt5days,medinotr,medlipid,numdisv,hdefd,vdaort,vdstena,vdinsufm,vdstenm,vdinsuft,incidenc,status,carshock_orig,resusc_orig
0,1,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0,2011-07-01,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,1.0,1.0,0.0,0.0,0.0,,,4.0,0.0,,0.0,0.0,0.0,0.0,,,,1.0,0.0,1.0,0,0.0,1.0,1.0,1.0,1.0,0.0,4.0,0.0,2.0,1.0,1.0,2.0,2.0


In [627]:
pre_op_df.shape

(42740, 83)

In [628]:
numerical_features.insert(1, 'surgdt')

In [629]:
numerical_features

['recordId',
 'surgdt',
 'age',
 'heightcm',
 'weightkg',
 'hct',
 'creatlst',
 'totalbumin',
 'a1clvl',
 'meldscr',
 'hdef',
 'pasys']

In [630]:
len(numerical_features)

12

- creating `numerical_feature_df`

In [631]:
numerical_features_df['surgdt'] = pre_op_df['surgdt']

In [632]:
numerical_features_df.head()

Unnamed: 0,recordId,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt
0,1,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0,2011-07-01
1,2,65,175.3,79.4,45.0,1.2,,,,55.0,40.0,2011-07-02
2,3,83,162.60001,102.1,29.0,1.2,3.3,6.2,8.6,60.0,36.0,2011-07-04
3,4,59,160.0,127.5,35.0,0.9,3.5,7.4,6.4,60.0,35.0,2011-07-05
4,5,72,160.0,64.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,2011-07-06


In [633]:
numerical_features_df.shape

(42740, 12)

- creating `bmi` numerical feature
- BMI is weight in kilograms (`weightkg`) divided by height in meters squared `(heightcm/100)^2)`

In [634]:
numerical_features_df['bmi'] = numerical_features_df['weightkg'] / np.power((numerical_features_df['heightcm']/100), 
                                                                            2)

In [635]:
numerical_features_df.head()

Unnamed: 0,recordId,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt,bmi
0,1,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0,2011-07-01,36.11111
1,2,65,175.3,79.4,45.0,1.2,,,,55.0,40.0,2011-07-02,25.83787
2,3,83,162.60001,102.1,29.0,1.2,3.3,6.2,8.6,60.0,36.0,2011-07-04,38.61754
3,4,59,160.0,127.5,35.0,0.9,3.5,7.4,6.4,60.0,35.0,2011-07-05,49.80469
4,5,72,160.0,64.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,2011-07-06,25.0


In [636]:
numerical_features_df.shape

(42740, 13)

- reordering columns

In [637]:
numerical_features_df.columns.tolist()

['recordId',
 'age',
 'heightcm',
 'weightkg',
 'hct',
 'creatlst',
 'totalbumin',
 'a1clvl',
 'meldscr',
 'hdef',
 'pasys',
 'surgdt',
 'bmi']

In [638]:
new_col_order = ['recordId',
                 'surgdt',
                 'age',
                 'heightcm',
                 'weightkg',
                 'bmi',
                 'hct',
                 'creatlst',
                 'totalbumin',
                 'a1clvl',
                 'meldscr',
                 'hdef',
                 'pasys']

In [639]:
numerical_features_df = numerical_features_df[new_col_order]

In [640]:
numerical_features_df.head()

Unnamed: 0,recordId,surgdt,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
0,1,2011-07-01,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0
1,2,2011-07-02,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0
2,3,2011-07-04,83,162.60001,102.1,38.61754,29.0,1.2,3.3,6.2,8.6,60.0,36.0
3,4,2011-07-05,59,160.0,127.5,49.80469,35.0,0.9,3.5,7.4,6.4,60.0,35.0
4,5,2011-07-06,72,160.0,64.0,25.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0


In [641]:
numerical_features_df.shape

(42740, 13)

## Assembling the Pre-Op Feature Matricies

- `yes_no_unc_df`

In [642]:
yes_no_unc_df = pre_op_df.copy()[yes_no_unc]

In [643]:
yes_no_unc_df.head()

Unnamed: 0,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,cigsmoker,cigsmokercurr,prcvint,prcab,prvalve,chf,priorhf,Arrhythmia,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [644]:
print (yes_no_unc_df.shape)
print (len(yes_no_unc))

(42740, 40)
40


In [645]:
len(yes_no_unc)

40

- `compress_to_two_df`

In [646]:
compress_to_two_df = pre_op_df.copy()[compress_to_two]

In [647]:
compress_to_two_df.head()

Unnamed: 0,diabctrl,infendty,TobaccoUse,chrlungd,hmo2,ivdrugab,alcohol,cvawhen,carshock,resusc,medasa,medaplt5days,medlipid,numdisv
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0,1.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1,1.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0


In [648]:
compress_to_two_df.shape

(42740, 14)

- need to rename `carshock` and `resusc` to `carshock24` and `resusc24`

In [649]:
compress_to_two_df = compress_to_two_df.rename(columns={'carshock': 'carshock24',
                                                        'resusc': 'resusc24'})

In [650]:
compress_to_two_df.head()

Unnamed: 0,diabctrl,infendty,TobaccoUse,chrlungd,hmo2,ivdrugab,alcohol,cvawhen,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0,1.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1,1.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0


## `pre_op_X_tree`

In [652]:
pre_op_X_tree = pd.concat((numerical_features_df,
                           surgdt_features,
                           #dischdt_features,
                           yes_no_unc_df,
                           compress_to_two_df,
                           recode_D_df,
                           recode_D_P_df,
                          working_data['strokeBin']),
                          axis=1)

In [653]:
pre_op_X_tree.head()

Unnamed: 0,recordId,surgdt,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_year,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,cigsmoker,cigsmokercurr,prcvint,prcab,prvalve,chf,priorhf,Arrhythmia,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,TobaccoUse,chrlungd,hmo2,ivdrugab,alcohol,cvawhen,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status,cvdcarsten,cvdstenrt,cvdstenlft,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen,strokeBin
0,1,2011-07-01,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,2011,Jul,Fri,Beg,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0,1.0,1.0,NONE,NONE,REST,NONE,SEVERE,MILD,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,SHORT,0
1,2,2011-07-02,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,2011,Jul,Sat,Beg,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,1.0,NONE,NONE,REST,NONE,MODERATE,MILD,NONE,EMERGENCY,NONE,NONE,NONE,NONE,NONE,NONE,NONE,0
2,3,2011-07-04,83,162.60001,102.1,38.61754,29.0,1.2,3.3,6.2,8.6,60.0,36.0,2011,Jul,Mon,Beg,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1,1.0,1.0,NONE,NONE,NONE,NONE,MODERATE,MODERATE,NONE,URGENT,NONE,NONE,NONE,NONE,NONE,NONE,NONE,0
3,4,2011-07-05,59,160.0,127.5,49.80469,35.0,0.9,3.5,7.4,6.4,60.0,35.0,2011,Jul,Tues,Beg,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1.0,NONE,NONE,NONE,NONE,SEVERE,MILD,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,SHORT,0
4,5,2011-07-06,72,160.0,64.0,25.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,2011,Jul,Wed,Beg,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0,NONE,NONE,REST,NONE,NONE,NONE,NONE,URGENT,NONE,NONE,NONE,NONE,NONE,NONE,SHORT,0


In [654]:
pre_op_X_tree.shape

(42740, 87)

- reordering columns

In [655]:
pre_op_X_tree.columns.tolist()

['recordId',
 'surgdt',
 'age',
 'heightcm',
 'weightkg',
 'bmi',
 'hct',
 'creatlst',
 'totalbumin',
 'a1clvl',
 'meldscr',
 'hdef',
 'pasys',
 'surgdt_year',
 'surgdt_month',
 'surgdt_DayOfWeek',
 'surgdt_PartOfMonth',
 'gender',
 'racecaucasian',
 'raceblack',
 'raceasian',
 'racenativeam',
 'racnativepacific',
 'ethnicity',
 'diabetes',
 'dyslip',
 'dialysis',
 'hypertn',
 'infendo',
 'slpapn',
 'liverdis',
 'immsupp',
 'mediastrad',
 'cancer',
 'pvd',
 'ThAoDisease',
 'syncope',
 'unrespstat',
 'cvd',
 'cva',
 'cvdtia',
 'cvdpcarsurg',
 'hitanti',
 'cigsmoker',
 'cigsmokercurr',
 'prcvint',
 'prcab',
 'prvalve',
 'chf',
 'priorhf',
 'Arrhythmia',
 'arrhyafib',
 'medinotr',
 'hdefd',
 'vdaort',
 'vdstena',
 'vdstenm',
 'diabctrl',
 'infendty',
 'TobaccoUse',
 'chrlungd',
 'hmo2',
 'ivdrugab',
 'alcohol',
 'cvawhen',
 'carshock24',
 'resusc24',
 'medasa',
 'medaplt5days',
 'medlipid',
 'numdisv',
 'CardSympTimeOfAdm',
 'CardSympTimeOfSurg',
 'anginalclass',
 'classnyh',
 'vdinsufm',

In [656]:
new_col_order = ['recordId',
                 'surgdt',
                 'age',
                 'heightcm',
                 'weightkg',
                 'bmi',
                 'hct',
                 'creatlst',
                 'totalbumin',
                 'a1clvl',
                 'meldscr',
                 'hdef',
                 'pasys',


                 'surgdt_year',
                 'surgdt_month',
                 'surgdt_DayOfWeek',
                 'surgdt_PartOfMonth',
                 #'dischdt_DayOfWeek',
                 #'length_stay',


                 'gender',
                 'racecaucasian',
                 'raceblack',
                 'raceasian',
                 'racenativeam',
                 'racnativepacific',
                 'ethnicity',

                 'diabetes',
                 'diabctrl',

                 'dyslip',
                 'dialysis',
                 'hypertn',

                 'infendo',
                 'infendty',

                 'slpapn',
                 'liverdis',
                 'immsupp',
                 'mediastrad',
                 'cancer',
                 'pvd',
                 'ThAoDisease',
                 'syncope',
                 'unrespstat',
                 'hitanti',


                 'TobaccoUse',
                 'cigsmoker',
                 'cigsmokercurr',
                 'chrlungd',


                 'prcvint',
                 'prcab',
                 'prvalve',
                 'chf',
                 'priorhf',
                 'medinotr',
                 'hdefd',
                 'vdaort',
                 'vdstena',
                 'vdstenm',


                 'hmo2',
                 'ivdrugab',
                 'alcohol',
                 'carshock24',
                 'resusc24',
                 'medasa',
                 'medaplt5days',
                 'medlipid',
                 'numdisv',
                 'CardSympTimeOfAdm',
                 'CardSympTimeOfSurg',
                 'anginalclass',
                 'classnyh',
                 'vdinsufm',
                 'vdinsuft',
                 'incidencREOP',
                 'status',


                 'cvd',
                 'cva',
                 'cvawhen',
                 'cvdtia',
                 'cvdpcarsurg',
                 'cvdcarsten',
                 'cvdstenrt',
                 'cvdstenlft',

                 'Arrhythmia',
                 'arrhyafib',
                 'ArrhythAFlutter',
                 'ArrhythAFib',
                 'ArrhythAFibDur',
                 'arrhythwhen',
                
                'strokeBin']

In [657]:
pre_op_X_tree = pre_op_X_tree[new_col_order]

In [658]:
pre_op_X_tree.head()

Unnamed: 0,recordId,surgdt,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_year,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,hitanti,TobaccoUse,cigsmoker,cigsmokercurr,chrlungd,prcvint,prcab,prvalve,chf,priorhf,medinotr,hdefd,vdaort,vdstena,vdstenm,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status,cvd,cva,cvawhen,cvdtia,cvdpcarsurg,cvdcarsten,cvdstenrt,cvdstenlft,Arrhythmia,arrhyafib,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen,strokeBin
0,1,2011-07-01,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,2011,Jul,Fri,Beg,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1.0,1.0,NONE,NONE,REST,NONE,SEVERE,MILD,NONE,NONE,0.0,0.0,0.0,0.0,0.0,NONE,NONE,NONE,0.0,0.0,NONE,NONE,NONE,SHORT,0
1,2,2011-07-02,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,2011,Jul,Sat,Beg,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,1.0,NONE,NONE,REST,NONE,MODERATE,MILD,NONE,EMERGENCY,0.0,0.0,0.0,0.0,0.0,NONE,NONE,NONE,0.0,0.0,NONE,NONE,NONE,NONE,0
2,3,2011-07-04,83,162.60001,102.1,38.61754,29.0,1.2,3.3,6.2,8.6,60.0,36.0,2011,Jul,Mon,Beg,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1.0,1.0,NONE,NONE,NONE,NONE,MODERATE,MODERATE,NONE,URGENT,1.0,1.0,1.0,1.0,0.0,NONE,NONE,NONE,0.0,0.0,NONE,NONE,NONE,NONE,0
3,4,2011-07-05,59,160.0,127.5,49.80469,35.0,0.9,3.5,7.4,6.4,60.0,35.0,2011,Jul,Tues,Beg,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1.0,NONE,NONE,NONE,NONE,SEVERE,MILD,NONE,NONE,0.0,0.0,0.0,0.0,0.0,NONE,NONE,NONE,0.0,0.0,NONE,NONE,NONE,SHORT,0
4,5,2011-07-06,72,160.0,64.0,25.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,2011,Jul,Wed,Beg,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0,NONE,NONE,REST,NONE,NONE,NONE,NONE,URGENT,0.0,0.0,0.0,0.0,0.0,NONE,NONE,NONE,0.0,0.0,NONE,NONE,NONE,SHORT,0


In [659]:
pre_op_X_tree.shape

(42740, 87)

## `pre_op_X`

In [660]:
pre_op_X = pd.concat((numerical_features_df,
                      surgdt_dummies,
                      #dischdt_dummies,
                      yes_no_unc_df,
                      compress_to_two_df,
                      recode_D_Dummies,
                      recode_D_P_Dummies,
                     working_data['strokeBin']),
                     axis=1)

In [661]:
pre_op_X.head()

Unnamed: 0,recordId,surgdt,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_year_2011,surgdt_year_2012,surgdt_year_2013,surgdt_year_2014,surgdt_year_2015,surgdt_year_2016,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jun,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Wed,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_Mid,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,cigsmoker,cigsmokercurr,prcvint,prcab,prvalve,chf,priorhf,Arrhythmia,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,TobaccoUse,chrlungd,hmo2,ivdrugab,alcohol,cvawhen,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,CardSympTimeOfAdm_ANGINA,CardSympTimeOfAdm_STEMI,CardSympTimeOfSurg_ANGINA,CardSympTimeOfSurg_STEMI,anginalclass_SLIGHT,anginalclass_REST,classnyh_SLIGHT,classnyh_REST,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvdcarsten_NONE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_NONE,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_NONE,cvdstenlft_50%-79%,cvdstenlft_80-99%,cvdstenlft_100%,ArrhythAFlutter_NONE,ArrhythAFlutter_REMOTE,ArrhythAFlutter_RECENT,ArrhythAFib_NONE,ArrhythAFib_PAROXYSMAL,ArrhythAFib_CONTINOUS,ArrhythAFibDur_NONE,ArrhythAFibDur_SHORT,ArrhythAFibDur_LONG,arrhythwhen_NONE,arrhythwhen_SHORT,arrhythwhen_LONG,strokeBin
0,1,2011-07-01,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0,1.0,1.0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0
1,2,2011-07-02,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,1.0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0
2,3,2011-07-04,83,162.60001,102.1,38.61754,29.0,1.2,3.3,6.2,8.6,60.0,36.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0
3,4,2011-07-05,59,160.0,127.5,49.80469,35.0,0.9,3.5,7.4,6.4,60.0,35.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0
4,5,2011-07-06,72,160.0,64.0,25.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0


In [662]:
pre_op_X.shape

(42740, 143)

- reordering columns

In [663]:
pre_op_X.columns.tolist()

['recordId',
 'surgdt',
 'age',
 'heightcm',
 'weightkg',
 'bmi',
 'hct',
 'creatlst',
 'totalbumin',
 'a1clvl',
 'meldscr',
 'hdef',
 'pasys',
 'surgdt_year_2011',
 'surgdt_year_2012',
 'surgdt_year_2013',
 'surgdt_year_2014',
 'surgdt_year_2015',
 'surgdt_year_2016',
 'surgdt_month_Jan',
 'surgdt_month_Feb',
 'surgdt_month_Mar',
 'surgdt_month_Apr',
 'surgdt_month_May',
 'surgdt_month_Jun',
 'surgdt_month_Jul',
 'surgdt_month_Aug',
 'surgdt_month_Sep',
 'surgdt_month_Oct',
 'surgdt_month_Nov',
 'surgdt_month_Dec',
 'surgdt_DayOfWeek_Mon',
 'surgdt_DayOfWeek_Tues',
 'surgdt_DayOfWeek_Wed',
 'surgdt_DayOfWeek_Thurs',
 'surgdt_DayOfWeek_Fri',
 'surgdt_DayOfWeek_Sat',
 'surgdt_DayOfWeek_Sun',
 'surgdt_PartOfMonth_Beg',
 'surgdt_PartOfMonth_Mid',
 'surgdt_PartOfMonth_End',
 'gender',
 'racecaucasian',
 'raceblack',
 'raceasian',
 'racenativeam',
 'racnativepacific',
 'ethnicity',
 'diabetes',
 'dyslip',
 'dialysis',
 'hypertn',
 'infendo',
 'slpapn',
 'liverdis',
 'immsupp',
 'mediastrad'

In [664]:
new_col_order = ['recordId',
                 'surgdt',
                 'age',
                 'heightcm',
                 'weightkg',
                 'bmi',
                 'hct',
                 'creatlst',
                 'totalbumin',
                 'a1clvl',
                 'meldscr',
                 'hdef',
                 'pasys',


                 'surgdt_year_2011',
                 'surgdt_year_2012',
                 'surgdt_year_2013',
                 'surgdt_year_2014',
                 'surgdt_year_2015',
                 'surgdt_year_2016',
                 'surgdt_month_Jan',
                 'surgdt_month_Feb',
                 'surgdt_month_Mar',
                 'surgdt_month_Apr',
                 'surgdt_month_May',
                 'surgdt_month_Jun',
                 'surgdt_month_Jul',
                 'surgdt_month_Aug',
                 'surgdt_month_Sep',
                 'surgdt_month_Oct',
                 'surgdt_month_Nov',
                 'surgdt_month_Dec',
                 'surgdt_DayOfWeek_Mon',
                 'surgdt_DayOfWeek_Tues',
                 'surgdt_DayOfWeek_Wed',
                 'surgdt_DayOfWeek_Thurs',
                 'surgdt_DayOfWeek_Fri',
                 'surgdt_DayOfWeek_Sat',
                 'surgdt_DayOfWeek_Sun',
                 'surgdt_PartOfMonth_Beg',
                 'surgdt_PartOfMonth_Mid',
                 'surgdt_PartOfMonth_End',
                 
                 
                 #'length_stay',
                 #'dischdt_DayOfWeek_Mon',
                 #'dischdt_DayOfWeek_Tues',
                 #'dischdt_DayOfWeek_Wed',
                 #'dischdt_DayOfWeek_Thurs',
                 #'dischdt_DayOfWeek_Fri',
                 #'dischdt_DayOfWeek_Sat',
                 #'dischdt_DayOfWeek_Sun',


                 'gender',
                 'racecaucasian',
                 'raceblack',
                 'raceasian',
                 'racenativeam',
                 'racnativepacific',
                 'ethnicity',


                 'diabetes',
                 'diabctrl',

                 'dyslip',
                 'dialysis',
                 'hypertn',

                 'infendo',
                 'infendty',

                 'slpapn',
                 'liverdis',
                 'immsupp',
                 'mediastrad',
                 'cancer',
                 'pvd',
                 'ThAoDisease',
                 'syncope',
                 'unrespstat',
                 'hitanti',
                 
                 
                 'TobaccoUse',
                 'cigsmoker',
                 'cigsmokercurr',
                 'chrlungd',
                 
                 
                 'prcvint',
                 'prcab',
                 'prvalve',
                 'chf',
                 'priorhf',
                 

                 'medinotr',
                 'hdefd',
                 'vdaort',
                 'vdstena',
                 'vdstenm',
                 

                 'hmo2',
                 'ivdrugab',
                 'alcohol',
                 'cvawhen',
                 'carshock24',
                 'resusc24',
                 'medasa',
                 'medaplt5days',
                 'medlipid',
                 'numdisv',


                 'CardSympTimeOfAdm_ANGINA',
                 'CardSympTimeOfAdm_STEMI',
                 'CardSympTimeOfSurg_ANGINA',
                 'CardSympTimeOfSurg_STEMI',
                 'anginalclass_SLIGHT',
                 'anginalclass_REST',
                 'classnyh_SLIGHT',
                 'classnyh_REST',
                 'vdinsufm_TRIVIAL',
                 'vdinsufm_MILD',
                 'vdinsufm_MODERATE',
                 'vdinsufm_SEVERE',
                 'vdinsuft_TRIVIAL',
                 'vdinsuft_MILD',
                 'vdinsuft_MODERATE',
                 'vdinsuft_SEVERE',
                 'incidencREOP_FIRST',
                 'incidencREOP_SECOND',
                 'incidencREOP_THIRD',
                 'incidencREOP_FOURTH',
                 'status_URGENT',
                 'status_EMERGENCY',
                 'status_SALVAGE',


                 'cvd',
                 'cva',
                 'cvdtia',
                 'cvdpcarsurg',
                 'cvdcarsten_NONE',
                 'cvdcarsten_RIGHT',
                 'cvdcarsten_LEFT',
                 'cvdcarsten_BOTH',
                 'cvdstenrt_NONE',
                 'cvdstenrt_50%-79%',
                 'cvdstenrt_80-99%',
                 'cvdstenrt_100%',
                 'cvdstenlft_NONE',
                 'cvdstenlft_50%-79%',
                 'cvdstenlft_80-99%',
                 'cvdstenlft_100%',


                 'Arrhythmia',
                 'arrhyafib',
                 'ArrhythAFlutter_NONE',
                 'ArrhythAFlutter_REMOTE',
                 'ArrhythAFlutter_RECENT',
                 'ArrhythAFib_NONE',
                 'ArrhythAFib_PAROXYSMAL',
                 'ArrhythAFib_CONTINOUS',
                 'ArrhythAFibDur_NONE',
                 'ArrhythAFibDur_SHORT',
                 'ArrhythAFibDur_LONG',
                 'arrhythwhen_NONE',
                 'arrhythwhen_SHORT',
                 'arrhythwhen_LONG',
                
                'strokeBin']

In [665]:
pre_op_X = pre_op_X[new_col_order]

In [666]:
pre_op_df.head(1)

Unnamed: 0,recordId,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,hmo2,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,hitanti,cigsmoker,cigsmokercurr,prcvint,prcab,prvalve,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen,arrhyafib,medasa,medaplt5days,medinotr,medlipid,numdisv,hdefd,vdaort,vdstena,vdinsufm,vdstenm,vdinsuft,incidenc,status,carshock_orig,resusc_orig
0,1,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0,2011-07-01,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,1.0,1.0,0.0,0.0,0.0,,,4.0,0.0,,0.0,0.0,0.0,0.0,,,,1.0,0.0,1.0,0,0.0,1.0,1.0,1.0,1.0,0.0,4.0,0.0,2.0,1.0,1.0,2.0,2.0


### Open Items
- need to split `pre_op_X` and `pre_op_X_tree` by dates in databases 2.73 (2011 - 2014) and 2.81 (2014 - 2016)
- confirm the date
- drop `surgdt` and `recordId` from all subsets before modelling
- drop any columns that only apply to one of the databases - confirm features
- code to drop columns from a dataframe `df = df.drop(['colA', 'colB'], axis=1)`
- pickle the resulting dataframes for fast reuse
- outcome variable `y`

### Splitting `pre_op_X` and `pre_op_X_tree` by Date

#### `pre_op_X`

In [667]:
pre_op_X.shape

(42740, 143)

- `A` corresponds to 2.73 database and `B` corresponds to 2.81

In [668]:
pre_op_X_A = pre_op_X[pre_op_X['surgdt'] < '2014-01-01']

In [669]:
pre_op_X_A.shape

(19756, 143)

In [670]:
print (pre_op_X_A['surgdt'].min())
print (pre_op_X_A['surgdt'].max())

2011-07-01 00:00:00
2013-12-31 00:00:00


In [718]:
pre_op_X_B = pre_op_X[pre_op_X['surgdt'] >= '2014-01-01']

In [672]:
pre_op_X_B.shape

(22984, 143)

In [673]:
print (pre_op_X_B['surgdt'].min())
print (pre_op_X_B['surgdt'].max())

2014-01-01 00:00:00
2016-12-31 00:00:00


In [674]:
print (pre_op_X_A.shape[0] + pre_op_X_B.shape[0])
print (pre_op_X.shape[0])

42740
42740


#### `pre_op_X_tree`

In [675]:
pre_op_X_tree.shape

(42740, 87)

- `A` corresponds to 2.73 database and `B` corresponds to 2.81

In [676]:
pre_op_X_tree_A = pre_op_X_tree[pre_op_X_tree['surgdt'] < '2014-01-01']

In [677]:
pre_op_X_tree_A.shape

(19756, 87)

In [678]:
print (pre_op_X_tree_A['surgdt'].min())
print (pre_op_X_tree_A['surgdt'].max())

2011-07-01 00:00:00
2013-12-31 00:00:00


In [679]:
pre_op_X_tree_B = pre_op_X_tree[pre_op_X_tree['surgdt'] >= '2014-01-01']

In [680]:
pre_op_X_tree_B.shape

(22984, 87)

In [681]:
print (pre_op_X_tree_B['surgdt'].min())
print (pre_op_X_tree_B['surgdt'].max())

2014-01-01 00:00:00
2016-12-31 00:00:00


In [682]:
print (pre_op_X_tree_A.shape[0] + pre_op_X_tree_B.shape[0])
print (pre_op_X.shape[0])

42740
42740


### Pickling Final Files

#### For `decision trees`

`pre_op_X_tree`, `pre_op_X_tree_A` and  `pre_op_X_tree_B`

In [271]:
pre_op_X_tree.to_pickle('../data/pre_op_features_tree.pkl')

In [273]:
pre_op_X_tree_A.to_pickle('../data/pre_op_features_tree_A.pkl')

In [274]:
pre_op_X_tree_B.to_pickle('../data/pre_op_features_tree_B.pkl')

#### For All Other Models

`pre_op_X`, `pre_op_X_A` and  `pre_op_X_B`

In [275]:
pre_op_X.to_pickle('../data/pre_op_features.pkl')

In [276]:
pre_op_X_A.to_pickle('../data/pre_op_features_A.pkl')

In [277]:
pre_op_X_B.to_pickle('../data/pre_op_features_B.pkl')

### Create train-dev-test splits

Below we use stratified shuffled splitting to make sure we get a representative split of our two classes into our three subsets. We also fill in the missing numerical data with their respective medians during this process to avoid data spilling over from test to train or vice versa

In [725]:
def splitTrainTest(df, name, numerical_features):
    
    #create test set
    y = df.pop('strokeBin')
    x = df
    sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for train_index, test_index in sss.split(x, y):
        X_train, X_nonTrain = x.iloc[train_index], x.iloc[test_index]
        y_train, y_nonTrain = y.iloc[train_index], y.iloc[test_index]
    
    #create dev and test set
    sss = StratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=0)
    for dev_index, test_index in sss.split(X_nonTrain, y_nonTrain):
        X_dev, X_test = X_nonTrain.iloc[dev_index], X_nonTrain.iloc[test_index]
        y_dev, y_test = y_nonTrain.iloc[dev_index], y_nonTrain.iloc[test_index]
    
    train = X_train
    #replace missing numerical values with median
    for col in numerical_features:
        train[col].fillna((train[col].median()), inplace = True)
    train['strokeBin'] = y_train
    fileName = "../data/" + name + "_train.pkl"
    train.to_pickle(fileName)
    
    dev = X_dev
    #replace missing numerical values with median
    for col in numerical_features:
        dev[col].fillna((dev[col].median()), inplace = True)
    dev['strokeBin'] = y_dev
    fileName = "../data/" + name + "_dev.pkl"
    dev.to_pickle(fileName)
    
    test = X_test
    #replace missing numerical values with median
    for col in numerical_features:
        test[col].fillna((test[col].median()), inplace = True)
    test['strokeBin'] = y_test
    fileName = "../data/" + name + "_test.pkl"
    test.to_pickle(fileName)

In [726]:
pre_op_X_B = pre_op_X[pre_op_X['surgdt'] >= '2014-01-01']
splitTrainTest(pre_op_X_B,"pre_op_X_B", original_numerical_features)

In [2]:
trainData = pd.read_pickle('../data/pre_op_X_B_train.pkl')

In [728]:
trainData['strokeBin'].sum()

265

In [731]:
def checkForRemainingNulls(df):
    nullCols = 0
    for col in df.columns: 
        numNull = df[col].isnull().sum()
        if numNull >0:
            print(str(col) + " " + str(numNull))
            nullCols += 1
    if(nullCols == 0):
        print("No null values found")

In [732]:
checkForRemainingNulls(trainData)

No null values found


## Oversample the minority class

Let's start with the built in balanced weighting of sci-kit learn's Logistic Regression model

In [3]:
Y_Train = trainData.pop('strokeBin')
trainData.pop('surgdt')
X_Train = trainData

In [743]:
clf = LogisticRegression(random_state=0, solver='lbfgs',class_weight='balanced').fit(X_Train, Y_Train)

In [14]:
devData = pd.read_pickle('../data/pre_op_X_B_dev.pkl')
Y_Dev = devData.pop('strokeBin')
devData.pop('surgdt')
X_Dev = devData

In [748]:
y_pred = clf.predict(X_Dev)

In [754]:
clf.score(X_Dev, Y_Dev) #accuracy

0.59878154917319404

In [751]:
precision_recall_fscore_support(Y_Dev, y_pred, average='macro')

(0.50163457615830809, 0.52771422837648008, 0.38924697211486131, None)

In [752]:
precision_recall_fscore_support(Y_Dev, y_pred, average='micro')

(0.59878154917319404, 0.59878154917319404, 0.59878154917319404, None)

In [753]:
precision_recall_fscore_support(Y_Dev, y_pred, average='weighted')

(0.97300858435227677, 0.59878154917319404, 0.73670698399762413, None)

In [755]:
precision_recall_fscore_support(Y_Dev, y_pred, average='binary')

(0.016322089227421111, 0.45454545454545453, 0.031512605042016806, None)

In [756]:
precision_recall_fscore_support(Y_Dev, y_pred)

(array([ 0.98694706,  0.01632209]),
 array([ 0.600883  ,  0.45454545]),
 array([ 0.74698134,  0.03151261]),
 array([2265,   33]))

Recall is pretty consistently low (<.6) regardless of how we calculate it

Let's try using a random oversampler to increase the amount of data we have

In [42]:
ros = RandomOverSampler(1, random_state = 4) #50-50 split stroke to no-stroke
X_res, y_res = ros.fit_sample(X=X_Train, y=Y_Train)

In [43]:
X_res.shape

(36244, 141)

In [44]:
X_Train.shape

(18387, 141)

In [45]:
Y_Train.sum()

265

In [46]:
y_res.sum()

18122

In [47]:
y_res.sum()/X_res.shape[0]

0.5

In [48]:
clf = LogisticRegression(random_state=0, solver='lbfgs').fit(X_res, y_res)
y_pred = clf.predict(X_Dev)
clf.score(X_Dev, Y_Dev) #accuracy



0.5630983463881636

In [49]:
precision_recall_fscore_support(Y_Dev, y_pred)

(array([0.98762568, 0.01691542]),
 array([0.56379691, 0.51515152]),
 array([0.717819 , 0.0327553]),
 array([2265,   33]))

In [72]:
def predictForSamplingRatio(ratio, xTrain, yTrain, xDev, yDev):
    ros = RandomOverSampler(ratio, random_state = 4) #50-50 split stroke to no-stroke
    X_res, y_res = ros.fit_sample(X=xTrain, y=yTrain)
    clf = LogisticRegression(random_state=0, solver='lbfgs', max_iter=400).fit(X_res, y_res)
    print("Score: " + str(clf.score(xDev, yDev))) #accuracy
    y_pred = clf.predict(xDev)
    return y_pred

In [73]:
yPred = predictForSamplingRatio(1, X_Train, Y_Train, X_Dev, Y_Dev)
precision_recall_fscore_support(Y_Dev, yPred)

Score: 0.6227154046997389




(array([0.98609179, 0.01511628]),
 array([0.62604857, 0.39393939]),
 array([0.76586551, 0.02911534]),
 array([2265,   33]))

In [70]:
yPred = predictForSamplingRatio(0.25, X_Train, Y_Train, X_Dev, Y_Dev)
precision_recall_fscore_support(Y_Dev, yPred)

Score: 0.9765013054830287




(array([0.98550725, 0.        ]),
 array([0.99072848, 0.        ]),
 array([0.98811096, 0.        ]),
 array([2265,   33]))

In [71]:
yPred = predictForSamplingRatio(0.75, X_Train, Y_Train, X_Dev, Y_Dev)
precision_recall_fscore_support(Y_Dev, yPred)

Score: 0.8137510879025239


(array([0.9872679 , 0.02179177]),
 array([0.82163355, 0.27272727]),
 array([0.89686747, 0.04035874]),
 array([2265,   33]))

In [74]:
ros = RandomOverSampler('minority', random_state = 4) #50-50 split stroke to no-stroke
X_res, y_res = ros.fit_sample(X=X_Train, y=Y_Train)
clf = LogisticRegression(random_state=0, solver='lbfgs', max_iter=400).fit(X_res, y_res)
print("Score: " + str(clf.score(X_Dev, Y_Dev))) #accuracy
y_pred = clf.predict(X_Dev)
precision_recall_fscore_support(Y_Dev, y_pred)

Score: 0.6227154046997389




(array([0.98609179, 0.01511628]),
 array([0.62604857, 0.39393939]),
 array([0.76586551, 0.02911534]),
 array([2265,   33]))