## Capstone Project
 
### Pre- and Post Operating Features Cleaning and Encoding 

#### - Combined `2.73` and `2.81`

#### - Dropping Reference Class From All Dummies

#### Importing Libraries

In [1]:
%matplotlib inline

# general libraries
import re
import string
import sys
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing date libraries
import datetime as dt
import dateutil.parser as dparser

# scikit-learn libraries for preprocessing
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# scikit-learn libraries for constructing pipelines
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# scikit-learn libraries for clustering and dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.mixture import GaussianMixture

# scikit-learn libraries for evaluation
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# scikit-learn libraries for feature selection
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import RFECV

# scikit-learn libraries for learning
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# saving models
import pickle
from sklearn.externals import joblib

# setting pandas display options
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 10000)
pd.set_option('display.max_colwidth', 100)
pd.set_option('precision', 5)
pd.options.mode.chained_assignment = None



#### Directory/File Structure

In [2]:
sys.version

'3.6.9 |Anaconda custom (x86_64)| (default, Jul 30 2019, 13:42:17) \n[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]'

In [4]:
print ('Running pandas version:', pd.__version__)
print ('Running numpy version:', np.__version__)
print ('Running sklearn version:', sklearn.__version__)

Running pandas version: 0.25.2
Running numpy version: 1.17.2
Running sklearn version: 0.21.3


In [5]:
os.getcwd()

'/Users/Shailesh/Desktop/MIDS/Capstone/w210CapstoneProject/notebooks'

In [6]:
os.chdir('../data')

In [6]:
sorted(os.listdir())

['.Capstone - STS risk factor list.xlsx.icloud',
 '.DS_Store',
 '.capstone_STS_risk_factor_features.xlsx.icloud',
 '.capstone_data-version-2.xlsx.icloud',
 '.capstone_data.xlsx.icloud',
 '.capstone_data_binarized_outcome.xlsx.icloud',
 '.capstone_data_filled_in_complication_data.xlsx.icloud',
 '273_vs_281_null_count_by_feature.csv',
 '273_vs_281_null_count_by_feature.xlsm',
 'Capstone - Complication list - complete.xlsx',
 'Capstone_Fall_Shannon_Sept2019_request.csv',
 'POSTOP_ALL_col_names_11_9.pkl',
 'POSTOP_TREE_ALL_col_names_11_9.pkl',
 'POSTOP_categorical_TREE_col_names_11_9.pkl',
 'POSTOP_categorical_col_names_11_9.pkl',
 'POSTOP_numerical_col_names_11_9.pkl',
 'PREOP_ALL_col_names_11_9.pkl',
 'PREOP_TREE_ALL_col_names_11_9.pkl',
 'PREOP_categorical_TREE_col_names_11_9.pkl',
 'PREOP_categorical_col_names_11_9.pkl',
 'PREOP_dataset_10_24.pkl',
 'PREOP_dataset_10_27.pkl',
 'PREOP_dataset_TREE_10_24.pkl',
 'PREOP_dataset_TREE_10_27.pkl',
 'PREOP_numerical_col_names_11_9.pkl',
 'PRE_

#### Loading Dataset

In [3]:
#raw_data = pd.read_pickle('capstone_data_binarized_outcome.pkl')
raw_data = pd.read_excel('/Users/Shailesh/Desktop/MIDS/Capstone/w210CapstoneProject/capstone_data-version-armusproctyp.xlsx')#, sheetname='output')

In [8]:
raw_data.head()

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStInn,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,armusproctype,procTypeCAB,procTypeValve,procTypeValveandCAB
0,0,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,0.014,0.048,9,0,0,1
1,1,65,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-02,2011-07-09,175.3,79.4,2.0,,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,45.0,1.2,,,3.0,1.0,,2.0,,,,,1.0,2.0,2.0,,,,,,,,,,,,2.0,,,5.0,2.0,,2.0,2.0,2.0,,,,,,,,3.0,2.0,,2.0,1.0,2,,1.0,2.0,2,2.0,,,,,,,,,,1.0,2.0,2.0,2.0,4.0,,,,1.0,55.0,,44.0,32.0,1.0,40.0,1.0,2.0,,,,,,,,,,,3.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,10.0,,,,,,,10.0,,,,,,,1.0,3.0,,1.0,2.0,,1,2,1.0,2.0,2.0,2011-07-02,2011-07-03,,,,,32.0,,29.0,,3,,,2.0,1.0,2.0,,2.0,70.0,2.0,,,,,,2.0,40.0,2.0,1.0,2.0,2.0,2.0,,,,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,4.0,1.0,,,,,,,,,,,,,,,,,,,1.0,1.0,3.0,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,1.0,2.0,,,,,1.1,,,2.0,,2.0,2.0,2.0,1.0,2.0,,2.0,,,,,,2.0,2.0,2.0,,2.0,2.0,1.0,,,2.0,,,2.0,2.0,2.0,,,2.0,,,,2.0,2.0,,2.0,2.0,2.0,,2.0,1.0,2.0,,NaT,,,,,,0.017,0.069,2,1,0,0
2,2,83,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-04,2011-07-12,162.60001,102.1,2.0,,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,,1.0,2.0,1.0,1.0,2.0,1.0,1.0,,,2.0,,29.0,1.2,3.3,6.2,3.0,1.0,8.6,2.0,,,,,1.0,2.0,2.0,,,,,,,,,,,,2.0,,,1.0,2.0,,2.0,2.0,2.0,,,,,,,,3.0,2.0,,1.0,1.0,2,,2.0,,1,2.0,,,,,,,,,,2.0,1.0,1.0,2.0,4.0,,,,1.0,60.0,,31.0,50.0,1.0,36.0,1.0,1.0,,1.5,16.0,,,,,,,,3.0,2.0,,,,,,,,,,,3.0,,,,,,,,,,,,,,,,,,,5.0,,,,,,,,,,,,,,,1.0,2.0,,1.0,2.0,,1,2,1.0,2.0,2.0,2011-07-04,2011-07-04,,,,,,,,,1,,,,,,,,,2.0,,,,,,5.0,,1.0,,2.0,2.0,2.0,,,,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,1.0,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,2.0,1.0,,2.0,2.0,,,,,1.4,,,2.0,,2.0,2.0,2.0,1.0,2.0,,2.0,,,,,,1.0,2.0,2.0,,2.0,2.0,1.0,,,2.0,,,2.0,2.0,2.0,,,2.0,,,,2.0,2.0,,2.0,2.0,2.0,,2.0,1.0,2.0,,NaT,,,,,,0.045,0.148,2,1,0,0
3,3,59,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-05,2011-07-09,160.0,127.5,1.0,4.0,2.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,1.0,,2.0,2.0,2.0,,,,,,,,,35.0,0.9,3.5,7.4,3.0,1.0,6.4,2.0,,,,,2.0,,,,,,,,,,,,,,,,,1.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,1.0,2.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,2.0,,,,1.0,60.0,,33.0,51.0,1.0,35.0,2.0,,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-05,2011-07-05,,,,,34.8,,19.0,,3,,,2.0,1.0,2.0,,2.0,73.0,2.0,,,,,,2.0,47.0,3.0,1.0,2.0,2.0,2.0,,,,,1.0,2.0,0.0,0.0,0.0,,,2.0,2.0,1.0,1.0,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,0.0,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,2.0,1.0,2.0,,2.0,,,,,,2.0,2.0,2.0,,2.0,2.0,1.0,,,2.0,,,2.0,2.0,2.0,,,2.0,,,,2.0,2.0,,2.0,2.0,2.0,,2.0,2.0,2.0,,NaT,,,,,,0.013,0.074,9,0,0,1
4,4,72,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-06,2011-07-10,160.0,64.0,2.0,,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,37.0,0.9,3.8,5.7,3.0,1.0,6.4,2.0,,,,,2.0,,,,,,,,,,,,,,,,,5.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,1.0,1.0,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,60.0,,21.0,40.0,1.0,40.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,1.0,2.0,,1.0,2.0,,1,2,1.0,2.0,2.0,2011-07-06,2011-07-06,,,,,34.6,,19.0,,3,,,2.0,1.0,2.0,,2.0,70.0,2.0,,,,,,2.0,40.0,2.0,1.0,2.0,2.0,2.0,,,,,1.0,1.0,0.0,0.0,0.0,,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,1.0,,1.0,,,,,,,,,,,0.0,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,0.8,,,2.0,,2.0,2.0,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,0.016,0.019,2,1,0,0


In [12]:
cabg = raw_data.loc[raw_data['procTypeCAB']==1]
valve = raw_data.loc[raw_data['procTypeValve']==1]
cabvalve = raw_data.loc[raw_data['procTypeValveandCAB']==1]

In [10]:
raw_data.shape

(42746, 412)

In [14]:
cabvalve.drop(['armusproctype', 'procTypeCAB', 'procTypeValve', 'procTypeValveandCAB'], axis=1)
valve.drop(['armusproctype', 'procTypeCAB', 'procTypeValve', 'procTypeValveandCAB'], axis=1)
cabg.drop(['armusproctype', 'procTypeCAB', 'procTypeValve', 'procTypeValveandCAB'], axis=1)

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStInn,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf
1,1,65,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-02,2011-07-09,175.30000,79.4,2.0,,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,45.0,1.20,,,3.0,1.0,,2.0,,,,,1.0,2.0,2.0,,,,,,,,,,,,2.0,,,5.0,2.0,,2.0,2.0,2.0,,,,,,,,3.0,2.0,,2.0,1.0,2,,1.0,2.0,2,2.0,,,,,,,,,,1.0,2.0,2.0,2.0,4.0,,,,1.0,55.0,,44.0,32.0,1.0,40.0,1.0,2.0,,,,,,,,,,,3.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,10.0,,,,,,,10.0,,,,,,,1.0,3.0,,1.0,2.0,,1,2,1.0,2.0,2.0,2011-07-02,2011-07-03,,,,,32.0,,29.0,,3,,,2.0,1.0,2.0,,2.0,70.0,2.0,,,,,,2.0,40.0,2.0,1.0,2.0,2.0,2.0,,,,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,4.0,1.0,,,,,,,,,,,,,,,,,,,1.0,1.0,3.0,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,1.0,2.0,,,,,1.1,,,2.0,,2.0,2.0,2.0,1.0,2.0,,2.0,,,,,,2.0,2.0,2.0,,2.0,2.0,1.0,,,2.0,,,2.0,2.0,2.0,,,2.0,,,,2.0,2.0,,2.0,2.0,2.0,,2.0,1.0,2.0,,NaT,,,,,,0.017,0.069
2,2,83,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-04,2011-07-12,162.60001,102.1,2.0,,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,,1.0,2.0,1.0,1.0,2.0,1.0,1.0,,,2.0,,29.0,1.20,3.3,6.2,3.0,1.0,8.60,2.0,,,,,1.0,2.0,2.0,,,,,,,,,,,,2.0,,,1.0,2.0,,2.0,2.0,2.0,,,,,,,,3.0,2.0,,1.0,1.0,2,,2.0,,1,2.0,,,,,,,,,,2.0,1.0,1.0,2.0,4.0,,,,1.0,60.0,,31.0,50.0,1.0,36.0,1.0,1.0,,1.5,16.0,,,,,,,,3.0,2.0,,,,,,,,,,,3.0,,,,,,,,,,,,,,,,,,,5.0,,,,,,,,,,,,,,,1.0,2.0,,1.0,2.0,,1,2,1.0,2.0,2.0,2011-07-04,2011-07-04,,,,,,,,,1,,,,,,,,,2.0,,,,,,5.0,,1.0,,2.0,2.0,2.0,,,,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,1.0,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,2.0,1.0,,2.0,2.0,,,,,1.4,,,2.0,,2.0,2.0,2.0,1.0,2.0,,2.0,,,,,,1.0,2.0,2.0,,2.0,2.0,1.0,,,2.0,,,2.0,2.0,2.0,,,2.0,,,,2.0,2.0,,2.0,2.0,2.0,,2.0,1.0,2.0,,NaT,,,,,,0.045,0.148
4,4,72,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-06,2011-07-10,160.00000,64.0,2.0,,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,37.0,0.90,3.8,5.7,3.0,1.0,6.40,2.0,,,,,2.0,,,,,,,,,,,,,,,,,5.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,1.0,1.0,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,60.0,,21.0,40.0,1.0,40.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,1.0,2.0,,1.0,2.0,,1,2,1.0,2.0,2.0,2011-07-06,2011-07-06,,,,,34.6,,19.0,,3,,,2.0,1.0,2.0,,2.0,70.0,2.0,,,,,,2.0,40.0,2.0,1.0,2.0,2.0,2.0,,,,,1.0,1.0,0.0,0.0,0.0,,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,1.0,,1.0,,,,,,,,,,,0.0,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,0.8,,,2.0,,2.0,2.0,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,0.016,0.019
5,5,72,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-06,2011-07-15,162.60001,104.5,2.0,,1.0,2.0,1,2.0,,,1.0,,2.0,1.0,1.0,2.0,1.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,40.0,0.90,3.4,5.4,3.0,1.0,6.40,2.0,,,,,2.0,,,,,,,,,,,,,,,,,5.0,1.0,4.0,2.0,2.0,2.0,,,,,,,,1.0,,,1.0,1.0,2,,1.0,1.0,2,2.0,,,,,,,,,,2.0,2.0,2.0,2.0,3.0,,,,1.0,40.0,,,,1.0,40.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,1.0,2.0,,1.0,2.0,,1,2,1.0,2.0,2.0,2011-07-06,2011-07-06,,,,,35.1,,28.0,,3,,,2.0,1.0,2.0,,2.0,65.0,2.0,,,,,,2.0,45.0,2.0,1.0,2.0,2.0,2.0,,,,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,1.0,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,NaT,,,1,2.0,,1.0,1.0,,1.0,2.0,,,,,1.2,,,2.0,,1.0,1.0,2.0,1.0,2.0,,2.0,,,,,,1.0,2.0,2.0,,2.0,2.0,1.0,,,2.0,,,2.0,2.0,2.0,,,2.0,,,,2.0,2.0,,2.0,2.0,2.0,,2.0,1.0,2.0,,NaT,,,,,,0.009,0.021
6,6,57,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-07,2011-07-13,162.60001,114.3,1.0,3.0,1.0,2.0,1,2.0,,,3.0,,2.0,1.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,36.0,1.00,3.6,6.9,3.0,1.0,6.60,1.0,2.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,1.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,60.0,,45.0,31.0,1.0,22.0,2.0,,,,,,,,,,,,2.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,1,2,1.0,2.0,2.0,2011-07-07,2011-07-07,,,,,31.0,,18.0,,3,,,2.0,1.0,2.0,,2.0,95.0,2.0,,,,,,2.0,75.0,4.0,1.0,2.0,2.0,2.0,,,,,1.0,2.0,0.0,0.0,0.0,,,1.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,0.0,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.0,,,2.0,,2.0,2.0,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,0.006,0.023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42740,42740,69,1.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2016-12-12,2016-12-19,167.60001,84.5,1.0,3.0,1.0,2.0,1,2.0,,1.0,6.0,,2.0,2.0,2.0,2.0,2.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,11.7,35.6,0.90,3.4,11.6,3.0,1.0,6.40,,,,,,2.0,,,,,,,,,,,,,,,4.0,4.0,5.0,2.0,,2.0,2.0,2.0,2.0,,,,,,,,,,2.0,1.0,2,,1.0,1.0,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,1.0,2.0,4.0,,2.0,,1.0,38.0,2.0,,,2.0,,1.0,2.0,,,,1.0,,,,,,,2.0,2.0,,,1.0,,,,1.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2.0,1.0,1.0,2.0,,3,2,1.0,2.0,2.0,2016-12-12,2016-12-12,,,,,36.1,3.0,29.0,135.0,1,,,,,,,,,2.0,,,,,,5.0,,1.0,,2.0,2.0,1.0,,2.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,3.0,2.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,1.0,1.0,3.0,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,4.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,94.0,0.8,,,2.0,,2.0,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,NaT,,,,,,0.013,0.027
42741,42741,62,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2016-12-19,2016-12-23,182.89999,98.3,2.0,,1.0,2.0,1,2.0,,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,14.4,43.3,0.90,4.1,5.5,3.0,1.0,6.40,,,,,,2.0,,,,,,,,,,,,,,,4.0,4.0,4.0,2.0,,2.0,2.0,2.0,2.0,,,,,,,,,,1.0,1.0,2,,1.0,1.0,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,1.0,2.0,4.0,50.0,2.0,,1.0,50.0,1.0,33.0,49.0,1.0,33.0,2.0,,,,,,,,,,,,0.0,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2.0,1.0,1.0,2.0,,3,2,1.0,2.0,2.0,2016-12-19,2016-12-19,,,,,34.9,3.0,30.0,133.0,3,,,2.0,1.0,2.0,2.0,2.0,77.0,2.0,,,,,,2.0,66.0,2.0,1.0,2.0,2.0,1.0,,2.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,1.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,4.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,156.0,0.8,,,2.0,,2.0,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,NaT,,,,,,0.007,0.013
42743,42743,66,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2016-12-21,2016-12-27,175.30000,75.3,2.0,,1.0,2.0,1,2.0,,5.0,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,15.5,46.2,0.83,3.8,5.3,3.0,1.1,7.47,,,,,,1.0,2.0,2.0,,,,,2.0,,,,,,,,4.0,4.0,4.0,2.0,,2.0,2.0,2.0,2.0,,,,,,,,,,2.0,1.0,2,,1.0,1.0,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,1.0,1.0,3.0,,2.0,,1.0,58.0,2.0,,,2.0,,2.0,,,,,,,,,,,,5.0,,,,,,,,,,,,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2.0,1.0,1.0,2.0,,3,2,1.0,2.0,2.0,2016-12-21,2016-12-21,,,,,34.6,3.0,41.0,116.0,1,,,,,,,,,2.0,,,,,,5.0,,1.0,,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,6.0,4.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,140.0,0.8,,,2.0,,2.0,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,NaT,,,,,,0.007,0.008
42744,42744,62,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2016-12-30,2017-01-09,165.10001,107.5,2.0,,1.0,2.0,1,2.0,,1.0,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,15.2,46.1,0.77,3.8,5.3,3.0,1.1,7.47,,,,,,2.0,,,,,,,,,,,,,,,3.0,3.0,3.0,2.0,,2.0,2.0,2.0,2.0,,,,,,,,,,2.0,1.0,2,,2.0,,2,2.0,,,2.0,,,,2.0,,,2.0,2.0,1.0,2.0,4.0,70.0,2.0,,1.0,55.0,2.0,,,2.0,,2.0,,,,,,,,,,,,1.0,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2.0,24.0,1.0,2.0,,3,2,1.0,2.0,2.0,2016-12-30,2016-12-30,,,,,35.3,3.0,32.0,166.0,3,,,2.0,1.0,2.0,2.0,2.0,72.0,2.0,,,,,,2.0,54.0,2.0,1.0,2.0,2.0,1.0,,2.0,2.0,2.0,2.0,,,,,4.0,,1.0,2.0,1.0,1.0,1.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,NaT,,,1,2.0,,1.0,1.0,2.0,2.0,2.0,1.0,,2.0,132.0,0.7,,,2.0,,2.0,2.0,2.0,1.0,2.0,,2.0,,,,,,2.0,2.0,2.0,,2.0,2.0,1.0,,,2.0,,,2.0,2.0,2.0,,,2.0,,,,2.0,2.0,,2.0,2.0,2.0,,2.0,1.0,2.0,2.0,NaT,,,,,,0.008,0.012


In [10]:
col_names = raw_data.columns.tolist()

In [16]:
cabg.to_pickle('../data/cabg_raw.pkl')
valve.to_pickle('../data/valve_raw.pkl')

In [11]:
col_names.sort()

In [12]:
col_names[0:5]

['ADEt1', 'ADEt2', 'ADEt3', 'ADLesTAneur', 'ADLesTCoarcNar']

#### Categorical Features
- Working list prior to 10/22/19

#### Pre-Op Categorical Features 
- as of 10/23/19

In [13]:
cat_features = ['gender',
                'racecaucasian',
                'raceblack',
                'raceasian',
                'racenativeam',
                'racnativepacific',
                'ethnicity',

                'diabetes',
                'diabctrl', 
                'dyslip',
                'dialysis',
                'hypertn', 
                'infendo',
                'infendty',

                'TobaccoUse', #combine with `cigsmoker` - PER 10-24
                'cigsmoker',  #combine with `TobaccoUse` - PER 10-24

                'chrlungd', 

                'hmo2', #CHANGING CODING PER 10-26 GUIDE
                'slpapn', 
                'ivdrugab', 
                'alcohol', 
                'liverdis',  
                'immsupp', 
                'mediastrad',  
                'cancer', 
                'pvd',  

                'syncope', 
                'unrespstat',  
                'cvd', 
                'cva', 

                'cvdtia', 

                'cvdcarsten', 

                'cvdstenrt', #H - RECODED PER 10-26 CODING GUIDE
                'cvdstenlft', #H - RECODED PER 10-26 CODING GUIDE
                'cvdpcarsurg', 
                'hitanti',
                'prcvint', 
                'prcab',
                'prvalve',

                'CardSympTimeOfAdm', #H -- BUT ALL NaNs in 2.73 -- DELETED 10-24
                'CardSympTimeOfSurg', #H -- BUT ALL NaNs in 2.73 -- DELETED 10-24

                'anginalclass', # RECODED PER 10-26 CODING GUIDE
                'chf', 
                'classnyh', # RECODED PER 10-26 CODING GUIDE
                'priorhf', 
                'carshock', 
                'resusc', 

                'Arrhythmia', ## -- NO ANALOG IN 2.73 -- ALL NaNs -- DELETED 10-24

                'ArrhythAFlutter', ## does not make sense to combine with 'ArrhythAFib' -- DELETED 10-24
                'ArrhythAFib', ## does not make sense to combine with 'ArrhythAFlutter' -- DELETED 10-24

                'ArrhythAFibDur', #GOING TO DELETE PER 10-26 GUIDE
                'arrhythwhen',    #GOING TO DELETE PER 10-26 GUIDE

                'arrhyafib',

                'medasa', 
                'medaplt5days', 
                'medinotr',
                'medlipid', 
                'numdisv', # CHANGING CODING TO MULTI-LEVEL PER 10-26 CODING GUIDE
                'hdefd',

                'vdaort',
                'vdstena',
                'vdinsufm', 
                'vdstenm', 
                'vdinsuft', 
                'incidenc', 
                'status']

In [14]:
len(cat_features)

66

#### Pre-Op Numerical Features
- as of 10/23/19

In [15]:
num_features = ['age',
                'heightcm',
                'weightkg',
                'hct',
                'creatlst',
                'totalbumin',
                'a1clvl',
                'meldscr',
                'hdef',
                'pasys']

In [16]:
len(num_features)

10

#### Pre-Op Date Features
- as of 10/23/19

In [17]:
date_features = ['surgdt']

In [18]:
len(date_features)

1

#### Outcome and Other Features
- as of 10/23/19

In [19]:
outcome_other = ['recordId', # keeping for now for auditing purposes
                 'predstro', # STS predicted probability of stroke
                 'cnstrokp',
                 'cnstrokttia',
                 #'cnstroktrind', # FOUND TO BE ALL `NaN`
                 'cncomaenceph',
                 'strokeBin'] # adding strokeBin to compare STS model prediction to actual outcome

In [20]:
len(outcome_other)

6

## Data Cleaning

### Step 1. Deleting Rows with `NaN`s in Essential Columns

In [21]:
working_data = raw_data.copy()[(raw_data['gender'].notnull()) & 
                               (raw_data['heightcm'].notnull()) & 
                               (raw_data['weightkg'].notnull())]

In [22]:
working_data.shape

(42740, 409)

- saving `working_data` as a `.pkl` File

In [23]:
# working_data.to_pickle('capstone_data_key_variable_nulls_cleaned_10_23.pkl')

### Step 2. Creating `pre_op_data`

In [24]:
pre_op_cols = num_features + date_features + cat_features + outcome_other

In [25]:
len(pre_op_cols)

83

In [26]:
pre_op_data = working_data.copy()[pre_op_cols]

In [27]:
pre_op_data.head()

Unnamed: 0,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,cigsmoker,chrlungd,hmo2,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen,arrhyafib,medasa,medaplt5days,medinotr,medlipid,numdisv,hdefd,vdaort,vdstena,vdinsufm,vdstenm,vdinsuft,incidenc,status,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin
0,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0,2011-07-01,1.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,1.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,,,,,,,3.0,2.0,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,1.0,,1.0,2,2.0,1.0,4.0,1.0,1.0,2.0,4.0,2.0,2.0,1.0,1.0,1,0.014,2,2,2,0
1,65,175.3,79.4,45.0,1.2,,,,55.0,40.0,2011-07-02,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,,1.0,2.0,1,2.0,,,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,,,,,,,3.0,1.0,2.0,2.0,,,5.0,2.0,,2.0,2.0,2.0,,,,,3.0,2.0,1.0,2,2.0,2.0,4.0,1.0,1.0,2.0,3.0,2.0,2.0,1.0,3.0,2,0.017,2,2,1,0
2,83,162.60001,102.1,29.0,1.2,3.3,6.2,8.6,60.0,36.0,2011-07-04,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,,1.0,2.0,1,2.0,,,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,,,2.0,3.0,1.0,2.0,2.0,,,1.0,2.0,,2.0,2.0,2.0,,,,,3.0,2.0,1.0,1,1.0,1.0,4.0,1.0,1.0,1.0,3.0,2.0,3.0,1.0,2.0,3,0.045,2,2,1,0
3,59,160.0,127.5,35.0,0.9,3.5,7.4,6.4,60.0,35.0,2011-07-05,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,4.0,2.0,2.0,1,2.0,,,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,1.0,2.0,2.0,2.0,,,,,,,3.0,2.0,,,,,1.0,2.0,,2.0,2.0,2.0,,,,,1.0,,2.0,2,2.0,1.0,2.0,1.0,2.0,,4.0,2.0,2.0,1.0,1.0,4,0.013,2,2,1,0
4,72,160.0,64.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,2011-07-06,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,,1.0,2.0,1,2.0,,,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,,,,,,,3.0,2.0,,,,,5.0,2.0,,2.0,2.0,2.0,,,,,1.0,,1.0,2,2.0,1.0,4.0,1.0,2.0,,,,,1.0,2.0,5,0.016,2,2,2,0


In [28]:
pre_op_data.shape

(42740, 83)

- making a copy of `pre_op_df` for check if recoding was done correctly

In [29]:
orig_pre = pre_op_data.copy()

In [30]:
orig_pre.shape, pre_op_data.shape

((42740, 83), (42740, 83))

### Step 3. Recoding Columns per Data Dictionary (as of 10/22/19)

#### - recoding `yes_no_unc` columns

In [31]:
yes_no_unc = ['gender',
              'racecaucasian',
              'raceblack',
              'raceasian',
              'racenativeam',
              'racnativepacific',
              'ethnicity',
              'diabetes',
              'dyslip',
              'dialysis',
              'hypertn',
              'infendo',
              'slpapn',
              'liverdis',
              'immsupp',
              'mediastrad',
              'cancer',
              'pvd',
              
              'syncope',
              'unrespstat',
              'cvd',
              'cva',
              'cvdtia',
              'cvdpcarsurg',
              'hitanti',
              'cigsmoker',
             
              'prcvint',
              'prcab',
              'prvalve',
              'chf',
              'priorhf',
              #'Arrhythmia', ## Removing because all `NaN` for 2.73 - no way to harmonize with `2.81`
              'arrhyafib',
              'medinotr',
              'hdefd',
              'vdaort',
              'vdstena',
              'vdstenm']

In [32]:
len(yes_no_unc)

37

In [33]:
for column in yes_no_unc:
    pre_op_data[column] = pre_op_data[column].replace({1: 1, 2: 0, 3: 0, np.nan: 0})

- there should be no `NaN`s

In [34]:
pre_op_data[yes_no_unc].isnull().sum()

gender              0
racecaucasian       0
raceblack           0
raceasian           0
racenativeam        0
racnativepacific    0
ethnicity           0
diabetes            0
dyslip              0
dialysis            0
hypertn             0
infendo             0
slpapn              0
liverdis            0
immsupp             0
mediastrad          0
cancer              0
pvd                 0
syncope             0
unrespstat          0
cvd                 0
cva                 0
cvdtia              0
cvdpcarsurg         0
hitanti             0
cigsmoker           0
prcvint             0
prcab               0
prvalve             0
chf                 0
priorhf             0
arrhyafib           0
medinotr            0
hdefd               0
vdaort              0
vdstena             0
vdstenm             0
dtype: int64

- checking recoding against original in `orig_pre`

In [35]:
feature_name = []
orig_coding = []
new_coding = []
dtypes = []

for column in yes_no_unc:
    feature_name.append(column)
    orig_coding.append(sorted(orig_pre[column].unique()))
    new_coding.append(sorted(pre_op_data[column].unique()))
    dtypes.append(pre_op_data[column].dtype)

In [36]:
pd.DataFrame(list(zip(feature_name, orig_coding, new_coding, dtypes)),
             columns = ['feature', 'original_levels', 'new_levels', 'data_type'])

Unnamed: 0,feature,original_levels,new_levels,data_type
0,gender,"[1.0, 2.0]","[0.0, 1.0]",float64
1,racecaucasian,"[1.0, 2.0, nan]","[0.0, 1.0]",float64
2,raceblack,"[1.0, 2.0, nan]","[0.0, 1.0]",float64
3,raceasian,"[1.0, 2.0, nan]","[0.0, 1.0]",float64
4,racenativeam,"[1.0, 2.0, nan]","[0.0, 1.0]",float64
5,racnativepacific,"[2.0, nan, 1.0]","[0.0, 1.0]",float64
6,ethnicity,"[1.0, 2.0, nan, 3.0]","[0.0, 1.0]",float64
7,diabetes,"[1.0, 2.0, nan, 3.0]","[0.0, 1.0]",float64
8,dyslip,"[1.0, 2.0, nan, 3.0]","[0.0, 1.0]",float64
9,dialysis,"[1.0, 2.0, nan, 3.0]","[0.0, 1.0]",float64


#### - recoding `compress_to_two` columns
- compressing from > 3 original categories to `Yes`/`No`, `1`/`0`

In [37]:
compress_to_two = ['diabctrl',
                   'infendty',
                   'TobaccoUse',
                   'chrlungd',
                   'hmo2', # going to be recoded per 10-26
                   'ivdrugab',
                   'alcohol',
                   #'cvawhen', # deleted per 10/22 changes - 0.95 correlation to `cva`
                   'carshock',
                   'resusc',
                   'medasa',
                   'medaplt5days',
                   'medlipid']
                   #'numdisv' # recoded from binary variable to multi-level per 10-26 GUIDE

In [38]:
len(compress_to_two)

12

- creating `list` of `replacement_dicts`

In [39]:
replacement_dicts = [{1: 0, 2: 0, 3: 0, 4: 1, 5: 0, 6: 0, 7: 0, np.nan: 0}, #diabctrl
                     {1: 0, 2: 1, 3: 0, np.nan: 0}, #infendty
                     {1: 0, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, np.nan: 0}, #TobaccoUse
                     {1: 0, 2: 0, 3: 1, 4: 1, 5: 0, 6: 0, np.nan: 0}, #chrlungd
                     {1: 1, 2: 0, 3: 1, 4: 1, 5: 0, np.nan: 0}, #hmo2 - CHANGED PER 10-26 GUIDE, 1 now maps to 1
                     {1: 0, 2: 0, 3: 0, 4: 1, 5: 0, np.nan: 0}, #ivdrugab
                     {1: 0, 2: 0, 3: 1, 4: 0, 5: 0, np.nan: 0}, #alcohol
                     #{1: 0, 2: 1, 3: 1, 4: 1, np.nan: 0}, #cvawhen
                     {1: 0, 2: 0, 3: 1, 4: 1, np.nan: 0}, #carshock -- RENAME to `carshock24`
                     {1: 0, 2: 0, 3: 1, 4: 1, np.nan: 0}, #resusc -- RENAME to `resusc24`
                     {1: 1, 2: 0, 3: 0, 4: 0, np.nan: 0}, #medasa
                     {1: 1, 2: 0, 3: 0, 4: 0, np.nan: 0}, #medaplt5days
                     {1: 1, 2: 0, 3: 0, 4: 0, np.nan: 0}] #medlipid
                     #{1: 0, 2: 1, 3: 1, 4: 1, np.nan: 0}] #numdisv - going to go from binary to multi-level

In [40]:
print (len(compress_to_two))
print (len(replacement_dicts))

12
12


- since we need to rename columns, before recoding, want to keep orignal coding for auditing purposes

In [41]:
pre_op_data.shape

(42740, 83)

In [42]:
pre_op_data['carshock_orig'] = pre_op_data['carshock']
pre_op_data['resusc_orig'] = pre_op_data['resusc']

In [43]:
# added two columns
pre_op_data.shape

(42740, 85)

- now recoding the features in `compress_to_two`

In [44]:
name_replacement_zip = list(zip(compress_to_two, replacement_dicts))

- iterate through `name_replacement_zip` and apply `replacement_dicts` to features in `compress_to_two`

In [45]:
for column, dictionary in name_replacement_zip:
    pre_op_data[column] = pre_op_data[column].replace(dictionary)

- checking recoding against original in `orig_pre`

In [46]:
feature_name = []
orig_coding = []
new_coding = []
dtypes = []

for column in compress_to_two:
    feature_name.append(column)
    orig_coding.append(sorted(orig_pre[column].unique()))
    new_coding.append(sorted(pre_op_data[column].unique()))
    dtypes.append(pre_op_data[column].dtype)

In [47]:
pd.DataFrame(list(zip(feature_name, orig_coding, new_coding, dtypes)),
             columns = ['feature', 'original_levels', 'new_levels', 'dtypes'])

Unnamed: 0,feature,original_levels,new_levels,dtypes
0,diabctrl,"[1.0, nan, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]","[0.0, 1.0]",float64
1,infendty,"[nan, 1.0, 2.0]","[0.0, 1.0]",float64
2,TobaccoUse,"[nan, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]","[0.0, 1.0]",float64
3,chrlungd,"[1.0, 2.0, 3.0, 4.0, nan, 5.0, 6.0]","[0.0, 1.0]",float64
4,hmo2,"[1.0, 2.0, 3.0, 4.0, nan, 5.0]","[0.0, 1.0]",float64
5,ivdrugab,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","[0.0, 1.0]",float64
6,alcohol,"[1.0, 2.0, 3.0, nan, 4.0, 5.0]","[0.0, 1.0]",float64
7,carshock,"[1.0, 2.0, 3.0, 4.0, nan]","[0.0, 1.0]",float64
8,resusc,"[1.0, 2.0, 3.0, 4.0, nan]","[0.0, 1.0]",float64
9,medasa,"[1.0, 2.0, 3.0, 4.0, nan]","[0.0, 1.0]",float64


### Combining `TobaccoUse` and `cigsmoker`
- `TobaccoUse` is all `NaN` for `2.73` observations

In [48]:
pre_op_data['TobaccoUse'].value_counts()

0.0    30135
1.0    12605
Name: TobaccoUse, dtype: int64

In [49]:
pre_op_data['cigsmoker'].value_counts()

0.0    36074
1.0     6666
Name: cigsmoker, dtype: int64

- `TobaccoUse` and `cigsmoker` do not overlap so it makes sense to combine

In [50]:
pre_op_data[(pre_op_data['TobaccoUse'] == 1) & (pre_op_data['cigsmoker'] == 1)] 

Unnamed: 0,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,cigsmoker,chrlungd,hmo2,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen,arrhyafib,medasa,medaplt5days,medinotr,medlipid,numdisv,hdefd,vdaort,vdstena,vdinsufm,vdstenm,vdinsuft,incidenc,status,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin,carshock_orig,resusc_orig


#### Creating `Tobacco_smoker_combo`

In [51]:
pre_op_data['Tobacco_Combined'] = pre_op_data['TobaccoUse'] + pre_op_data['cigsmoker']

In [52]:
pre_op_data['Tobacco_Combined'].value_counts()

0.0    23469
1.0    19271
Name: Tobacco_Combined, dtype: int64

#### Editing `yes_no_unc` and `compress_to_two` feature lists to reflect new feature combination - `Tobacco_Combined`

In [53]:
yes_no_unc = ['gender',
              'racecaucasian',
              'raceblack',
              'raceasian',
              'racenativeam',
              'racnativepacific',
              'ethnicity',
              'diabetes',
              'dyslip',
              'dialysis',
              'hypertn',
              'infendo',
              'slpapn',
              'liverdis',
              'immsupp',
              'mediastrad',
              'cancer',
              'pvd',
              
              'syncope',
              'unrespstat',
              'cvd',
              'cva',
              'cvdtia',
              'cvdpcarsurg',
              'hitanti',
              #'cigsmoker', # combined with `TobaccoUse` in `compress_to_two`
             
              'prcvint',
              'prcab',
              'prvalve',
              'chf',
              'priorhf',
              #'Arrhythmia', ## Removing because all `NaN` for 2.73 - no way to harmonize with `2.81`
              'arrhyafib',
              'medinotr',
              'hdefd',
              'vdaort',
              'vdstena',
              'vdstenm']

In [54]:
len(yes_no_unc)

36

In [55]:
compress_to_two = ['diabctrl',
                   'infendty',
                   'Tobacco_Combined',
                   #'TobaccoUse', # combined with `cigsmoker` in `yes_no_unc`
                   'chrlungd',
                   'hmo2',
                   'ivdrugab',
                   'alcohol',
                   #'cvawhen', # deleted per 10/22 changes - 0.95 correlation to `cva`
                   'carshock',
                   'resusc',
                   'medasa',
                   'medaplt5days',
                   'medlipid']
                   # 'numdisv'# recoding from binary to multi-level per 10-26 GUIDE

In [56]:
len(compress_to_two)

12

In [57]:
len(yes_no_unc) + len(compress_to_two)

48

### Recoding `recode_D` Features - Will Need to Specify a Reference Class when Create Dummies

In [58]:
recode_D = ['numdisv', # recoding from binary to multi-level per 10-26 GUIDE
            'anginalclass',
            'classnyh',
            'vdinsufm',
            'vdinsuft',
            'incidenc',
            'status'] # got rid of 'CardSympTimeOfAdm' and 'CardSympTimeOfSurg' b/c not applicable to 2.73

In [59]:
replacement_dicts_alpha = [{1: 'NONE',
                            2: '1_CORONARY',
                            3: '2_CORONARIES',
                            4: '3_CORONARIES',
                            np.nan: 'NONE'}, #numdisv recooded from binary to multi-level per 10-26 GUIDE
                                             #will now result in 4 dummies (3 + 1 reference class)
                                             # +3 net features in ALL vs 10-24, TREE feature count unchanged
    
                           {1: 'NONE', 
                            2: 'STRENUOUS_ACTIVITY', # RECODED FROM `SLIGHT` PER 10-26
                            3: 'SLIGHT_LIMITATION_ACTIVITY', # RECODED FROM `SLIGHT` PER 10-26
                            4: 'MARKED_LIMITATION_ACTIVITY', # RECODED FROM `REST` PER 10-26
                            5: 'ANGINA_AT_REST', # RECODED FROM `REST` PER 10-26
                            np.nan: 'NONE'}, #anginalclass -- will now result in 5 dummies (4 + 1 ref class)
                                             # +2 net features in ALL vs 10-24, TREE feature count unchanged
                           
                           {1: 'NONE', 
                            2: 'SLIGHT_LIMITATION', # RECODED FROM `SLIGHT` PER 10-26 
                            3: 'MARKED_LIMITATION', # RECODED FROM `SLIGHT` PER 10-26 
                            4: 'ANY_ACTIVITY', # RECODED FROM `REST` PER 10-26
                            np.nan: 'NONE'}, #classnyh - will now result in 4 dummies (3 + 1 ref class)
                                             # +1 net features in ALL vs 10-24, TREE feature count unchanged
                           
                           {0: 'NONE', 
                            1: 'TRIVIAL', 
                            2: 'MILD', 
                            3: 'MODERATE',
                            4: 'SEVERE',
                            5: 'NONE',
                            np.nan: 'NONE'}, #vdinsufm
                           
                           {0: 'NONE', 
                            1: 'TRIVIAL', 
                            2: 'MILD', 
                            3: 'MODERATE',
                            4: 'SEVERE',
                            5: 'NONE',
                            np.nan: 'NONE'}, #vdinsuft
                           
                           {1: 'NONE', 
                            2: 'FIRST', 
                            3: 'SECOND', 
                            4: 'THIRD',
                            5: 'FOURTH', 
                            np.nan: 'NONE'}, #incidenc -- NEED TO RENAME incidence_REOP
                           
                           {1: 'NONE', 
                            2: 'URGENT', 
                            3: 'EMERGENCY', 
                            4: 'SALVAGE',
                            np.nan: 'NONE'}] #status

- going to work on a subset of `pre_op_data`

In [60]:
pre_op_data.head(1)

Unnamed: 0,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,cigsmoker,chrlungd,hmo2,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen,arrhyafib,medasa,medaplt5days,medinotr,medlipid,numdisv,hdefd,vdaort,vdstena,vdinsufm,vdstenm,vdinsuft,incidenc,status,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin,carshock_orig,resusc_orig,Tobacco_Combined
0,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0,2011-07-01,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,,,4.0,0.0,,0.0,0.0,0.0,,,,,1.0,0.0,1.0,0,0.0,1.0,4.0,1.0,1.0,0.0,4.0,0.0,2.0,1.0,1.0,1,0.014,2,2,2,0,2.0,2.0,1.0


In [61]:
pre_op_data.shape

(42740, 86)

In [62]:
recode_D_df = pre_op_data.copy()[recode_D]

In [63]:
recode_D_df.shape

(42740, 7)

In [64]:
name_replacement_zip = list(zip(recode_D, replacement_dicts_alpha))

- iterate through `name_replacement_zip` and apply `replacement_dicts_alpha` to features in `recode_D`

In [65]:
for column, dictionary in name_replacement_zip:
    recode_D_df[column] = recode_D_df[column].replace(dictionary)

- checking recoding against original in `orig_pre`

In [66]:
feature_name = []
orig_coding = []
new_coding = []
dtypes = []

for column in recode_D:
    feature_name.append(column)
    orig_coding.append(sorted(orig_pre[column].unique()))
    new_coding.append(sorted(recode_D_df[column].unique()))
    dtypes.append(recode_D_df[column].dtype)

In [67]:
pd.DataFrame(list(zip(feature_name, orig_coding, new_coding, dtypes)),
             columns = ['feature', 'original_levels', 'new_levels', 'dtype'])

Unnamed: 0,feature,original_levels,new_levels,dtype
0,numdisv,"[1.0, 2.0, 3.0, 4.0, nan]","[1_CORONARY, 2_CORONARIES, 3_CORONARIES, NONE]",object
1,anginalclass,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","[ANGINA_AT_REST, MARKED_LIMITATION_ACTIVITY, NONE, SLIGHT_LIMITATION_ACTIVITY, STRENUOUS_ACTIVITY]",object
2,classnyh,"[nan, 1.0, 2.0, 3.0, 4.0]","[ANY_ACTIVITY, MARKED_LIMITATION, NONE, SLIGHT_LIMITATION]",object
3,vdinsufm,"[0.0, 1.0, 2.0, 3.0, 4.0, nan, 5.0]","[MILD, MODERATE, NONE, SEVERE, TRIVIAL]",object
4,vdinsuft,"[2.0, 3.0, nan, 0.0, 1.0, 4.0, 5.0]","[MILD, MODERATE, NONE, SEVERE, TRIVIAL]",object
5,incidenc,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","[FIRST, FOURTH, NONE, SECOND, THIRD]",object
6,status,"[1.0, 2.0, 3.0, nan, 4.0]","[EMERGENCY, NONE, SALVAGE, URGENT]",object


#### Validating `recode_D_df`

In [68]:
recode_D_df.head()

Unnamed: 0,numdisv,anginalclass,classnyh,vdinsufm,vdinsuft,incidenc,status
0,3_CORONARIES,MARKED_LIMITATION_ACTIVITY,NONE,SEVERE,MILD,NONE,NONE
1,3_CORONARIES,ANGINA_AT_REST,NONE,MODERATE,MILD,NONE,EMERGENCY
2,3_CORONARIES,NONE,NONE,MODERATE,MODERATE,NONE,URGENT
3,1_CORONARY,NONE,NONE,SEVERE,MILD,NONE,NONE
4,3_CORONARIES,ANGINA_AT_REST,NONE,NONE,NONE,NONE,URGENT


In [69]:
print (len(recode_D))
print (recode_D_df.shape)
print (pre_op_data.shape)

7
(42740, 7)
(42740, 86)


- renaming `incidenc` to `incidencREOP` for the `recode_D_df` for use in `Decision Trees`

In [70]:
recode_D_df = recode_D_df.rename(columns={'incidenc': 'incidencREOP'})

In [71]:
recode_D_df.head(1)

Unnamed: 0,numdisv,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status
0,3_CORONARIES,MARKED_LIMITATION_ACTIVITY,NONE,SEVERE,MILD,NONE,NONE


### Now creating `recode_D_sklearn` for use in `sklearn` `Decision Trees`
- transform `text` codes to `numeric` for use in `sklearn` ML algos

In [72]:
recode_D_sklearn = recode_D_df.copy()

In [73]:
recode_D_df.shape, recode_D_sklearn.shape

((42740, 7), (42740, 7))

In [74]:
recode_D_sklearn.head(2)

Unnamed: 0,numdisv,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status
0,3_CORONARIES,MARKED_LIMITATION_ACTIVITY,NONE,SEVERE,MILD,NONE,NONE
1,3_CORONARIES,ANGINA_AT_REST,NONE,MODERATE,MILD,NONE,EMERGENCY


- recoding `numdisv`

In [75]:
recode_D_sklearn['numdisv'].value_counts()

3_CORONARIES    25071
2_CORONARIES     7608
NONE             6828
1_CORONARY       3233
Name: numdisv, dtype: int64

In [76]:
recode_D_sklearn = recode_D_sklearn.replace({'numdisv': {'NONE': 0,
                                                         '1_CORONARY': 1,
                                                         '2_CORONARIES': 2,
                                                         '3_CORONARIES': 3}})

In [77]:
recode_D_sklearn['numdisv'].value_counts()

3    25071
2     7608
0     6828
1     3233
Name: numdisv, dtype: int64

- recoding `anginalclass`

In [78]:
recode_D_sklearn['anginalclass'].value_counts()

NONE                          13470
MARKED_LIMITATION_ACTIVITY    11988
ANGINA_AT_REST                 9915
SLIGHT_LIMITATION_ACTIVITY     5806
STRENUOUS_ACTIVITY             1561
Name: anginalclass, dtype: int64

In [79]:
recode_D_sklearn = recode_D_sklearn.replace({'anginalclass': {'NONE': 0, 
                                                              'STRENUOUS_ACTIVITY': 1, 
                                                              'SLIGHT_LIMITATION_ACTIVITY': 2,
                                                              'MARKED_LIMITATION_ACTIVITY': 3, 
                                                              'ANGINA_AT_REST': 4}})

In [80]:
recode_D_sklearn['anginalclass'].value_counts()

0    13470
3    11988
4     9915
2     5806
1     1561
Name: anginalclass, dtype: int64

- recoding `classnyh`

In [81]:
recode_D_sklearn['classnyh'].value_counts()

NONE                 34835
MARKED_LIMITATION     3604
ANY_ACTIVITY          2258
SLIGHT_LIMITATION     2043
Name: classnyh, dtype: int64

In [82]:
recode_D_sklearn = recode_D_sklearn.replace({'classnyh': {'NONE': 0, 
                                                          'SLIGHT_LIMITATION': 1, 
                                                          'MARKED_LIMITATION': 2, 
                                                          'ANY_ACTIVITY': 3}})

In [83]:
recode_D_sklearn['classnyh'].value_counts()

0    34835
2     3604
3     2258
1     2043
Name: classnyh, dtype: int64

- recoding `vdinsufm`

In [84]:
recode_D_sklearn['vdinsufm'].value_counts()

NONE        15876
MILD        10695
TRIVIAL      9276
SEVERE       3528
MODERATE     3365
Name: vdinsufm, dtype: int64

In [85]:
recode_D_sklearn = recode_D_sklearn.replace({'vdinsufm': {'NONE': 0,
                                                          'TRIVIAL': 1,
                                                          'MILD': 2,
                                                          'MODERATE': 3,
                                                          'SEVERE': 4}})

In [86]:
recode_D_sklearn['vdinsufm'].value_counts()

0    15876
2    10695
1     9276
4     3528
3     3365
Name: vdinsufm, dtype: int64

- recoding `vdinsuft`

In [87]:
recode_D_sklearn['vdinsuft'].value_counts()

NONE        18166
TRIVIAL     11418
MILD        10705
MODERATE     2212
SEVERE        239
Name: vdinsuft, dtype: int64

In [88]:
recode_D_sklearn = recode_D_sklearn.replace({'vdinsuft': {'NONE': 0,
                                                          'TRIVIAL': 1,
                                                          'MILD': 2,
                                                          'MODERATE': 3,
                                                          'SEVERE': 4}})

In [89]:
recode_D_sklearn['vdinsuft'].value_counts()

0    18166
1    11418
2    10705
3     2212
4      239
Name: vdinsuft, dtype: int64

- recoding `incidencREOP`

In [90]:
recode_D_sklearn['incidencREOP'].value_counts()

NONE      40408
FIRST      2174
SECOND      130
THIRD        19
FOURTH        9
Name: incidencREOP, dtype: int64

In [91]:
recode_D_sklearn = recode_D_sklearn.replace({'incidencREOP': {'NONE': 0,
                                                              'FIRST': 1,
                                                              'SECOND': 2,
                                                              'THIRD': 3,
                                                              'FOURTH': 4}})

In [92]:
recode_D_sklearn['incidencREOP'].value_counts()

0    40408
1     2174
2      130
3       19
4        9
Name: incidencREOP, dtype: int64

- recoding `status`

In [93]:
recode_D_sklearn['status'].value_counts()

NONE         21016
URGENT       20731
EMERGENCY      966
SALVAGE         27
Name: status, dtype: int64

In [94]:
recode_D_sklearn = recode_D_sklearn.replace({'status': {'NONE': 0,
                                                        'URGENT': 1,
                                                        'EMERGENCY': 2,
                                                        'SALVAGE': 3}})

In [95]:
recode_D_sklearn['status'].value_counts()

0    21016
1    20731
2      966
3       27
Name: status, dtype: int64

- checking `dtypes`

In [96]:
recode_D_sklearn.dtypes

numdisv         int64
anginalclass    int64
classnyh        int64
vdinsufm        int64
vdinsuft        int64
incidencREOP    int64
status          int64
dtype: object

In [97]:
recode_D_sklearn.head()

Unnamed: 0,numdisv,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status
0,3,3,0,4,2,0,0
1,3,4,0,3,2,0,2
2,3,0,0,3,3,0,1
3,1,0,0,4,2,0,0
4,3,4,0,0,0,0,1


In [98]:
pre_op_data.shape, recode_D_df.shape, recode_D_sklearn.shape

((42740, 86), (42740, 7), (42740, 7))

### Creating Dummy Variables from Recoded Features - `recode_D_Dummies`

#### Now creating `dummies`

- applying `pd.get_dummies()`

In [99]:
recode_D_Dummies = pd.get_dummies(recode_D_df.copy())

In [100]:
recode_D_Dummies.head()

Unnamed: 0,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,numdisv_NONE,anginalclass_ANGINA_AT_REST,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_NONE,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_STRENUOUS_ACTIVITY,classnyh_ANY_ACTIVITY,classnyh_MARKED_LIMITATION,classnyh_NONE,classnyh_SLIGHT_LIMITATION,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_NONE,vdinsufm_SEVERE,vdinsufm_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_NONE,vdinsuft_SEVERE,vdinsuft_TRIVIAL,incidencREOP_FIRST,incidencREOP_FOURTH,incidencREOP_NONE,incidencREOP_SECOND,incidencREOP_THIRD,status_EMERGENCY,status_NONE,status_SALVAGE,status_URGENT
0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0
1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0
2,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1
3,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0
4,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1


In [101]:
print (recode_D_df.shape)
print (recode_D_Dummies.shape)

(42740, 7)
(42740, 32)


- now need to eliminate reference classes
- identifying nan columns to drop

In [102]:
drop_cols = [col for col in recode_D_Dummies.columns if col.endswith('_NONE')]

In [103]:
drop_cols

['numdisv_NONE',
 'anginalclass_NONE',
 'classnyh_NONE',
 'vdinsufm_NONE',
 'vdinsuft_NONE',
 'incidencREOP_NONE',
 'status_NONE']

In [104]:
len(drop_cols)

7

- dropping the columns

In [105]:
recode_D_Dummies = recode_D_Dummies.drop(drop_cols, axis=1)

In [106]:
recode_D_Dummies.head()

Unnamed: 0,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_ANGINA_AT_REST,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_STRENUOUS_ACTIVITY,classnyh_ANY_ACTIVITY,classnyh_MARKED_LIMITATION,classnyh_SLIGHT_LIMITATION,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsufm_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,vdinsuft_TRIVIAL,incidencREOP_FIRST,incidencREOP_FOURTH,incidencREOP_SECOND,incidencREOP_THIRD,status_EMERGENCY,status_SALVAGE,status_URGENT
0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1
3,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [107]:
recode_D_Dummies.shape

(42740, 25)

- reordering the columns for readability

In [108]:
new_col_order = ['numdisv_1_CORONARY',
                 'numdisv_2_CORONARIES',
                 'numdisv_3_CORONARIES',
                 
                 'anginalclass_STRENUOUS_ACTIVITY',
                 'anginalclass_SLIGHT_LIMITATION_ACTIVITY',
                 'anginalclass_MARKED_LIMITATION_ACTIVITY',
                 'anginalclass_ANGINA_AT_REST',
                 
                 'classnyh_SLIGHT_LIMITATION',
                 'classnyh_MARKED_LIMITATION',
                 'classnyh_ANY_ACTIVITY',
    
                 'vdinsufm_TRIVIAL',
                 'vdinsufm_MILD',
                 'vdinsufm_MODERATE',
                 'vdinsufm_SEVERE',

                 'vdinsuft_TRIVIAL',
                 'vdinsuft_MILD',
                 'vdinsuft_MODERATE',
                 'vdinsuft_SEVERE',

                 'incidencREOP_FIRST',
                 'incidencREOP_SECOND',
                 'incidencREOP_THIRD',
                 'incidencREOP_FOURTH',

                 'status_URGENT',
                 'status_EMERGENCY',
                 'status_SALVAGE']

In [109]:
len(new_col_order)

25

- reordering columns
- syntax tip: if manually specifically column order instead of passing a list `df[['col_a', 'col_c', 'col_b']]`

In [110]:
recode_D_Dummies = recode_D_Dummies[new_col_order]

In [111]:
recode_D_Dummies.head()

Unnamed: 0,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_STRENUOUS_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_ANGINA_AT_REST,classnyh_SLIGHT_LIMITATION,classnyh_MARKED_LIMITATION,classnyh_ANY_ACTIVITY,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE
0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0
2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [112]:
recode_D_Dummies.shape

(42740, 25)

### Recoding `recode_D_P` Features - Dropping Reference Class when Create Dummies

In [113]:
recode_D_P = ['cvdcarsten',
              'cvdstenrt',   # will be recoded per 10-26 GUIDE
              'cvdstenlft']  # will be recoded per 10-26 GUIDE
              #'arrhythwhen' # prior version combined `ArrhythAFibDur` and `arrhythwhen`
                             # 10-26 GUIDE discarded both features

- did not recode the following features in this iteration due to all `NaN`s in `2.73` and no way to harmonize them between `2.73` and `2.81`
- the following features were discarded as a result as of the 10-26 GUIDE
- `ArrhythAFlutter`
- `ArrhythAFib`
- `ArrhythAFibDur`
- `arrhythwhen`

#### Defining `replacement_dicts` for `recode_D_P` Features

In [114]:
replacement_dicts_alpha = [{1: 'NONE', 
                            2: 'RIGHT', 
                            3: 'LEFT', 
                            4: 'BOTH',
                            np.nan: 'NONE'}, #cvdcarsten
                           
                           {1: '80-99%', 
                            2: '100%', 
                            #3: '50%-79%', # `3` NOW MAPS TO `NONE` PER 10-26, was `50%-79%` in 10-24
                            3: 'NONE', # NEW CODING PER 10-26 GUIDE
                            4: 'NONE',
                            np.nan: 'NONE'}, #cvdstenrt - net change of 1 feature in ALL vs 24, no change TREE
                           
                           {1: '80-99%', 
                            2: '100%', 
                            #3: '50%-79%', # `3` NOW MAPS TO `NONE` PER 10-26, was `50%-79%` in 10-24
                            3: 'NONE', # NEW CODING PER 10-26 GUIDE
                            4: 'NONE',
                            np.nan: 'NONE'}] #cvdstenlft - net change of 1 feature in ALL vs 24, no change TREE
                           
                           #{1: 'SHORT', 
                           #2: 'LONG', 
                           #3: 'NONE', 
                           #0: 'NONE'}] #ArrhythDur_when_Combo - DISCARDED PER 10-26 GUIDE
                                        #reduction in 3 dummies (2 + 1 ref class)
                                        # net -2 features in ALL vs 10-24, TREE version -1

In [115]:
pre_op_data.head(1)

Unnamed: 0,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,cigsmoker,chrlungd,hmo2,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen,arrhyafib,medasa,medaplt5days,medinotr,medlipid,numdisv,hdefd,vdaort,vdstena,vdinsufm,vdstenm,vdinsuft,incidenc,status,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin,carshock_orig,resusc_orig,Tobacco_Combined
0,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0,2011-07-01,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,,,4.0,0.0,,0.0,0.0,0.0,,,,,1.0,0.0,1.0,0,0.0,1.0,4.0,1.0,1.0,0.0,4.0,0.0,2.0,1.0,1.0,1,0.014,2,2,2,0,2.0,2.0,1.0


In [116]:
pre_op_data.shape

(42740, 86)

#### Replacing `arrhythwhen` with `ArrhythDur_when_Combo` in `recode_D_P`

In [117]:
recode_D_P = ['cvdcarsten',
              'cvdstenrt',
              'cvdstenlft']
              #'ArrhythDur_when_Combo'] # DISCARDED PER 10-26 GUIDE

- creating subset of features `recode_D_P`

In [118]:
recode_D_P_df = pre_op_data.copy()[recode_D_P]

In [119]:
recode_D_P_df.head()

Unnamed: 0,cvdcarsten,cvdstenrt,cvdstenlft
0,,,
1,,,
2,1.0,,
3,,,
4,,,


In [120]:
recode_D_P_df.shape

(42740, 3)

- `zip`ping together `recode_D_P` feature list and `replacement_dicts_alpha`

In [121]:
name_replacement_zip = list(zip(recode_D_P, replacement_dicts_alpha))

- iterate through `name_replacement_zip` and apply `replacement_dicts_alpha` to features in `recode_D_P`

In [122]:
for column, dictionary in name_replacement_zip:
    recode_D_P_df[column] = recode_D_P_df[column].replace(dictionary)

#### Validating `recode_D_P_df`

In [123]:
recode_D_P_df.head()

Unnamed: 0,cvdcarsten,cvdstenrt,cvdstenlft
0,NONE,NONE,NONE
1,NONE,NONE,NONE
2,NONE,NONE,NONE
3,NONE,NONE,NONE
4,NONE,NONE,NONE


In [124]:
print (len(recode_D_P))
print (recode_D_P_df.shape)
print (pre_op_data.shape)

3
(42740, 3)
(42740, 86)


### Now creating `recode_D_P_sklearn` for use in `sklearn` `Decision Trees`
- transform `text` codes to `numeric` for use in `sklearn` ML algos

In [125]:
recode_D_P_sklearn = recode_D_P_df.copy()

In [126]:
recode_D_P_df.shape, recode_D_P_sklearn.shape

((42740, 3), (42740, 3))

In [127]:
recode_D_P_sklearn.head(2)

Unnamed: 0,cvdcarsten,cvdstenrt,cvdstenlft
0,NONE,NONE,NONE
1,NONE,NONE,NONE


- recoding `cvdcarsten`

In [128]:
recode_D_P_sklearn['cvdcarsten'].value_counts()

NONE     39666
RIGHT     1116
LEFT      1056
BOTH       902
Name: cvdcarsten, dtype: int64

In [129]:
recode_D_P_sklearn = recode_D_P_sklearn.replace({'cvdcarsten': {'NONE': 0,
                                                                'RIGHT': 1,
                                                                'LEFT': 2,
                                                                'BOTH': 3}})

In [130]:
recode_D_P_sklearn['cvdcarsten'].value_counts()

0    39666
1     1116
2     1056
3      902
Name: cvdcarsten, dtype: int64

- recoding `cvdstenrt`

In [131]:
recode_D_P_sklearn['cvdstenrt'].value_counts()

NONE      42134
80-99%      403
100%        203
Name: cvdstenrt, dtype: int64

In [132]:
recode_D_P_sklearn = recode_D_P_sklearn.replace({'cvdstenrt': {'NONE': 0,
                                                               '80-99%': 1,
                                                               '100%': 2}})

In [133]:
recode_D_P_sklearn['cvdstenrt'].value_counts()

0    42134
1      403
2      203
Name: cvdstenrt, dtype: int64

- recoding `cvdstenlft`

In [134]:
recode_D_P_sklearn['cvdstenlft'].value_counts()

NONE      42209
80-99%      363
100%        168
Name: cvdstenlft, dtype: int64

In [135]:
recode_D_P_sklearn = recode_D_P_sklearn.replace({'cvdstenlft': {'NONE': 0,
                                                                '80-99%': 1,
                                                                '100%': 2}})

In [136]:
recode_D_P_sklearn['cvdstenlft'].value_counts()

0    42209
1      363
2      168
Name: cvdstenlft, dtype: int64

- checking `dtypes`

In [137]:
recode_D_P_sklearn.dtypes

cvdcarsten    int64
cvdstenrt     int64
cvdstenlft    int64
dtype: object

In [138]:
recode_D_P_sklearn.head()

Unnamed: 0,cvdcarsten,cvdstenrt,cvdstenlft
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0


In [139]:
pre_op_data.shape, recode_D_P_df.shape, recode_D_P_sklearn.shape

((42740, 86), (42740, 3), (42740, 3))

### Creating Dummy Variables from Recoded Features - `recode_D_P_Dummies`

#### Now creating `dummies`
- applying `pd.get_dummies()`

In [140]:
recode_D_P_Dummies = pd.get_dummies(recode_D_P_df.copy())

In [141]:
recode_D_P_Dummies.head()

Unnamed: 0,cvdcarsten_BOTH,cvdcarsten_LEFT,cvdcarsten_NONE,cvdcarsten_RIGHT,cvdstenrt_100%,cvdstenrt_80-99%,cvdstenrt_NONE,cvdstenlft_100%,cvdstenlft_80-99%,cvdstenlft_NONE
0,0,0,1,0,0,0,1,0,0,1
1,0,0,1,0,0,0,1,0,0,1
2,0,0,1,0,0,0,1,0,0,1
3,0,0,1,0,0,0,1,0,0,1
4,0,0,1,0,0,0,1,0,0,1


In [142]:
recode_D_P_Dummies.shape

(42740, 10)

#### Getting Rid of Reference Classes
- identifying `NaN` or `NONE` columns to drop

In [143]:
drop_cols = [col for col in recode_D_P_Dummies.columns if col.endswith('_NONE')]

In [144]:
drop_cols

['cvdcarsten_NONE', 'cvdstenrt_NONE', 'cvdstenlft_NONE']

In [145]:
len(drop_cols)

3

In [146]:
recode_D_P_Dummies.shape

(42740, 10)

- dropping the columns

In [147]:
recode_D_P_Dummies = recode_D_P_Dummies.drop(drop_cols, axis=1)

In [148]:
recode_D_P_Dummies.shape

(42740, 7)

- reordering columns for readability

In [149]:
new_col_order = ['cvdcarsten_RIGHT',
                 'cvdcarsten_LEFT', 
                 'cvdcarsten_BOTH',
                 
                 #'cvdstenrt_50%-79%', # RECODED TO `NONE` OR REFERENCE CLASS PER 10-26 GUIDE
                 'cvdstenrt_80-99%',
                 'cvdstenrt_100%',
                 
                 #'cvdstenlft_50%-79%', # RECODED TO `NONE` OR REFERENCE CLASS PER 10-26 GUIDE
                 'cvdstenlft_80-99%',
                 'cvdstenlft_100%']

                 #'ArrhythDur_when_Combo_SHORT', # DISCARDED PER 10-26 GUIDE
                 #'ArrhythDur_when_Combo_LONG']  # DISCARDED PER 10-26 GUIDE

In [150]:
len(new_col_order)

7

- reordering columns

In [151]:
recode_D_P_Dummies = recode_D_P_Dummies[new_col_order]

In [152]:
recode_D_P_Dummies.head()

Unnamed: 0,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0


In [153]:
recode_D_P_Dummies.shape

(42740, 7)

### `datetime` Features

In [154]:
date_features

['surgdt']

In [155]:
dates_df = pre_op_data.copy()[date_features]

In [156]:
dates_df.head()

Unnamed: 0,surgdt
0,2011-07-01
1,2011-07-02
2,2011-07-04
3,2011-07-05
4,2011-07-06


In [157]:
dates_df.shape

(42740, 1)

#### Extracting additional features from `surgdt`

In [158]:
def date_components(data, col_labels):
    '''this function extracts date components from datetime objecr and recenters them
       where appropriate
    '''
    dates_frame = data.apply(lambda x: pd.Series([x.month,
                                                  x.day,
                                                  x.weekday()]))
    dates_frame.columns = col_labels
        
    return dates_frame

In [159]:
surgdt_col_labels = ['surgdt_month',
                     'surgdt_DayOfMonth',
                     'surgdt_DayOfWeek']

In [160]:
surgdt_features = date_components(dates_df['surgdt'], surgdt_col_labels)

In [161]:
surgdt_features.head()

Unnamed: 0,surgdt_month,surgdt_DayOfMonth,surgdt_DayOfWeek
0,7,1,4
1,7,2,5
2,7,4,0
3,7,5,1
4,7,6,2


- checking unique values created by `date_components` function

In [162]:
sorted(surgdt_features['surgdt_month'].unique())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [163]:
sorted(surgdt_features['surgdt_DayOfMonth'].unique())

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31]

- according to the `datetime` documentation `Monday` is coded as `0` and `Sunday` as a `6`

In [164]:
sorted(surgdt_features['surgdt_DayOfWeek'].unique())

[0, 1, 2, 3, 4, 5, 6]

- going to `bin` `surgdt_DayOfMonth`

In [165]:
bins = [0, 10, 20, np.inf]
names = [1, 2, 3]

In [166]:
surgdt_features['surgdt_PartOfMonth'] = pd.cut(surgdt_features['surgdt_DayOfMonth'],
                                               bins,
                                               labels=names)

In [167]:
surgdt_features.head()

Unnamed: 0,surgdt_month,surgdt_DayOfMonth,surgdt_DayOfWeek,surgdt_PartOfMonth
0,7,1,4,1
1,7,2,5,1
2,7,4,0,1
3,7,5,1,1
4,7,6,2,1


In [168]:
surgdt_features.shape

(42740, 4)

- dropping `surgdt_DayOfMonth` since we recoded by binning to create `surgdt_PartOfMonth`

In [169]:
surgdt_features = surgdt_features.drop('surgdt_DayOfMonth', axis=1)

In [170]:
surgdt_features.shape

(42740, 3)

#### Creating `surgdt_features_sklearn` 
- `surgdt_features` is currently in a format that can be used by `sklearn` in `Decision Trees`
- making a copy for use by `sklearn` `Decision Trees`

In [171]:
surgdt_features_sklearn = surgdt_features.copy()

In [172]:
surgdt_features_sklearn.head()

Unnamed: 0,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth
0,7,4,1
1,7,5,1
2,7,0,1
3,7,1,1
4,7,2,1


In [173]:
surgdt_features_sklearn.shape

(42740, 3)

#### Now recoding `surgdt_features` in preparation to use `pd.get_dummies()`

In [174]:
surgdt_features.head(2)

Unnamed: 0,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth
0,7,4,1
1,7,5,1


In [175]:
surgdt_features.shape

(42740, 3)

In [176]:
weekday_dict = {0: "Mon",
                1: "Tues",
                2: "Wed",
                3: "Thurs",
                4: "Fri",
                5: "Sat",
                6: "Sun"}

- going to `dummy` code `DayOfWeek`
- replacing numerical values with text using `df.replace({'col_name': replacement_dict})`

In [177]:
surgdt_features = surgdt_features.replace({'surgdt_DayOfWeek': weekday_dict})

In [178]:
surgdt_features.head()

Unnamed: 0,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth
0,7,Fri,1
1,7,Sat,1
2,7,Mon,1
3,7,Tues,1
4,7,Wed,1


In [179]:
sorted(surgdt_features['surgdt_month'].unique())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [180]:
month_dict = {1: "Jan",
              2: "Feb",
              3: "Mar",
              4: "Apr",
              5: "May",
              6: "Jun",
              7: "Jul",
              8: "Aug",
              9: "Sep",
              10: "Oct",
              11: "Nov",
              12: "Dec"}

- going to `dummy` code `surgdt_month`
- replacing numerical values with text using `df.replace({'col_name': replacement_dict})`

In [181]:
surgdt_features = surgdt_features.replace({'surgdt_month': month_dict})

In [182]:
surgdt_features.head()

Unnamed: 0,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth
0,Jul,Fri,1
1,Jul,Sat,1
2,Jul,Mon,1
3,Jul,Tues,1
4,Jul,Wed,1


In [183]:
print (surgdt_features['surgdt_DayOfWeek'].unique())
print (surgdt_features['surgdt_month'].unique())

['Fri' 'Sat' 'Mon' 'Tues' 'Wed' 'Thurs' 'Sun']
['Jul' 'Aug' 'Sep' 'Oct' 'Nov' 'Dec' 'Jan' 'Feb' 'Mar' 'Apr' 'May' 'Jun']


- recoding `surgdt_PartOfMonth` back to text for `pd.get_dummies()`

In [184]:
surgdt_features = surgdt_features.replace({'surgdt_PartOfMonth': {1: 'Beg',
                                                                  2: 'Mid',
                                                                  3: 'End'}})

In [185]:
print (surgdt_features['surgdt_PartOfMonth'].unique())

['Beg' 'Mid' 'End']


In [186]:
surgdt_features.head()

Unnamed: 0,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth
0,Jul,Fri,Beg
1,Jul,Sat,Beg
2,Jul,Mon,Beg
3,Jul,Tues,Beg
4,Jul,Wed,Beg


In [187]:
surgdt_features.shape

(42740, 3)

#### Applying `pd.get_dummies` to `surgdt_features`

In [188]:
surgdt_dummies = pd.get_dummies(surgdt_features.copy())

In [189]:
surgdt_dummies.head()

Unnamed: 0,surgdt_month_Apr,surgdt_month_Aug,surgdt_month_Dec,surgdt_month_Feb,surgdt_month_Jan,surgdt_month_Jul,surgdt_month_Jun,surgdt_month_Mar,surgdt_month_May,surgdt_month_Nov,surgdt_month_Oct,surgdt_month_Sep,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Wed,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,surgdt_PartOfMonth_Mid
0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0


In [190]:
surgdt_dummies.shape

(42740, 22)

- need to drop reference classes for `surgdt_month`, `surgdt_DayOfWeek` and `surgdt_PartOfMonth`
- can pick any month, day of week or Part of Month as the reference class
- chose to pick the middle month (June), day of week (Wed) and Part of Month (Mid) given a working hypothesis that most action is around the beginning and end of time periods

In [191]:
drop_cols = ['surgdt_month_Jun', 'surgdt_DayOfWeek_Wed', 'surgdt_PartOfMonth_Mid']

In [192]:
len(drop_cols)

3

In [193]:
surgdt_dummies = surgdt_dummies.drop(drop_cols, axis=1)

In [194]:
surgdt_dummies.head()

Unnamed: 0,surgdt_month_Apr,surgdt_month_Aug,surgdt_month_Dec,surgdt_month_Feb,surgdt_month_Jan,surgdt_month_Jul,surgdt_month_Mar,surgdt_month_May,surgdt_month_Nov,surgdt_month_Oct,surgdt_month_Sep,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Tues,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End
0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
2,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0


In [195]:
surgdt_dummies.shape

(42740, 19)

- reordering columns for readability

In [196]:
new_col_order = ['surgdt_month_Jan',
                 'surgdt_month_Feb',
                 'surgdt_month_Mar',
                 'surgdt_month_Apr',
                 'surgdt_month_May',
                 'surgdt_month_Jul',
                 'surgdt_month_Aug',
                 'surgdt_month_Sep',
                 'surgdt_month_Oct',
                 'surgdt_month_Nov',
                 'surgdt_month_Dec',

                 'surgdt_DayOfWeek_Mon',
                 'surgdt_DayOfWeek_Tues',
                 'surgdt_DayOfWeek_Thurs',
                 'surgdt_DayOfWeek_Fri',
                 'surgdt_DayOfWeek_Sat',
                 'surgdt_DayOfWeek_Sun',

                 'surgdt_PartOfMonth_Beg',
                 'surgdt_PartOfMonth_End']

In [197]:
surgdt_dummies = surgdt_dummies[new_col_order]

In [198]:
surgdt_dummies.head()

Unnamed: 0,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0
2,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0
3,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0


In [199]:
surgdt_dummies.shape

(42740, 19)

### Numerical Features
- creating `numerical_feature_df`

In [200]:
numerical_features_df = pre_op_data.copy()[num_features]

In [201]:
numerical_features_df.head()

Unnamed: 0,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
0,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0
1,65,175.3,79.4,45.0,1.2,,,,55.0,40.0
2,83,162.60001,102.1,29.0,1.2,3.3,6.2,8.6,60.0,36.0
3,59,160.0,127.5,35.0,0.9,3.5,7.4,6.4,60.0,35.0
4,72,160.0,64.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0


In [202]:
numerical_features_df.shape

(42740, 10)

- creating `bmi` numerical feature
- BMI is weight in kilograms (`weightkg`) divided by height in meters squared `(heightcm/100)^2)`

In [203]:
numerical_features_df['bmi'] = numerical_features_df['weightkg'] / np.power((numerical_features_df['heightcm']/100), 
                                                                            2)

In [204]:
numerical_features_df.head()

Unnamed: 0,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,bmi
0,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0,36.11111
1,65,175.3,79.4,45.0,1.2,,,,55.0,40.0,25.83787
2,83,162.60001,102.1,29.0,1.2,3.3,6.2,8.6,60.0,36.0,38.61754
3,59,160.0,127.5,35.0,0.9,3.5,7.4,6.4,60.0,35.0,49.80469
4,72,160.0,64.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,25.0


In [205]:
numerical_features_df.shape

(42740, 11)

- reordering columns for readability

In [206]:
new_col_order = ['age',
                 'heightcm',
                 'weightkg',
                 'bmi',
                 'hct',
                 'creatlst',
                 'totalbumin',
                 'a1clvl',
                 'meldscr',
                 'hdef',
                 'pasys']

In [207]:
numerical_features_df = numerical_features_df[new_col_order]

In [208]:
numerical_features_df.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0
2,83,162.60001,102.1,38.61754,29.0,1.2,3.3,6.2,8.6,60.0,36.0
3,59,160.0,127.5,49.80469,35.0,0.9,3.5,7.4,6.4,60.0,35.0
4,72,160.0,64.0,25.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0


In [209]:
numerical_features_df.shape

(42740, 11)

In [210]:
numerical_features_df.shape

(42740, 11)

### `outcome_other`
- creating `outcome_other_df`

In [211]:
outcome_other_df = pre_op_data.copy()[outcome_other]

In [212]:
outcome_other_df.head()

Unnamed: 0,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin
0,1,0.014,2,2,2,0
1,2,0.017,2,2,1,0
2,3,0.045,2,2,1,0
3,4,0.013,2,2,1,0
4,5,0.016,2,2,2,0


- creating `strokeBin2` which is a more inclusive definition of stroke that incorporates `cnstrokttia`

In [213]:
outcome_other_df['cnstrokttia'].unique()

array([2, 1])

- checking for `NaN`s

In [214]:
outcome_other_df['cnstrokttia'].isnull().sum()

0

- recoding `cnstrokttia`, where `1 == 1` and `2 == 0`

In [215]:
outcome_other_df['cnstrokttia'] = outcome_other_df['cnstrokttia'].replace({1: 1, 2: 0})

- validating

In [216]:
outcome_other_df.head()

Unnamed: 0,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin
0,1,0.014,2,0,2,0
1,2,0.017,2,0,1,0
2,3,0.045,2,0,1,0
3,4,0.013,2,0,1,0
4,5,0.016,2,0,2,0


In [217]:
outcome_other_df['cnstrokttia'].unique()

array([0, 1])

In [218]:
outcome_other_df['cnstrokttia'].isnull().sum()

0

- doing some analysis on `strokeBin` and `cnstrokttia`

- stroke incidence rate

In [219]:
outcome_other_df['strokeBin'].sum()

617

In [220]:
outcome_other_df['strokeBin'].sum() / outcome_other_df.shape[0] * 100

1.4436125409452505

- ttia incidence rate

In [221]:
outcome_other_df['cnstrokttia'].sum()

116

In [222]:
outcome_other_df['cnstrokttia'].sum() / outcome_other_df.shape[0] * 100

0.271408516612073

#### How Often Do `strokeBin` and `cnstrokttia` Overlap?

In [223]:
outcome_other_df[(outcome_other_df['cnstrokttia'] == 1) & (outcome_other_df['strokeBin'] == 1)].shape

(5, 6)

In [224]:
outcome_other_df[(outcome_other_df['cnstrokttia'] == 0) & (outcome_other_df['strokeBin'] == 0)].shape

(42012, 6)

In [225]:
outcome_other_df[(outcome_other_df['cnstrokttia'] == 1) & (outcome_other_df['strokeBin'] == 0)].shape

(111, 6)

In [226]:
outcome_other_df[(outcome_other_df['cnstrokttia'] == 0) & (outcome_other_df['strokeBin'] == 1)].shape

(612, 6)

- `strokeBin2`

In [227]:
617 + 111

728

In [228]:
outcome_other_df['strokeBin2'] = outcome_other_df['strokeBin'] + outcome_other_df['cnstrokttia']

In [229]:
outcome_other_df['strokeBin2'].unique()

array([0, 1, 2])

In [230]:
outcome_other_df['strokeBin2'].value_counts()

0    42012
1      723
2        5
Name: strokeBin2, dtype: int64

- recoding cases where `strokeBin` and `cnstrokttia` are both equal to `1`

In [231]:
outcome_other_df['strokeBin2'] = outcome_other_df['strokeBin2'].replace({0: 0,
                                                                         1: 1, 
                                                                         2: 1})

- validating

In [232]:
outcome_other_df['strokeBin2'].unique()

array([0, 1])

In [233]:
outcome_other_df['strokeBin2'].value_counts()

0    42012
1      728
Name: strokeBin2, dtype: int64

- `strokeBin2` incidence rate

In [234]:
outcome_other_df['strokeBin2'].sum() / outcome_other_df.shape[0] * 100

1.7033224145999064

- final validation of `outcome_other_df`

In [235]:
outcome_other_df.head()

Unnamed: 0,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin,strokeBin2
0,1,0.014,2,0,2,0,0
1,2,0.017,2,0,1,0,0
2,3,0.045,2,0,1,0,0
3,4,0.013,2,0,1,0,0
4,5,0.016,2,0,2,0,0


In [236]:
outcome_other_df.shape

(42740, 7)

## Assembling the Pre-Op Data Set
- `numerical_features_df`

In [237]:
numerical_features_df.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0


In [238]:
numerical_features_df.shape

(42740, 11)

- `surgdt_features_df`

In [239]:
surgdt_dummies.head(2)

Unnamed: 0,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0


In [240]:
surgdt_dummies.shape

(42740, 19)

- `yes_no_unc_df`

In [241]:
yes_no_unc_df = pre_op_data.copy()[yes_no_unc]

In [242]:
yes_no_unc_df.head(2)

Unnamed: 0,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [243]:
print (yes_no_unc_df.shape)
print (len(yes_no_unc))

(42740, 36)
36


- `compress_to_two_df`

In [244]:
compress_to_two_df = pre_op_data.copy()[compress_to_two]

In [245]:
compress_to_two_df.head(2)

Unnamed: 0,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock,resusc,medasa,medaplt5days,medlipid
0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0


In [246]:
compress_to_two_df.shape

(42740, 12)

- need to rename `carshock` and `resusc` to `carshock24` and `resusc24`

In [247]:
compress_to_two_df = compress_to_two_df.rename(columns={'carshock': 'carshock24',
                                                        'resusc': 'resusc24'})

In [248]:
compress_to_two_df.head(2)

Unnamed: 0,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid
0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0


In [249]:
compress_to_two_df.shape

(42740, 12)

- `recode_D_Dummies`

In [250]:
recode_D_Dummies.head(2)

Unnamed: 0,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_STRENUOUS_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_ANGINA_AT_REST,classnyh_SLIGHT_LIMITATION,classnyh_MARKED_LIMITATION,classnyh_ANY_ACTIVITY,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE
0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0


In [251]:
recode_D_Dummies.shape

(42740, 25)

- `recode_D_P_Dummies`

In [252]:
recode_D_P_Dummies.head(2)

Unnamed: 0,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0


In [253]:
recode_D_P_Dummies.shape

(42740, 7)

- `outcome_other_df`

In [254]:
outcome_other_df.head(2)

Unnamed: 0,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin,strokeBin2
0,1,0.014,2,0,2,0,0
1,2,0.017,2,0,1,0,0


In [255]:
outcome_other_df.shape

(42740, 7)

## Concatenating Pre-Op Data Set Components
### - Dataset with Dummies where reference class is dropped

In [256]:
PREOP_dataset = pd.concat((numerical_features_df,
                           surgdt_dummies,
                           yes_no_unc_df,
                           compress_to_two_df,
                           recode_D_Dummies,
                           recode_D_P_Dummies,
                           outcome_other_df),
                           axis=1)

In [257]:
PREOP_dataset.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_STRENUOUS_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_ANGINA_AT_REST,classnyh_SLIGHT_LIMITATION,classnyh_MARKED_LIMITATION,classnyh_ANY_ACTIVITY,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin,strokeBin2
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1.0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.014,2,0,2,0,0
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2,0.017,2,0,1,0,0


#### Versus 10-24 Dataset Added `+1` Net New Columns (10/24 had `116` columns)

In [258]:
PREOP_dataset.shape

(42740, 117)

- validating

In [259]:
print (numerical_features_df.shape, 
       surgdt_dummies.shape,
       yes_no_unc_df.shape,
       compress_to_two_df.shape,
       recode_D_Dummies.shape,
       recode_D_P_Dummies.shape,
       outcome_other_df.shape)

(42740, 11) (42740, 19) (42740, 36) (42740, 12) (42740, 25) (42740, 7) (42740, 7)


In [260]:
numerical_features_df.shape[1] + surgdt_dummies.shape[1] + yes_no_unc_df.shape[1] + compress_to_two_df.shape[1] + recode_D_Dummies.shape[1] + recode_D_P_Dummies.shape[1] + outcome_other_df.shape[1]

117

### - `sklearn` `Decision Trees` Dataset

In [261]:
PREOP_dataset_sklearn = pd.concat((numerical_features_df,
                           surgdt_features_sklearn,
                           yes_no_unc_df,
                           compress_to_two_df,
                           recode_D_sklearn,
                           recode_D_P_sklearn,
                           outcome_other_df),
                           axis=1)

In [262]:
PREOP_dataset_sklearn.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status,cvdcarsten,cvdstenrt,cvdstenlft,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin,strokeBin2
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,7,4,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1.0,3,3,0,4,2,0,0,0,0,0,1,0.014,2,0,2,0,0
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,7,5,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,3,4,0,3,2,0,2,0,0,0,2,0.017,2,0,1,0,0


#### Versus 10-24 TREE Dataset Net `-1` Columns (10/24 had `80` columns)

In [263]:
PREOP_dataset_sklearn.shape

(42740, 79)

- validating

In [264]:
print (numerical_features_df.shape, 
       surgdt_features_sklearn.shape,
       yes_no_unc_df.shape,
       compress_to_two_df.shape,
       recode_D_sklearn.shape,
       recode_D_P_sklearn.shape,
       outcome_other_df.shape)

(42740, 11) (42740, 3) (42740, 36) (42740, 12) (42740, 7) (42740, 3) (42740, 7)


In [265]:
numerical_features_df.shape[1] + surgdt_features_sklearn.shape[1] + yes_no_unc_df.shape[1] + compress_to_two_df.shape[1] + recode_D_sklearn.shape[1] + recode_D_P_sklearn.shape[1] + outcome_other_df.shape[1]

79

### Pickling Final Files

In [266]:
#PREOP_dataset.to_pickle('PREOP_dataset_10_27.pkl')

#### For `sklearn` `DecisionTrees`

In [267]:
#PREOP_dataset_sklearn.to_pickle('PREOP_dataset_TREE_10_27.pkl')

## Post-Op Features

#### Going to Eliminate the Pre-Op Variables from the Total Variable Set

In [268]:
len(cat_features)

66

In [269]:
o_cat = ['gender',
         'racecaucasian',
         'raceblack',
         'raceasian',
         'racenativeam',
         'racnativepacific',
         'ethnicity',
         'diabetes',
         'diabctrl',
         'dyslip',
         'dialysis',
         'hypertn',
         'infendo',
         'infendty',
         'TobaccoUse',
         'chrlungd',
         'hmo2',
         'slpapn',
         'ivdrugab',
         'alcohol',
         'liverdis',
         'immsupp',
         'mediastrad',
         'cancer',
         'pvd',
         'ThAoDisease',
         'syncope',
         'unrespstat',
         'cvd',
         'cva',
         'cvawhen',
         'cvdtia',
         'cvdcarsten',
         'cvdstenrt',
         'cvdstenlft',
         'cvdpcarsurg',
         'hitanti',
         'cigsmoker',
         'cigsmokercurr',
         'prcvint',
         'prcab',
         'prvalve',
         'CardSympTimeOfAdm',
         'CardSympTimeOfSurg',
         'anginalclass',
         'chf',
         'classnyh',
         'priorhf',
         'carshock',
         'resusc',
         'Arrhythmia',
         'ArrhythAFlutter',
         'ArrhythAFib',
         'ArrhythAFibDur',
         'arrhythwhen',
         'arrhyafib',
         'medasa',
         'medaplt5days',
         'medinotr',
         'medlipid',
         'numdisv',
         'hdefd',
         'vdaort',
         'vdstena',
         'vdinsufm',
         'vdstenm',
         'vdinsuft',
         'incidenc',
         'status']

In [270]:
len(o_cat)

69

In [271]:
o_num_date = ['age',
              'heightcm',
              'weightkg',
              'hct',
              'creatlst',
              'totalbumin',
              'a1clvl',
              'meldscr',
              'hdef',
              'pasys',
              'surgdt']

In [272]:
len(o_num_date)

11

In [273]:
o_outcome = ['recordId', # keeping for now for auditing purposes
             'predstro', # STS predicted probability of stroke
             'cnstrokp',
             'cnstrokttia',
             'cnstroktrind', # FOUND TO BE ALL `NaN`
             'cncomaenceph',
             'strokeBin'] # adding strokeBin to compare STS model prediction to actual outcome

In [274]:
len(o_outcome)

7

In [275]:
orig_pre_op_vars = o_num_date + o_cat + o_outcome

In [276]:
len(orig_pre_op_vars)

87

#### Getting Subset of Non-Pre-Op Variables

In [277]:
raw_data.shape

(42746, 409)

In [278]:
raw_data_cols = raw_data.columns.tolist()

In [279]:
len(raw_data_cols)

409

In [280]:
non_pre_op_vars = list(np.setdiff1d(raw_data.columns.tolist(),
                                    orig_pre_op_vars,
                                    assume_unique=True))

In [281]:
len(non_pre_op_vars)

322

In [282]:
working_data.shape

(42740, 409)

#### Creating `non_pre_op` `DataFrame`

In [283]:
non_pre_op = working_data.copy()[non_pre_op_vars]

In [284]:
non_pre_op.head()

Unnamed: 0,raceother,dischdt,ChrLungDType,bdtx,RFHemoglobin,inr,cvdcoma,cvdrind,cvdninvas,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,ArrhythPPaced,ArrhythVV,ArrhythAtrFib,arrhyafibty,medacei48,medgp,medgpmn,medacoag,medacmn,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medster,PctStenLMain,SyntaxScrKnown,SyntaxScr,DimAvail,lvsd,lvedd,pasysmeas,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStInn,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,CNStrokT,drgnum,BldRBC,predrenf
0,2.0,2011-07-06,,2.0,,1.0,,,,,,,,,,,,,,,,,,,,2.0,2,,2.0,,2.0,,,,,,,,,,2.0,2.0,,,,,45.0,50.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,0.048
1,2.0,2011-07-09,,2.0,,1.0,,,,,,,,,,,,,,,2.0,,,,,2.0,2,,1.0,2.0,2.0,,,,,,,,,,1.0,2.0,,,,,44.0,32.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,10.0,,,,,,,10.0,,,,,,,,1.0,2.0,,1,2,1.0,2.0,2.0,2011-07-02,2011-07-03,,,,,32.0,,29.0,,3,,,2.0,1.0,2.0,,2.0,70.0,2.0,,,,,,2.0,40.0,2.0,1.0,2.0,2.0,2.0,,,,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,4.0,1.0,,,,,,,,,,,,,,,,,,,1.0,1.0,3.0,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,1.0,2.0,,,,,1.1,,,2.0,,2.0,2.0,2.0,1.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2.0,,NaT,,,,,0.069
2,2.0,2011-07-12,,2.0,,1.0,,,,,,,,,,,,,,,2.0,,,,,1.0,2,,2.0,,2.0,,,,,,,,,,2.0,2.0,,,,,31.0,50.0,1.0,,1.5,16.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,,,,,,,,,,,,,,,,1.0,2.0,,1,2,1.0,2.0,2.0,2011-07-04,2011-07-04,,,,,,,,,1,,,,,,,,,2.0,,,,,,5.0,,1.0,,2.0,2.0,2.0,,,,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,1.0,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,2.0,1.0,,2.0,2.0,,,,,1.4,,,2.0,,2.0,2.0,2.0,1.0,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2.0,,NaT,,,,,0.148
3,2.0,2011-07-09,,2.0,,1.0,,,,,,,,,,,,,,,,,,,,1.0,2,,2.0,,2.0,,,,,,,,,,2.0,2.0,,,,,33.0,51.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-05,2011-07-05,,,,,34.8,,19.0,,3,,,2.0,1.0,2.0,,2.0,73.0,2.0,,,,,,2.0,47.0,3.0,1.0,2.0,2.0,2.0,,,,,1.0,2.0,0.0,0.0,0.0,,,2.0,2.0,1.0,1.0,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,0.0,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,2.0,1.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,0.074
4,2.0,2011-07-10,,2.0,,1.0,,,,,,,,,,,,,,,,,,,,2.0,2,,1.0,1.0,2.0,,,,,,,,,,2.0,2.0,,,,,21.0,40.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,1.0,2.0,,1,2,1.0,2.0,2.0,2011-07-06,2011-07-06,,,,,34.6,,19.0,,3,,,2.0,1.0,2.0,,2.0,70.0,2.0,,,,,,2.0,40.0,2.0,1.0,2.0,2.0,2.0,,,,,1.0,1.0,0.0,0.0,0.0,,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,1.0,,1.0,,,,,,,,,,,0.0,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,0.8,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,0.019


In [285]:
non_pre_op.shape

(42740, 322)

### Getting Post-Op Features per FS Data Dictionary (11/2)

In [286]:
sorted(non_pre_op_vars)

['ADEt1',
 'ADEt2',
 'ADEt3',
 'ADLesTAneur',
 'ADLesTCoarcNar',
 'ADLesTDis',
 'ADLesTDisTmg',
 'ADLesTDisTy',
 'ADLesTIntraHema',
 'ADLesTPenUlcer',
 'ADLesTPseudo',
 'ADLesTRup',
 'ADLocArch',
 'ADLocAsc',
 'ADLocDesThor',
 'ADLocRoot',
 'ADLocThora',
 'ADPres',
 'AoHemoDatAvail',
 'AortProcAsc',
 'AortProcDesProx',
 'AortProcHemi',
 'AortProcRoot',
 'AortProcTotArch',
 'ArrhythAtrFib',
 'ArrhythPPaced',
 'ArrhythVV',
 'AsmtAoDxMeth',
 'BldRBC',
 'CABHybrPCI',
 'CAortReint',
 'CAortReintTy',
 'CNEnceph',
 'CNParesisTy',
 'CNStrokT',
 'COtLiver',
 'CReintMI',
 'CReintMIIntTy',
 'CReintMIVes',
 'CVaAoDisTy',
 'CanArtStInn',
 'CathBasAssistTy',
 'ChrLungDType',
 'CombCardPCI',
 'CombProcs',
 'CombProcsPCI',
 'CombProcsStatus',
 'CombProcsStentTy',
 'CompMAD',
 'CompMAD1',
 'CompMAD2',
 'CompMAD3',
 'DCFactorXa',
 'DCNovOrAnti',
 'DCOthAnticoag',
 'DCOthAntiplat',
 'DialStat',
 'DimAvail',
 'EndovasTAVR',
 'GenAnes',
 'HighIntraGlu',
 'IABP',
 'IntraClotFact',
 'IntraopProComCon',
 'Int

#### `Categorical` Post-Op Features
- `post_op_cat`

In [287]:
post_op_cat = ['aortoccl',
               'asmtaodx',
               'asmtascaa',
               'CABHybrPCI', # probably apply to 2.81 only since capitalized
               'canartstaort',
               'canartstfem',
               'canartstoth',
               'canartstax',
               'cathbasassist',
               'cathbasassistind',
               'cathbasassistwhen',
               'ceroxused',
               'circarr',
               'cofirstind',
               'concalc',
               'cotafib',
               'cotarrst',
               'cpbutil',
               'cperftyp',
               'cperfutil',
               'ecmo',
               'ecmoind', # end of page 1
               'ecmowhen',
               'emergrsn',
               'IABP', # capitalized so probably only 2.81
               'iabpind',
               'iabpwhen',
               'ibldprod',
               'imedeaca',
               'imedtran',
               'inoptee',
               'mt30stat',
               'mtcause',
               'mtopd',
               'ocarasd',
               'OCarASDPFO', # capitalized so probably only 2.81
               'OCarASDSec', # capitalized so probably only 2.81
               'ocarlva',
               'ocarsvr',
               'ocarvsd',
               'opcab',
               'opocard', # end of page 2
               'oponcard', # next on list 'opticus' not found
               'opvalve',
               'Readmit', # probably apply to 2.81 only since capitalized
               'readmrsn',
               'unplao',
               'unplav', # next on list 'unplcabg' not found
               'unplmv',
               'unplproc',
               'unplvad',
               'urgntrsn',
               'vadproc',
               'valexp2', # in data dictionary was 'valexp' -- assuming this is what FS meant
               'valexppos2', # in data dictionary was 'valexppos' -- assuming this is what FS meant

                # no 'vsav', but there was a 'VSAVRoot' -- but would only apply to 2.81

               'vsavpr',
               'vsmv',
               'vsmvpr',
               'vstcv', # end of page 3
               'vstcvr',
               'VSTV'] # probably apply to 2.81 only since capitalized



In [288]:
len(post_op_cat)

61

#### `Numerical` and `Datetime` Post-Op Features
- `post_op_num_date`

In [289]:
post_op_num_date = ['cperftime',
                    'cumulsatlft',
                    'cumulsatrt',
                    'dhcatm',
                    'ibdcryou',
                    'ibdffpu',
                    'ibdplatu',
                    'ibdrbcu', # listed in data dictionary as 'ibdrbci' typo???
                    'lwsthct',
                    'lwsttemp',
                    'perfustm',
                    'postcreat',
                    'prerso2lft',
                    'prerso2rt',
                    'xclamptm',
                   
                    'mtdate',    # datetime
                    'orexitdt',  # datetime
                    'orentrydt'] # datetime

In [290]:
len(post_op_num_date)

18

#### Checking if `capitalized` features are all `NaN` in `2.73`

In [291]:
cap_vars = ['CABHybrPCI',
            'IABP',
            'OCarASDPFO',
            'OCarASDSec',
            'Readmit',
            'VSTV']

In [292]:
len(cap_vars)

6

In [293]:
split_dates = working_data.copy()

In [294]:
split_dates.shape

(42740, 409)

In [295]:
v273_df = split_dates[split_dates['surgdt'] < '2014-01-01']

In [296]:
v273_df.shape

(19756, 409)

#### Looking at the number of `NaN`s per `capitalized` `post-op` feature in 2.73

In [297]:
col_names273 = []
num_nulls273 = []

for column in cap_vars:
    col_names273.append(column)
    num_nulls273.append(v273_df[column].isnull().sum())

In [298]:
print(len(col_names273))
print(len(num_nulls273))

6
6


In [299]:
# creating a new data frame for easier analysis

nulls273_df = pd.DataFrame(list(zip(col_names273, num_nulls273)),
                           columns = ['feature', 'number_of_nulls']).sort_values(by=['number_of_nulls'],
                                                                                 ascending=False)

In [300]:
nulls273_df

Unnamed: 0,feature,number_of_nulls
2,OCarASDPFO,19756
3,OCarASDSec,19756
4,Readmit,19756
5,VSTV,19756
0,CABHybrPCI,3987
1,IABP,0


In [301]:
working_data['CABHybrPCI'].unique()

array([ 2.,  1., nan])

In [302]:
working_data['IABP'].unique()

array([ 2.,  1., nan])

#### Looking at `2.81`

In [303]:
v281_df = split_dates[split_dates['surgdt'] >= '2014-01-01']

In [304]:
v281_df.shape

(22984, 409)

In [305]:
col_names281 = []
num_nulls281 = []

for column in cap_vars:
    col_names281.append(column)
    num_nulls281.append(v281_df[column].isnull().sum())

In [306]:
print(len(col_names281))
print(len(num_nulls281))

6
6


In [307]:
# creating a new data frame for easier analysis

nulls281_df = pd.DataFrame(list(zip(col_names281, num_nulls281)),
                           columns = ['feature', 'number_of_nulls']).sort_values(by=['number_of_nulls'],
                                                                                 ascending=False)

In [308]:
nulls281_df

Unnamed: 0,feature,number_of_nulls
2,OCarASDPFO,20799
3,OCarASDSec,20799
0,CABHybrPCI,19935
5,VSTV,16867
4,Readmit,4208
1,IABP,5


### NEXT STEPS
- EDA on numeric variables - get a reality check
- Fill `NaN`s with `0`s on `cumulsatlft` and `cumulsatrt`
- Recode `categorical` post-op features using data dictionary
- Create Post-Op Feature Matrix
- Create Post-Op Feature Matrix with `outcome_other_df` using `pd.concat` code on line 256 above
- Create combined `PRE` + `POST` Dataset and put columns in order

## Encoding Post-Op `categorical` features

In [309]:
working_data.shape

(42740, 409)

In [310]:
post_op_cat_features = working_data.copy()[post_op_cat]

In [311]:
post_op_cat_features.head(2)

Unnamed: 0,aortoccl,asmtaodx,asmtascaa,CABHybrPCI,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,cathbasassistind,cathbasassistwhen,ceroxused,circarr,cofirstind,concalc,cotafib,cotarrst,cpbutil,cperftyp,cperfutil,ecmo,ecmoind,ecmowhen,emergrsn,IABP,iabpind,iabpwhen,ibldprod,imedeaca,imedtran,inoptee,mt30stat,mtcause,mtopd,ocarasd,OCarASDPFO,OCarASDSec,ocarlva,ocarsvr,ocarvsd,opcab,opocard,oponcard,opvalve,Readmit,readmrsn,unplao,unplav,unplmv,unplproc,unplvad,urgntrsn,vadproc,valexp2,valexppos2,vsavpr,vsmv,vsmvpr,vstcv,vstcvr,VSTV
0,2.0,6.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,,,2.0,2.0,,2.0,2,2,3,,,2.0,,,,2.0,,,2.0,2.0,2.0,2.0,1.0,,,,,,,,,1,2.0,2.0,1,,,,,,1.0,,,1.0,2.0,,,1.0,1.0,,,
1,2.0,,2.0,1.0,1.0,2.0,2.0,2.0,2.0,,,2.0,2.0,,2.0,1,2,3,,,2.0,,,4.0,1.0,3.0,1.0,2.0,2.0,2.0,2.0,1.0,,,,,,,,,1,2.0,2.0,2,,,,,,1.0,,,1.0,,,,,,,,


In [312]:
post_op_cat_features.shape

(42740, 61)

- dropping columns that only apply to `2.81`

In [313]:
cols_to_drop = ['CABHybrPCI',
                'OCarASDPFO',
                'OCarASDSec',
                'Readmit',
                'VSTV']

In [314]:
len(cols_to_drop)

5

In [315]:
post_op_cat_features = post_op_cat_features.drop(cols_to_drop,
                                                 axis=1)

In [316]:
post_op_cat_features.shape

(42740, 56)

#### Checking the Original Encodings

In [317]:
feature_name = []
orig_coding = []
num_levels = []
dtypes = []

for column in post_op_cat_features.columns.tolist():
    feature_name.append(column)
    orig_coding.append(sorted(post_op_cat_features[column].unique()))
    num_levels.append(len(post_op_cat_features[column].unique()))
    dtypes.append(post_op_cat_features[column].dtype)

In [318]:
post_op_cat_diagnostics_df = pd.DataFrame(list(zip(feature_name, orig_coding, num_levels, dtypes)),
                                          columns = ['feature', 'original_levels', 'num_levels', 'dtype'])

In [319]:
post_op_cat_diagnostics_df

Unnamed: 0,feature,original_levels,num_levels,dtype
0,aortoccl,"[2.0, 3.0, 5.0, 6.0, nan]",5,float64
1,asmtaodx,"[6.0, nan, 1.0, 2.0, 3.0, 4.0, 5.0]",7,float64
2,asmtascaa,"[1.0, 2.0, nan, 3.0]",4,float64
3,canartstaort,"[1.0, nan, 2.0]",3,float64
4,canartstfem,"[2.0, nan, 1.0]",3,float64
5,canartstoth,"[2.0, nan, 1.0]",3,float64
6,canartstax,"[2.0, nan, 1.0]",3,float64
7,cathbasassist,"[2.0, nan, 1.0]",3,float64
8,cathbasassistind,"[nan, 1.0, 2.0, 3.0, 4.0, 5.0]",6,float64
9,cathbasassistwhen,"[nan, 1.0, 2.0, 3.0]",4,float64


- `vstcvr` is all `NaN` for this dataset
- `vstc` is only composed of `NaN`s and `2`s - which means from the data dictionary all of the observations would be coded as `0` or `NO`

#### Let's examine original post-op `categorical` feature encoding by `num_levels`

- `num_levels == 1`

In [320]:
post_op_cat_diagnostics_df[post_op_cat_diagnostics_df['num_levels'] == 1]

Unnamed: 0,feature,original_levels,num_levels,dtype
55,vstcvr,[nan],1,float64


- `vstcvr` is all `NaN` for this dataset

- `num_levels == 2`

In [321]:
post_op_cat_diagnostics_df[post_op_cat_diagnostics_df['num_levels'] == 2]

Unnamed: 0,feature,original_levels,num_levels,dtype
14,cotafib,"[1, 2]",2,int64
15,cotarrst,"[1, 2]",2,int64
34,ocarlva,"[nan, 2.0]",2,float64
35,ocarsvr,"[nan, 2.0]",2,float64
40,opvalve,"[1, 2]",2,int64
50,valexppos2,"[nan, 1.0]",2,float64
54,vstcv,"[nan, 2.0]",2,float64


- `ocarlva`, `ocarsvr` and `vstcv` are only composed of `NaN`s and `2`s - which means from the data dictionary all of the observations would be coded as `0` or `NO`

- `num_levels == 3`

In [322]:
post_op_cat_diagnostics_df[post_op_cat_diagnostics_df['num_levels'] == 3]

Unnamed: 0,feature,original_levels,num_levels,dtype
3,canartstaort,"[1.0, nan, 2.0]",3,float64
4,canartstfem,"[2.0, nan, 1.0]",3,float64
5,canartstoth,"[2.0, nan, 1.0]",3,float64
6,canartstax,"[2.0, nan, 1.0]",3,float64
7,cathbasassist,"[2.0, nan, 1.0]",3,float64
10,ceroxused,"[1.0, 2.0, nan]",3,float64
11,circarr,"[1.0, 2.0, nan]",3,float64
12,cofirstind,"[nan, 1.0, 2.0]",3,float64
13,concalc,"[1.0, 2.0, nan]",3,float64
16,cpbutil,"[1, 2, 3]",3,int64


In [323]:
post_op_cat_diagnostics_df[post_op_cat_diagnostics_df['num_levels'] == 3].shape

(27, 4)

- with the exception of `cpbutil` we can bulk code the rest

In [324]:
one_two_df = post_op_cat_diagnostics_df[(post_op_cat_diagnostics_df['num_levels'] == 3) & 
                                        (post_op_cat_diagnostics_df['feature'] != 'cpbutil')]

In [325]:
one_two_df

Unnamed: 0,feature,original_levels,num_levels,dtype
3,canartstaort,"[1.0, nan, 2.0]",3,float64
4,canartstfem,"[2.0, nan, 1.0]",3,float64
5,canartstoth,"[2.0, nan, 1.0]",3,float64
6,canartstax,"[2.0, nan, 1.0]",3,float64
7,cathbasassist,"[2.0, nan, 1.0]",3,float64
10,ceroxused,"[1.0, 2.0, nan]",3,float64
11,circarr,"[1.0, 2.0, nan]",3,float64
12,cofirstind,"[nan, 1.0, 2.0]",3,float64
13,concalc,"[1.0, 2.0, nan]",3,float64
18,cperfutil,"[nan, 1.0, 2.0]",3,float64


In [326]:
one_two_df.shape

(26, 4)

- extracting bulk code feature names

In [327]:
one_two_nan = list(one_two_df['feature'].values)

In [328]:
len(one_two_nan), type(one_two_nan)

(26, list)

In [329]:
one_two_nan

['canartstaort',
 'canartstfem',
 'canartstoth',
 'canartstax',
 'cathbasassist',
 'ceroxused',
 'circarr',
 'cofirstind',
 'concalc',
 'cperfutil',
 'IABP',
 'ibldprod',
 'imedeaca',
 'imedtran',
 'inoptee',
 'mtopd',
 'ocarasd',
 'ocarvsd',
 'opocard',
 'oponcard',
 'unplao',
 'unplav',
 'unplmv',
 'unplvad',
 'valexp2',
 'vsmvpr']

### Creating `post_op_cat_bulk`
- bulk encoded features

In [330]:
post_op_cat_features.shape

(42740, 56)

In [331]:
post_op_cat_bulk = post_op_cat_features.copy()[one_two_nan]

In [332]:
post_op_cat_bulk.head(2)

Unnamed: 0,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,ceroxused,circarr,cofirstind,concalc,cperfutil,IABP,ibldprod,imedeaca,imedtran,inoptee,mtopd,ocarasd,ocarvsd,opocard,oponcard,unplao,unplav,unplmv,unplvad,valexp2,vsmvpr
0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0,,2.0,2.0,2.0,2.0,2.0,,,,2.0,2.0,,,,,2.0,1.0
1,1.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0,,1.0,2.0,2.0,2.0,2.0,,,,2.0,2.0,,,,,,


In [333]:
post_op_cat_bulk.shape

(42740, 26)

#### Bulk Encoding

In [334]:
orig_post_op_cat_bulk = post_op_cat_bulk.copy()

In [335]:
for column in one_two_nan:
    post_op_cat_bulk[column] = post_op_cat_bulk[column].replace({1: 1,
                                                                 2: 0,
                                                                 np.nan: 0})

In [336]:
feature_name = []
orig_coding = []
new_coding = []
dtypes = []

for column in one_two_nan:
    feature_name.append(column)
    orig_coding.append(sorted(orig_post_op_cat_bulk[column].unique()))
    new_coding.append(sorted(post_op_cat_bulk[column].unique()))
    dtypes.append(post_op_cat_bulk[column].dtype)

In [337]:
pd.DataFrame(list(zip(feature_name, orig_coding, new_coding, dtypes)),
             columns = ['feature', 'original_levels', 'new_levels', 'data_type'])

Unnamed: 0,feature,original_levels,new_levels,data_type
0,canartstaort,"[1.0, nan, 2.0]","[0.0, 1.0]",float64
1,canartstfem,"[2.0, nan, 1.0]","[0.0, 1.0]",float64
2,canartstoth,"[2.0, nan, 1.0]","[0.0, 1.0]",float64
3,canartstax,"[2.0, nan, 1.0]","[0.0, 1.0]",float64
4,cathbasassist,"[2.0, nan, 1.0]","[0.0, 1.0]",float64
5,ceroxused,"[1.0, 2.0, nan]","[0.0, 1.0]",float64
6,circarr,"[1.0, 2.0, nan]","[0.0, 1.0]",float64
7,cofirstind,"[nan, 1.0, 2.0]","[0.0, 1.0]",float64
8,concalc,"[1.0, 2.0, nan]","[0.0, 1.0]",float64
9,cperfutil,"[nan, 1.0, 2.0]","[0.0, 1.0]",float64


- checking `NaN`s

In [338]:
post_op_cat_bulk.isnull().sum()

canartstaort     0
canartstfem      0
canartstoth      0
canartstax       0
cathbasassist    0
ceroxused        0
circarr          0
cofirstind       0
concalc          0
cperfutil        0
IABP             0
ibldprod         0
imedeaca         0
imedtran         0
inoptee          0
mtopd            0
ocarasd          0
ocarvsd          0
opocard          0
oponcard         0
unplao           0
unplav           0
unplmv           0
unplvad          0
valexp2          0
vsmvpr           0
dtype: int64

### Creating `post_op_cat_manual`
- manually recoding features one at a time
- first step is to get all of the features that require manual recoding

In [339]:
post_op_cat_features.shape, post_op_cat_bulk.shape, post_op_cat_features.shape[1] - post_op_cat_bulk.shape[1]

((42740, 56), (42740, 26), 30)

In [340]:
manual_recodes = list(np.setdiff1d(post_op_cat_features.columns.tolist(), 
                                   one_two_nan,
                                   assume_unique=True))

In [341]:
len(manual_recodes)

30

In [342]:
post_op_cat_manual = post_op_cat_features.copy()[manual_recodes]

In [343]:
post_op_cat_manual.head()

Unnamed: 0,aortoccl,asmtaodx,asmtascaa,cathbasassistind,cathbasassistwhen,cotafib,cotarrst,cpbutil,cperftyp,ecmo,ecmoind,ecmowhen,emergrsn,iabpind,iabpwhen,mt30stat,mtcause,ocarlva,ocarsvr,opcab,opvalve,readmrsn,unplproc,urgntrsn,vadproc,valexppos2,vsavpr,vsmv,vstcv,vstcvr
0,2.0,6.0,1.0,,,2,2,3,,2.0,,,,,,1.0,,,,1,1,,1.0,,1.0,,,1.0,,
1,2.0,,2.0,,,1,2,3,,2.0,,,4.0,3.0,1.0,1.0,,,,1,2,,1.0,,1.0,,,,,
2,5.0,,2.0,,,1,2,1,,2.0,,,,,,1.0,,,,1,2,,1.0,1.0,1.0,,,,,
3,2.0,,2.0,,,2,2,3,,2.0,,,,,,1.0,,,,1,1,,1.0,,1.0,,,1.0,,
4,2.0,,2.0,,,2,2,3,,2.0,,,,,,1.0,,,,1,2,,1.0,1.0,1.0,,,,,


In [344]:
post_op_cat_manual.shape

(42740, 30)

- dropping `vstcvr`, `ocarlva`, `ocarsvr` and `vstcv` as these features did not contain any information given they have all `NaN`s or imply that all the observations are `0`

In [345]:
post_op_cat_manual = post_op_cat_manual.drop(['vstcvr', # all NaN
                                              'ocarlva',
                                              'ocarsvr',
                                              'vstcv'],
                                             axis=1)

In [346]:
post_op_cat_manual.head()

Unnamed: 0,aortoccl,asmtaodx,asmtascaa,cathbasassistind,cathbasassistwhen,cotafib,cotarrst,cpbutil,cperftyp,ecmo,ecmoind,ecmowhen,emergrsn,iabpind,iabpwhen,mt30stat,mtcause,opcab,opvalve,readmrsn,unplproc,urgntrsn,vadproc,valexppos2,vsavpr,vsmv
0,2.0,6.0,1.0,,,2,2,3,,2.0,,,,,,1.0,,1,1,,1.0,,1.0,,,1.0
1,2.0,,2.0,,,1,2,3,,2.0,,,4.0,3.0,1.0,1.0,,1,2,,1.0,,1.0,,,
2,5.0,,2.0,,,1,2,1,,2.0,,,,,,1.0,,1,2,,1.0,1.0,1.0,,,
3,2.0,,2.0,,,2,2,3,,2.0,,,,,,1.0,,1,1,,1.0,,1.0,,,1.0
4,2.0,,2.0,,,2,2,3,,2.0,,,,,,1.0,,1,2,,1.0,1.0,1.0,,,


In [347]:
post_op_cat_manual.shape

(42740, 26)

- making a copy to validate results later

In [348]:
orig_post_op_cat_manual = post_op_cat_manual.copy()

### Manually Encoding Post-Op Categorical Features
- Code Pattern:
    - check `.unique()`
    - check `df['column'].value_counts()`
    - use `df['column'] = df.replace({'column': {replacement dict}})`
    - re-check `df['column'].value_counts()`
    - when re-code all, create a summary `DataFrame` similar to `post_op_cat_diagnostics_df` above in lines `317-318`
    - filter by `num_levels`
    - those features with `num_levels` > `2` require `dummy` variables to be made
    - split the `post_op_cat_manual` `DataFrame` into `dummy` and `nodummy` portions

- `aortoccl`

In [349]:
post_op_cat_manual['aortoccl'].unique()

array([ 2.,  5.,  3.,  6., nan])

In [350]:
post_op_cat_manual['aortoccl'].value_counts()

2.0    39595
5.0     2988
3.0      114
6.0       42
Name: aortoccl, dtype: int64

In [351]:
post_op_cat_manual['aortoccl'] = post_op_cat_manual['aortoccl'].replace({2: 1,
                                                                         3: 2,
                                                                         5: 0,
                                                                         6: 0,
                                                                         np.nan: 1})

In [352]:
post_op_cat_manual['aortoccl'].value_counts()

1.0    39596
0.0     3030
2.0      114
Name: aortoccl, dtype: int64

In [353]:
post_op_cat_manual['aortoccl'].dtypes

dtype('float64')

In [354]:
post_op_cat_manual.columns.tolist()[1]

'asmtaodx'

- `asmtaodx`

In [355]:
post_op_cat_manual['asmtaodx'].unique()

array([ 6., nan,  1.,  3.,  2.,  5.,  4.])

In [356]:
post_op_cat_manual['asmtaodx'].value_counts()

1.0    15567
2.0     4192
3.0     3714
6.0     2062
4.0      605
5.0      240
Name: asmtaodx, dtype: int64

In [357]:
post_op_cat_manual['asmtaodx'] = post_op_cat_manual['asmtaodx'].replace({1: 1,
                                                                         2: 2,
                                                                         3: 3,
                                                                         4: 4,
                                                                         5: 5,
                                                                         6: 1,
                                                                         np.nan: 0})

In [358]:
post_op_cat_manual['asmtaodx'].value_counts()

1.0    17629
0.0    16360
2.0     4192
3.0     3714
4.0      605
5.0      240
Name: asmtaodx, dtype: int64

In [359]:
post_op_cat_manual['asmtaodx'].dtypes

dtype('float64')

In [360]:
post_op_cat_manual.columns.tolist()[2]

'asmtascaa'

- `asmtascaa`

In [361]:
post_op_cat_manual['asmtascaa'].unique()

array([ 1.,  2., nan,  3.])

In [362]:
post_op_cat_manual['asmtascaa'].value_counts()

1.0    26429
2.0    13212
3.0     2999
Name: asmtascaa, dtype: int64

In [363]:
post_op_cat_manual['asmtascaa'] = post_op_cat_manual['asmtascaa'].replace({1: 1,
                                                                           2: 0,
                                                                           3: 0,
                                                                           np.nan: 0})

In [364]:
post_op_cat_manual['asmtascaa'].value_counts()

1.0    26429
0.0    16311
Name: asmtascaa, dtype: int64

In [365]:
post_op_cat_manual['asmtascaa'].dtypes

dtype('float64')

In [366]:
post_op_cat_manual.columns.tolist()[3]

'cathbasassistind'

- `cathbasassistind`

In [367]:
post_op_cat_manual['cathbasassistind'].unique()

array([nan,  1.,  4.,  2.,  3.,  5.])

In [368]:
post_op_cat_manual['cathbasassistind'].value_counts()

1.0    44
2.0     5
5.0     4
3.0     2
4.0     1
Name: cathbasassistind, dtype: int64

In [369]:
post_op_cat_manual['cathbasassistind'] = post_op_cat_manual['cathbasassistind'].replace({1: 1,
                                                                                         2: 2,
                                                                                         3: 3,
                                                                                         4: 0,
                                                                                         5: 0,
                                                                                         np.nan: 0})

In [370]:
post_op_cat_manual['cathbasassistind'].value_counts()

0.0    42689
1.0       44
2.0        5
3.0        2
Name: cathbasassistind, dtype: int64

In [371]:
post_op_cat_manual['cathbasassistind'].dtypes

dtype('float64')

In [372]:
post_op_cat_manual.columns.tolist()[4]

'cathbasassistwhen'

- `cathbasassistwhen`

In [373]:
post_op_cat_manual['cathbasassistwhen'].unique()

array([nan,  3.,  1.,  2.])

In [374]:
post_op_cat_manual['cathbasassistwhen'].value_counts()

1.0    25
3.0    24
2.0     7
Name: cathbasassistwhen, dtype: int64

In [375]:
post_op_cat_manual['cathbasassistwhen'] = post_op_cat_manual['cathbasassistwhen'].replace({1: 1,
                                                                                           2: 2,
                                                                                           3: 3,
                                                                                           np.nan: 0})

In [376]:
post_op_cat_manual['cathbasassistwhen'].value_counts()

0.0    42684
1.0       25
3.0       24
2.0        7
Name: cathbasassistwhen, dtype: int64

In [377]:
post_op_cat_manual['cathbasassistwhen'].dtypes

dtype('float64')

In [378]:
post_op_cat_manual.columns.tolist()[5]

'cotafib'

- `cotafib`

In [379]:
post_op_cat_manual['cotafib'].unique()

array([2, 1])

In [380]:
post_op_cat_manual['cotafib'].value_counts()

2    30314
1    12426
Name: cotafib, dtype: int64

In [381]:
post_op_cat_manual['cotafib'] = post_op_cat_manual['cotafib'].replace({1: 1,
                                                                       2: 0})

In [382]:
post_op_cat_manual['cotafib'].value_counts()

0    30314
1    12426
Name: cotafib, dtype: int64

In [383]:
post_op_cat_manual['cotafib'].dtypes

dtype('int64')

In [384]:
post_op_cat_manual.columns.tolist()[6]

'cotarrst'

- `cotarrst`

In [385]:
post_op_cat_manual['cotarrst'].unique()

array([2, 1])

In [386]:
post_op_cat_manual['cotarrst'].value_counts()

2    41957
1      783
Name: cotarrst, dtype: int64

In [387]:
post_op_cat_manual['cotarrst'] = post_op_cat_manual['cotarrst'].replace({1: 1,
                                                                         2: 0})

In [388]:
post_op_cat_manual['cotarrst'].value_counts()

0    41957
1      783
Name: cotarrst, dtype: int64

In [389]:
post_op_cat_manual['cotarrst'].dtypes

dtype('int64')

In [390]:
post_op_cat_manual.columns.tolist()[7]

'cpbutil'

- `cpbutil`

In [391]:
post_op_cat_manual['cpbutil'].unique()

array([3, 1, 2])

In [392]:
post_op_cat_manual['cpbutil'].value_counts()

3    40447
1     2223
2       70
Name: cpbutil, dtype: int64

In [393]:
post_op_cat_manual['cpbutil'] = post_op_cat_manual['cpbutil'].replace({1: 0,
                                                                       2: 1,
                                                                       3: 2})

In [394]:
post_op_cat_manual['cpbutil'].value_counts()

2    40447
0     2223
1       70
Name: cpbutil, dtype: int64

In [395]:
post_op_cat_manual['cpbutil'].dtypes

dtype('int64')

In [396]:
post_op_cat_manual.columns.tolist()[8]

'cperftyp'

- `cperftyp`

In [397]:
post_op_cat_manual['cperftyp'].unique()

array([nan,  1.,  3.,  2.])

In [398]:
post_op_cat_manual['cperftyp'].value_counts()

1.0    8
2.0    3
3.0    1
Name: cperftyp, dtype: int64

In [399]:
post_op_cat_manual['cperftyp'] = post_op_cat_manual['cperftyp'].replace({1: 1,
                                                                         2: 2,
                                                                         3: 3,
                                                                         np.nan: 0})

In [400]:
post_op_cat_manual['cperftyp'].value_counts()

0.0    42728
1.0        8
2.0        3
3.0        1
Name: cperftyp, dtype: int64

In [401]:
post_op_cat_manual['cperftyp'].dtypes

dtype('float64')

In [402]:
post_op_cat_manual.columns.tolist()[9]

'ecmo'

- `ecmo`

In [403]:
post_op_cat_manual['ecmo'].unique()

array([ 2.,  1., nan,  4.,  3.])

In [404]:
post_op_cat_manual['ecmo'].value_counts()

2.0    42662
4.0       38
1.0       27
3.0        1
Name: ecmo, dtype: int64

In [405]:
post_op_cat_manual['ecmo'] = post_op_cat_manual['ecmo'].replace({1: 1,
                                                                 2: 0,
                                                                 3: 1,
                                                                 4: 1,
                                                                 np.nan: 0})

In [406]:
post_op_cat_manual['ecmo'].value_counts()

0.0    42674
1.0       66
Name: ecmo, dtype: int64

In [407]:
post_op_cat_manual['ecmo'].dtypes

dtype('float64')

In [408]:
post_op_cat_manual.columns.tolist()[10]

'ecmoind'

- `ecmoind`

In [409]:
post_op_cat_manual['ecmoind'].unique()

array([nan,  1.,  4.,  2.])

In [410]:
post_op_cat_manual['ecmoind'].value_counts()

1.0    40
2.0    14
4.0    12
Name: ecmoind, dtype: int64

In [411]:
post_op_cat_manual['ecmoind'] = post_op_cat_manual['ecmoind'].replace({1: 1,
                                                                       2: 2,
                                                                       4: 4,
                                                                       np.nan: 0})

In [412]:
post_op_cat_manual['ecmoind'].value_counts()

0.0    42674
1.0       40
2.0       14
4.0       12
Name: ecmoind, dtype: int64

In [413]:
post_op_cat_manual['ecmoind'].dtypes

dtype('float64')

In [414]:
post_op_cat_manual.columns.tolist()[11]

'ecmowhen'

- `ecmowhen`

In [415]:
post_op_cat_manual['ecmowhen'].unique()

array([nan,  2.,  3.,  1.])

In [416]:
post_op_cat_manual['ecmowhen'].value_counts()

3.0    31
2.0    30
1.0     5
Name: ecmowhen, dtype: int64

In [417]:
post_op_cat_manual['ecmowhen'] = post_op_cat_manual['ecmowhen'].replace({1: 1,
                                                                         2: 2,
                                                                         3: 3,
                                                                         np.nan: 0})

In [418]:
post_op_cat_manual['ecmowhen'].value_counts()

0.0    42674
3.0       31
2.0       30
1.0        5
Name: ecmowhen, dtype: int64

In [419]:
post_op_cat_manual['ecmowhen'].dtypes

dtype('float64')

In [420]:
post_op_cat_manual.columns.tolist()[12]

'emergrsn'

- `emergrsn`

In [421]:
sorted(list(post_op_cat_manual['emergrsn'].unique()))

[nan, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0]

In [422]:
post_op_cat_manual['emergrsn'].value_counts()

4.0     173
5.0     111
1.0      73
13.0     68
8.0      64
6.0      26
12.0     13
3.0      10
2.0       7
7.0       1
10.0      1
11.0      1
9.0       1
Name: emergrsn, dtype: int64

In [423]:
post_op_cat_manual['emergrsn'] = post_op_cat_manual['emergrsn'].replace({1: 1,
                                                                         2: 2,
                                                                         3: 3,
                                                                         4: 4,
                                                                         5: 5,
                                                                         6: 6,
                                                                         7: 7,
                                                                         8: 8,
                                                                         9: 9,
                                                                         10: 10,
                                                                         11: 11,
                                                                         12: 12,
                                                                         13: 13,
                                                                         np.nan: 0})

In [424]:
post_op_cat_manual['emergrsn'].value_counts()

0.0     42191
4.0       173
5.0       111
1.0        73
13.0       68
8.0        64
6.0        26
12.0       13
3.0        10
2.0         7
7.0         1
10.0        1
11.0        1
9.0         1
Name: emergrsn, dtype: int64

In [425]:
post_op_cat_manual['emergrsn'].dtypes

dtype('float64')

In [426]:
post_op_cat_manual.columns.tolist()[13]

'iabpind'

- `iabpind`

In [427]:
sorted(list(post_op_cat_manual['iabpind'].unique()))

[nan, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]

In [428]:
post_op_cat_manual['iabpind'].value_counts()

1.0    1235
3.0     656
5.0     610
4.0     202
2.0     183
6.0      18
Name: iabpind, dtype: int64

In [429]:
post_op_cat_manual['iabpind'] = post_op_cat_manual['iabpind'].replace({1: 1,
                                                                       2: 2,
                                                                       3: 3,
                                                                       4: 4,
                                                                       5: 5,
                                                                       6: 0,
                                                                       np.nan: 0})

In [430]:
post_op_cat_manual['iabpind'].value_counts()

0.0    39854
1.0     1235
3.0      656
5.0      610
4.0      202
2.0      183
Name: iabpind, dtype: int64

In [431]:
post_op_cat_manual['iabpind'].dtypes

dtype('float64')

In [432]:
post_op_cat_manual.columns.tolist()[14]

'iabpwhen'

- `iabpwhen`

In [433]:
sorted(list(post_op_cat_manual['iabpwhen'].unique()))

[nan, 1.0, 2.0, 3.0]

In [434]:
post_op_cat_manual['iabpwhen'].value_counts()

1.0    2189
2.0     574
3.0     141
Name: iabpwhen, dtype: int64

In [435]:
post_op_cat_manual['iabpwhen'] = post_op_cat_manual['iabpwhen'].replace({1: 1,
                                                                         2: 2,
                                                                         3: 3,
                                                                         np.nan: 0})

In [436]:
post_op_cat_manual['iabpwhen'].value_counts()

0.0    39836
1.0     2189
2.0      574
3.0      141
Name: iabpwhen, dtype: int64

In [437]:
post_op_cat_manual['iabpwhen'].dtypes

dtype('float64')

In [438]:
post_op_cat_manual.columns.tolist()[15]

'mt30stat'

- `mt30stat`

In [439]:
sorted(list(post_op_cat_manual['mt30stat'].unique()))

[1.0, 2.0, 3.0, nan]

In [440]:
post_op_cat_manual['mt30stat'].value_counts()

1.0    41545
2.0      931
3.0      256
Name: mt30stat, dtype: int64

In [441]:
post_op_cat_manual['mt30stat'] = post_op_cat_manual['mt30stat'].replace({1: 1,
                                                                         2: 0,
                                                                         3: 0,
                                                                         np.nan: 0})

In [442]:
post_op_cat_manual['mt30stat'].value_counts()

1.0    41545
0.0     1195
Name: mt30stat, dtype: int64

In [443]:
post_op_cat_manual['mt30stat'].dtypes

dtype('float64')

In [444]:
post_op_cat_manual.columns.tolist()[16]

'mtcause'

- `mtcause`

In [445]:
sorted(list(post_op_cat_manual['mtcause'].unique()))

[nan, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 700.0, 777.0]

In [446]:
post_op_cat_manual['mtcause'].value_counts()

1.0      459
6.0      184
700.0    162
777.0    108
2.0       97
5.0       67
4.0       26
3.0       16
Name: mtcause, dtype: int64

In [447]:
post_op_cat_manual['mtcause'] = post_op_cat_manual['mtcause'].replace({1: 1,
                                                                       2: 2,
                                                                       3: 3,
                                                                       4: 4,
                                                                       5: 5,
                                                                       6: 6,
                                                                       700: 0,
                                                                       777: 0,
                                                                       np.nan: 0})

In [448]:
post_op_cat_manual['mtcause'].value_counts()

0.0    41891
1.0      459
6.0      184
2.0       97
5.0       67
4.0       26
3.0       16
Name: mtcause, dtype: int64

In [449]:
post_op_cat_manual['mtcause'].dtypes

dtype('float64')

In [450]:
post_op_cat_manual.columns.tolist()[17]

'opcab'

- `opcab`

In [451]:
sorted(list(post_op_cat_manual['opcab'].unique()))

[1, 2, 3, 4]

In [452]:
post_op_cat_manual['opcab'].value_counts()

1    18857
3    15319
2     8547
4       17
Name: opcab, dtype: int64

In [453]:
post_op_cat_manual['opcab'] = post_op_cat_manual['opcab'].replace({1: 1,
                                                                   2: 0,
                                                                   3: 1,
                                                                   4: 1})

In [454]:
post_op_cat_manual['opcab'].value_counts()

1    34193
0     8547
Name: opcab, dtype: int64

In [455]:
post_op_cat_manual['opcab'].isnull().sum()

0

In [456]:
post_op_cat_manual['opcab'].dtypes

dtype('int64')

In [457]:
post_op_cat_manual.columns.tolist()[18]

'opvalve'

- `opvalve`

In [458]:
sorted(list(post_op_cat_manual['opvalve'].unique()))

[1, 2]

In [459]:
post_op_cat_manual['opvalve'].value_counts()

2    28848
1    13892
Name: opvalve, dtype: int64

In [460]:
post_op_cat_manual['opvalve'] = post_op_cat_manual['opvalve'].replace({1: 1,
                                                                       2: 0})

In [461]:
post_op_cat_manual['opvalve'].value_counts()

0    28848
1    13892
Name: opvalve, dtype: int64

In [462]:
post_op_cat_manual['opvalve'].isnull().sum()

0

In [463]:
post_op_cat_manual['opvalve'].dtypes

dtype('int64')

In [464]:
post_op_cat_manual.columns.tolist()[19]

'readmrsn'

- `readmrsn`

In [465]:
sorted(list(post_op_cat_manual['readmrsn'].unique()))

[nan,
 2.0,
 3.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 14.0,
 15.0,
 18.0,
 19.0,
 20.0,
 21.0,
 22.0,
 23.0,
 24.0,
 25.0,
 27.0,
 28.0,
 29.0,
 30.0,
 31.0,
 32.0,
 997.0,
 998.0,
 999.0]

In [466]:
post_op_cat_manual['readmrsn'].value_counts()

998.0    1255
2.0       682
3.0       635
999.0     478
7.0       442
5.0       193
31.0      188
28.0      184
6.0       162
23.0      128
9.0       127
29.0      109
27.0       87
18.0       82
21.0       77
30.0       74
15.0       50
14.0       35
19.0       26
32.0       16
22.0       15
20.0       10
997.0       8
24.0        7
8.0         7
25.0        1
Name: readmrsn, dtype: int64

In [467]:
post_op_cat_manual['readmrsn'].isnull().sum()

37662

In [468]:
post_op_cat_manual['readmrsn'] = post_op_cat_manual['readmrsn'].replace({2: 0,
                                                                         3: 0,
                                                                         5: 0,
                                                                         6: 0,
                                                                         7: 0,
                                                                         8: 0,
                                                                         9: 0,
                                                                         14: 0,
                                                                         15: 2,
                                                                         18: 1,
                                                                         19: 0,
                                                                         20: 0,
                                                                         21: 0,
                                                                         22: 0,
                                                                         23: 0,
                                                                         24: 0,
                                                                         25: 0,
                                                                         27: 0,
                                                                         28: 0,
                                                                         29: 0,
                                                                         30: 0,
                                                                         31: 0,
                                                                         32: 0,
                                                                         997: 0,
                                                                         998: 0, 
                                                                         999: 0,
                                                                         np.nan: 0})

In [469]:
post_op_cat_manual['readmrsn'].value_counts()

0.0    42608
1.0       82
2.0       50
Name: readmrsn, dtype: int64

In [470]:
post_op_cat_manual['readmrsn'].isnull().sum()

0

In [471]:
post_op_cat_manual['readmrsn'].value_counts()

0.0    42608
1.0       82
2.0       50
Name: readmrsn, dtype: int64

In [472]:
post_op_cat_manual.columns.tolist()[20]

'unplproc'

- `unplproc`

In [473]:
sorted(list(post_op_cat_manual['unplproc'].unique()))

[1.0, 2.0, nan, 3.0]

In [474]:
post_op_cat_manual['unplproc'].value_counts()

1.0    23389
3.0      108
2.0      100
Name: unplproc, dtype: int64

In [475]:
post_op_cat_manual['unplproc'].isnull().sum()

19143

In [476]:
post_op_cat_manual['unplproc'] = post_op_cat_manual['unplproc'].replace({1: 0,
                                                                         2: 1,
                                                                         3: 2,
                                                                         np.nan: 0})

In [477]:
post_op_cat_manual['unplproc'].value_counts()

0.0    42532
2.0      108
1.0      100
Name: unplproc, dtype: int64

In [478]:
post_op_cat_manual['unplproc'].isnull().sum()

0

In [479]:
post_op_cat_manual['unplproc'].dtypes

dtype('float64')

In [480]:
post_op_cat_manual.columns.tolist()[21]

'urgntrsn'

- `urgntrsn`

In [481]:
sorted(list(post_op_cat_manual['urgntrsn'].unique()))

[nan,
 1.0,
 2.0,
 3.0,
 4.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 14.0,
 15.0]

In [482]:
post_op_cat_manual['urgntrsn'].value_counts()

5.0     3972
1.0     3259
6.0     2122
4.0      670
8.0      655
7.0      295
3.0      208
2.0      179
13.0      56
15.0      42
14.0      31
12.0      17
10.0      10
9.0        4
11.0       1
Name: urgntrsn, dtype: int64

In [483]:
post_op_cat_manual['urgntrsn'].isnull().sum()

31219

In [484]:
post_op_cat_manual['urgntrsn'] = post_op_cat_manual['urgntrsn'].replace({1: 1,
                                                                         2: 2,
                                                                         3: 3,
                                                                         4: 4,
                                                                         5: 5,
                                                                         6: 6,
                                                                         7: 7,
                                                                         8: 8,
                                                                         9: 9,
                                                                         10: 10,
                                                                         11: 11,
                                                                         12: 12,
                                                                         13: 13,
                                                                         14: 14,
                                                                         15: 15,
                                                                         np.nan: 0})

In [485]:
post_op_cat_manual['urgntrsn'].value_counts()

0.0     31219
5.0      3972
1.0      3259
6.0      2122
4.0       670
8.0       655
7.0       295
3.0       208
2.0       179
13.0       56
15.0       42
14.0       31
12.0       17
10.0       10
9.0         4
11.0        1
Name: urgntrsn, dtype: int64

In [486]:
post_op_cat_manual['urgntrsn'].isnull().sum()

0

In [487]:
post_op_cat_manual['urgntrsn'].dtypes

dtype('float64')

In [488]:
post_op_cat_manual.columns.tolist()[22]

'vadproc'

- `vadproc`

In [489]:
sorted(list(post_op_cat_manual['vadproc'].unique()))

[1.0, nan, 2.0, 5.0]

In [490]:
post_op_cat_manual['vadproc'].value_counts()

1.0    42686
5.0       35
2.0        9
Name: vadproc, dtype: int64

In [491]:
post_op_cat_manual['vadproc'].isnull().sum()

10

In [492]:
post_op_cat_manual['vadproc'] = post_op_cat_manual['vadproc'].replace({1: 0,
                                                                       2: 1,
                                                                       5: 1,
                                                                       np.nan: 0})

In [493]:
post_op_cat_manual['vadproc'].value_counts()

0.0    42696
1.0       44
Name: vadproc, dtype: int64

In [494]:
post_op_cat_manual['vadproc'].isnull().sum()

0

In [495]:
post_op_cat_manual['vadproc'].dtypes

dtype('float64')

In [496]:
post_op_cat_manual.columns.tolist()[23]

'valexppos2'

- `valexppos2`

In [497]:
sorted(list(post_op_cat_manual['valexppos2'].unique()))

[nan, 1.0]

In [498]:
post_op_cat_manual['valexppos2'].value_counts()

1.0    1
Name: valexppos2, dtype: int64

In [499]:
post_op_cat_manual['valexppos2'].isnull().sum()

42739

In [500]:
post_op_cat_manual['valexppos2'] = post_op_cat_manual['valexppos2'].replace({1: 1,
                                                                             np.nan: 0})

In [501]:
post_op_cat_manual['valexppos2'].value_counts()

0.0    42739
1.0        1
Name: valexppos2, dtype: int64

In [502]:
post_op_cat_manual['valexppos2'].isnull().sum()

0

In [503]:
post_op_cat_manual['valexppos2'].dtypes

dtype('float64')

In [504]:
post_op_cat_manual.columns.tolist()[24]

'vsavpr'

- `vsavpr`

In [505]:
sorted(list(post_op_cat_manual['vsavpr'].unique()))

[nan, 1.0, 2.0, 3.0, 6.0, 9.0]

In [506]:
post_op_cat_manual['vsavpr'].value_counts()

1.0    9577
9.0      10
6.0       1
3.0       1
2.0       1
Name: vsavpr, dtype: int64

In [507]:
post_op_cat_manual['vsavpr'].isnull().sum()

33150

In [508]:
post_op_cat_manual['vsavpr'] = post_op_cat_manual['vsavpr'].replace({1: 1,
                                                                     2: 0,
                                                                     3: 1,
                                                                     6: 0,
                                                                     9: 1,
                                                                     np.nan: 0})

In [509]:
post_op_cat_manual['vsavpr'].value_counts()

0.0    33152
1.0     9588
Name: vsavpr, dtype: int64

In [510]:
post_op_cat_manual['vsavpr'].isnull().sum()

0

In [511]:
post_op_cat_manual['vsavpr'].dtypes

dtype('float64')

In [512]:
post_op_cat_manual.columns.tolist()[25]

'vsmv'

- `vsmv`

In [513]:
sorted(list(post_op_cat_manual['vsmv'].unique()))

[1.0, nan, 2.0, 3.0, 4.0, 5.0]

In [514]:
post_op_cat_manual['vsmv'].value_counts()

2.0    9584
1.0    2410
3.0    1853
5.0      31
4.0      10
Name: vsmv, dtype: int64

In [515]:
post_op_cat_manual['vsmv'].isnull().sum()

28852

In [516]:
post_op_cat_manual['vsmv'] = post_op_cat_manual['vsmv'].replace({1: 1,
                                                                 2: 0,
                                                                 3: 1,
                                                                 4: 1,
                                                                 5: 1,
                                                                 np.nan: 0})

In [517]:
post_op_cat_manual['vsmv'].value_counts()

0.0    38436
1.0     4304
Name: vsmv, dtype: int64

In [518]:
post_op_cat_manual['vsmv'].isnull().sum()

0

In [519]:
post_op_cat_manual['vsmv'].dtypes

dtype('float64')

### Validating Coding of `post_op_cat_manual`

In [520]:
feature_name = []
orig_coding = []
new_coding = []
num_levels = []
dtypes = []

for column in post_op_cat_manual.columns.tolist():
    feature_name.append(column)
    orig_coding.append(sorted(orig_post_op_cat_manual[column].unique()))
    new_coding.append(sorted(post_op_cat_manual[column].unique()))
    num_levels.append(len(post_op_cat_manual[column].unique()))
    dtypes.append(post_op_cat_manual[column].dtype)

In [521]:
post_op_cat_manual_validate = pd.DataFrame(list(zip(feature_name, orig_coding, new_coding, num_levels, dtypes)),
                                           columns = ['feature', 
                                                      'original_levels', 
                                                      'new_levels', 
                                                      'num_levels', 
                                                      'data_type'])

In [522]:
post_op_cat_manual_validate

Unnamed: 0,feature,original_levels,new_levels,num_levels,data_type
0,aortoccl,"[2.0, 3.0, 5.0, 6.0, nan]","[0.0, 1.0, 2.0]",3,float64
1,asmtaodx,"[6.0, nan, 1.0, 2.0, 3.0, 4.0, 5.0]","[0.0, 1.0, 2.0, 3.0, 4.0, 5.0]",6,float64
2,asmtascaa,"[1.0, 2.0, nan, 3.0]","[0.0, 1.0]",2,float64
3,cathbasassistind,"[nan, 1.0, 2.0, 3.0, 4.0, 5.0]","[0.0, 1.0, 2.0, 3.0]",4,float64
4,cathbasassistwhen,"[nan, 1.0, 2.0, 3.0]","[0.0, 1.0, 2.0, 3.0]",4,float64
5,cotafib,"[1, 2]","[0, 1]",2,int64
6,cotarrst,"[1, 2]","[0, 1]",2,int64
7,cpbutil,"[1, 2, 3]","[0, 1, 2]",3,int64
8,cperftyp,"[nan, 1.0, 2.0, 3.0]","[0.0, 1.0, 2.0, 3.0]",4,float64
9,ecmo,"[1.0, 2.0, nan, 3.0, 4.0]","[0.0, 1.0]",2,float64


In [523]:
post_op_cat_features.shape, post_op_cat_bulk.shape, post_op_cat_manual.shape

((42740, 56), (42740, 26), (42740, 26))

In [524]:
post_op_cat_features.shape[1] - post_op_cat_bulk.shape[1] - post_op_cat_manual.shape[1] - 4 # del cols above all `0`

0

### Creating `post_op_TREE_cat_features`
- this will be the `post-op` `categorical` features for use in `sklearn` `DecisionTree`
- using `pd.concat` to join `post_op_cat_bulk` with `post_op_cat_manual`

In [525]:
post_op_TREE_cat_features = pd.concat((post_op_cat_bulk,
                                       post_op_cat_manual),
                                      axis=1)

In [526]:
post_op_TREE_cat_features.head()

Unnamed: 0,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,ceroxused,circarr,cofirstind,concalc,cperfutil,IABP,ibldprod,imedeaca,imedtran,inoptee,mtopd,ocarasd,ocarvsd,opocard,oponcard,unplao,unplav,unplmv,unplvad,valexp2,vsmvpr,aortoccl,asmtaodx,asmtascaa,cathbasassistind,cathbasassistwhen,cotafib,cotarrst,cpbutil,cperftyp,ecmo,ecmoind,ecmowhen,emergrsn,iabpind,iabpwhen,mt30stat,mtcause,opcab,opvalve,readmrsn,unplproc,urgntrsn,vadproc,valexppos2,vsavpr,vsmv
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,0,2,0.0,0.0,0.0,0.0,4.0,3.0,1.0,1.0,0.0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [527]:
post_op_TREE_cat_features.shape

(42740, 52)

In [528]:
sum(post_op_TREE_cat_features.isnull().sum())

0

### Separating `post_op_cat_manual` features that require `dummies` from those that are binary

In [529]:
cat_no_dummies = list(post_op_cat_manual_validate['feature'][post_op_cat_manual_validate['num_levels'] == 2])

In [530]:
cat_no_dummies

['asmtascaa',
 'cotafib',
 'cotarrst',
 'ecmo',
 'mt30stat',
 'opcab',
 'opvalve',
 'vadproc',
 'valexppos2',
 'vsavpr',
 'vsmv']

In [531]:
len(cat_no_dummies)

11

#### `post_op_cat_man_noD`

In [532]:
post_op_cat_man_noD = post_op_cat_manual.copy()[cat_no_dummies]

In [533]:
post_op_cat_manual.shape, post_op_cat_man_noD.shape, len(cat_no_dummies)

((42740, 26), (42740, 11), 11)

In [534]:
post_op_cat_man_noD.head()

Unnamed: 0,asmtascaa,cotafib,cotarrst,ecmo,mt30stat,opcab,opvalve,vadproc,valexppos2,vsavpr,vsmv
0,1.0,0,0,0.0,1.0,1,1,0.0,0.0,0.0,1.0
1,0.0,1,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0
2,0.0,1,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0
3,0.0,0,0,0.0,1.0,1,1,0.0,0.0,0.0,1.0
4,0.0,0,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0


#### `post_op_cat_man_Dummies`

In [535]:
cat_dummies = list(post_op_cat_manual_validate['feature'][post_op_cat_manual_validate['num_levels'] > 2])

In [536]:
len(cat_dummies)

15

In [537]:
len(cat_dummies) + len(cat_no_dummies), post_op_cat_manual.shape

(26, (42740, 26))

In [538]:
cat_dummies

['aortoccl',
 'asmtaodx',
 'cathbasassistind',
 'cathbasassistwhen',
 'cpbutil',
 'cperftyp',
 'ecmoind',
 'ecmowhen',
 'emergrsn',
 'iabpind',
 'iabpwhen',
 'mtcause',
 'readmrsn',
 'unplproc',
 'urgntrsn']

- creating `post_op_cat_man_Dummies` `DataFrame`

In [539]:
post_op_cat_man_Dummies = post_op_cat_manual.copy()[cat_dummies]

In [540]:
post_op_cat_manual.shape, post_op_cat_man_Dummies.shape, len(cat_dummies)

((42740, 26), (42740, 15), 15)

In [541]:
post_op_cat_man_Dummies.head()

Unnamed: 0,aortoccl,asmtaodx,cathbasassistind,cathbasassistwhen,cpbutil,cperftyp,ecmoind,ecmowhen,emergrsn,iabpind,iabpwhen,mtcause,readmrsn,unplproc,urgntrsn
0,1.0,1.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,2,0.0,0.0,0.0,4.0,3.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Making Dummies
- need to painfully recode to text first - helps with column names to make them human readable

- `aortoccl`

In [542]:
post_op_cat_man_Dummies['aortoccl'].value_counts()

1.0    39596
0.0     3030
2.0      114
Name: aortoccl, dtype: int64

In [543]:
post_op_cat_man_Dummies['aortoccl'] = post_op_cat_man_Dummies['aortoccl'].replace({0: 'NONE',
                                                                                   1: 'AoXC',
                                                                                   2: 'Balloon'})

In [544]:
post_op_cat_man_Dummies['aortoccl'].value_counts()

AoXC       39596
NONE        3030
Balloon      114
Name: aortoccl, dtype: int64

In [545]:
post_op_cat_man_Dummies.columns.tolist()[1]

'asmtaodx'

- `asmtaodx`

In [546]:
post_op_cat_man_Dummies['asmtaodx'].value_counts()

1.0    17629
0.0    16360
2.0     4192
3.0     3714
4.0      605
5.0      240
Name: asmtaodx, dtype: int64

In [547]:
post_op_cat_man_Dummies['asmtaodx'] = post_op_cat_man_Dummies['asmtaodx'].replace({0: 'NONE',
                                                                                   1: 'Normal',
                                                                                   2: 'Thickening',
                                                                                   3: 'atheroma<5mm',
                                                                                   4: 'atheroma>5mm',
                                                                                   5: 'mobile_plaques'})

In [548]:
post_op_cat_man_Dummies['asmtaodx'].value_counts()

Normal            17629
NONE              16360
Thickening         4192
atheroma<5mm       3714
atheroma>5mm        605
mobile_plaques      240
Name: asmtaodx, dtype: int64

In [549]:
post_op_cat_man_Dummies.columns.tolist()[2]

'cathbasassistind'

- `cathbasassistind`

In [550]:
post_op_cat_man_Dummies['cathbasassistind'].value_counts()

0.0    42689
1.0       44
2.0        5
3.0        2
Name: cathbasassistind, dtype: int64

In [551]:
post_op_cat_man_Dummies['cathbasassistind'] = post_op_cat_man_Dummies['cathbasassistind'].replace({0: 'NONE',
                                                                                                   1: 'BP_instability',
                                                                                                   2: 'CPB_wean',
                                                                                                   3: 'PCI_failure'})

In [552]:
post_op_cat_man_Dummies['cathbasassistind'].value_counts()

NONE              42689
BP_instability       44
CPB_wean              5
PCI_failure           2
Name: cathbasassistind, dtype: int64

In [553]:
post_op_cat_man_Dummies.columns.tolist()[3]

'cathbasassistwhen'

- `cathbasassistwhen`

In [554]:
post_op_cat_man_Dummies['cathbasassistwhen'].value_counts()

0.0    42684
1.0       25
3.0       24
2.0        7
Name: cathbasassistwhen, dtype: int64

In [555]:
post_op_cat_man_Dummies['cathbasassistwhen'] = post_op_cat_man_Dummies['cathbasassistwhen'].replace({0: 'NONE',
                                                                                                     1: 'Pre_OP',
                                                                                                     2: 'Intra_OP',
                                                                                                     3: 'Post_OP'})

In [556]:
post_op_cat_man_Dummies['cathbasassistwhen'].value_counts()

NONE        42684
Pre_OP         25
Post_OP        24
Intra_OP        7
Name: cathbasassistwhen, dtype: int64

In [557]:
post_op_cat_man_Dummies.columns.tolist()[4]

'cpbutil'

- `cpbutil`

In [558]:
post_op_cat_man_Dummies['cpbutil'].value_counts()

2    40447
0     2223
1       70
Name: cpbutil, dtype: int64

In [559]:
post_op_cat_man_Dummies['cpbutil'] = post_op_cat_man_Dummies['cpbutil'].replace({0: 'NONE',
                                                                                 1: 'Combination',
                                                                                 2: 'Full_CPB'})

In [560]:
post_op_cat_man_Dummies['cpbutil'].value_counts()

Full_CPB       40447
NONE            2223
Combination       70
Name: cpbutil, dtype: int64

In [561]:
post_op_cat_man_Dummies.columns.tolist()[5]

'cperftyp'

- `cperftyp`

In [562]:
post_op_cat_man_Dummies['cperftyp'].value_counts()

0.0    42728
1.0        8
2.0        3
3.0        1
Name: cperftyp, dtype: int64

In [563]:
post_op_cat_man_Dummies['cperftyp'] = post_op_cat_man_Dummies['cperftyp'].replace({0: 'NONE',
                                                                                   1: 'Antegrade',
                                                                                   2: 'Retrograde',
                                                                                   3: 'Both'})

In [564]:
post_op_cat_man_Dummies['cperftyp'].value_counts()

NONE          42728
Antegrade         8
Retrograde        3
Both              1
Name: cperftyp, dtype: int64

In [565]:
post_op_cat_man_Dummies.columns.tolist()[6]

'ecmoind'

- `ecmoind`

In [566]:
post_op_cat_man_Dummies['ecmoind'].value_counts()

0.0    42674
1.0       40
2.0       14
4.0       12
Name: ecmoind, dtype: int64

In [567]:
post_op_cat_man_Dummies['ecmoind'] = post_op_cat_man_Dummies['ecmoind'].replace({0: 'NONE',
                                                                                 1: 'Cadiac_Failure',
                                                                                 2: 'Resp_Failure',
                                                                                 4: 'Rescue_Salvage'})

In [568]:
post_op_cat_man_Dummies['ecmoind'].value_counts()

NONE              42674
Cadiac_Failure       40
Resp_Failure         14
Rescue_Salvage       12
Name: ecmoind, dtype: int64

In [569]:
post_op_cat_man_Dummies.columns.tolist()[7]

'ecmowhen'

- `ecmowhen`

In [570]:
post_op_cat_man_Dummies['ecmowhen'].value_counts()

0.0    42674
3.0       31
2.0       30
1.0        5
Name: ecmowhen, dtype: int64

In [571]:
post_op_cat_man_Dummies['ecmowhen'] = post_op_cat_man_Dummies['ecmowhen'].replace({0: 'NONE',
                                                                                   1: 'Pre_OP',
                                                                                   2: 'Intra_OP',
                                                                                   3: 'Post_OP'})

In [572]:
post_op_cat_man_Dummies['ecmowhen'].value_counts()

NONE        42674
Post_OP        31
Intra_OP       30
Pre_OP          5
Name: ecmowhen, dtype: int64

In [573]:
post_op_cat_man_Dummies.columns.tolist()[8]

'emergrsn'

- `emergrsn`

In [574]:
post_op_cat_man_Dummies['emergrsn'].value_counts()

0.0     42191
4.0       173
5.0       111
1.0        73
13.0       68
8.0        64
6.0        26
12.0       13
3.0        10
2.0         7
7.0         1
10.0        1
11.0        1
9.0         1
Name: emergrsn, dtype: int64

In [575]:
sorted(list(post_op_cat_man_Dummies['emergrsn'].unique()))

[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0]

In [576]:
post_op_cat_man_Dummies['emergrsn'] = post_op_cat_man_Dummies['emergrsn'].replace({0: 'NONE',
                                                                                   1: 'Shock_Circ_Supp',
                                                                                   2: 'Shock_No_Circ_Supp',
                                                                                   3: 'Pulm_Edema',
                                                                                   4: 'Evolvg_MI',
                                                                                   5: 'Ischemia',
                                                                                   6: 'Valve_Dysf',
                                                                                   7: 'Ao_Dissect',
                                                                                   8: 'Angio_Accident',
                                                                                   9: 'Trauma',
                                                                                   10: 'Infect_Device',
                                                                                   11: 'Syncope',
                                                                                   12: 'Hybrid',
                                                                                   13: 'Anatomy'})

In [577]:
post_op_cat_man_Dummies['emergrsn'].value_counts()

NONE                  42191
Evolvg_MI               173
Ischemia                111
Shock_Circ_Supp          73
Anatomy                  68
Angio_Accident           64
Valve_Dysf               26
Hybrid                   13
Pulm_Edema               10
Shock_No_Circ_Supp        7
Syncope                   1
Trauma                    1
Ao_Dissect                1
Infect_Device             1
Name: emergrsn, dtype: int64

In [578]:
post_op_cat_man_Dummies.columns.tolist()[9]

'iabpind'

- `iabpind`

In [579]:
post_op_cat_man_Dummies['iabpind'].value_counts()

0.0    39854
1.0     1235
3.0      656
5.0      610
4.0      202
2.0      183
Name: iabpind, dtype: int64

In [580]:
post_op_cat_man_Dummies['iabpind'] = post_op_cat_man_Dummies['iabpind'].replace({0: 'NONE',
                                                                                 1: 'Hemodyn_Instab',
                                                                                 2: 'Procedure_Support',
                                                                                 3: 'Angina',
                                                                                 4: 'CPB_Wean_Failure',
                                                                                 5: 'Prophylactic'})

In [581]:
post_op_cat_man_Dummies['iabpind'].value_counts()

NONE                 39854
Hemodyn_Instab        1235
Angina                 656
Prophylactic           610
CPB_Wean_Failure       202
Procedure_Support      183
Name: iabpind, dtype: int64

In [582]:
post_op_cat_man_Dummies.columns.tolist()[10]

'iabpwhen'

- `iabpwhen`

In [583]:
post_op_cat_man_Dummies['iabpwhen'].value_counts()

0.0    39836
1.0     2189
2.0      574
3.0      141
Name: iabpwhen, dtype: int64

In [584]:
post_op_cat_man_Dummies['iabpwhen'] = post_op_cat_man_Dummies['iabpwhen'].replace({0: 'NONE',
                                                                                   1: 'Pre_OP',
                                                                                   2: 'Intra_OP',
                                                                                   3: 'Post_OP'})

In [585]:
post_op_cat_man_Dummies['iabpwhen'].value_counts()

NONE        39836
Pre_OP       2189
Intra_OP      574
Post_OP       141
Name: iabpwhen, dtype: int64

In [586]:
post_op_cat_man_Dummies.columns.tolist()[11]

'mtcause'

- `mtcause`

In [587]:
post_op_cat_man_Dummies['mtcause'].value_counts()

0.0    41891
1.0      459
6.0      184
2.0       97
5.0       67
4.0       26
3.0       16
Name: mtcause, dtype: int64

In [588]:
post_op_cat_man_Dummies['mtcause'] = post_op_cat_man_Dummies['mtcause'].replace({0: 'NONE',
                                                                                 1: 'Cardiac',
                                                                                 2: 'Neuro',
                                                                                 3: 'Renal',
                                                                                 4: 'Vascular',
                                                                                 5: 'Infection',
                                                                                 6: 'Pulmonary'})

In [589]:
post_op_cat_man_Dummies['mtcause'].value_counts()

NONE         41891
Cardiac        459
Pulmonary      184
Neuro           97
Infection       67
Vascular        26
Renal           16
Name: mtcause, dtype: int64

In [590]:
post_op_cat_man_Dummies.columns.tolist()[12]

'readmrsn'

- `readmrsn`

In [591]:
post_op_cat_man_Dummies['readmrsn'].value_counts()

0.0    42608
1.0       82
2.0       50
Name: readmrsn, dtype: int64

In [592]:
post_op_cat_man_Dummies['readmrsn'] = post_op_cat_man_Dummies['readmrsn'].replace({0: 'NONE',
                                                                                   1: 'Stroke',
                                                                                   2: 'TIA'})

In [593]:
post_op_cat_man_Dummies['readmrsn'].value_counts()

NONE      42608
Stroke       82
TIA          50
Name: readmrsn, dtype: int64

In [594]:
post_op_cat_man_Dummies.columns.tolist()[13]

'unplproc'

- `unplproc`

In [595]:
post_op_cat_man_Dummies['unplproc'].value_counts()

0.0    42532
2.0      108
1.0      100
Name: unplproc, dtype: int64

In [596]:
post_op_cat_man_Dummies['unplproc'] = post_op_cat_man_Dummies['unplproc'].replace({0: 'NONE',
                                                                                   1: 'Yes_Disease',
                                                                                   2: 'Yes_Complication'})

In [597]:
post_op_cat_man_Dummies['unplproc'].value_counts()

NONE                42532
Yes_Complication      108
Yes_Disease           100
Name: unplproc, dtype: int64

In [598]:
post_op_cat_man_Dummies.columns.tolist()[14]

'urgntrsn'

- `urgntrsn`

In [599]:
post_op_cat_man_Dummies['urgntrsn'].value_counts()

0.0     31219
5.0      3972
1.0      3259
6.0      2122
4.0       670
8.0       655
7.0       295
3.0       208
2.0       179
13.0       56
15.0       42
14.0       31
12.0       17
10.0       10
9.0         4
11.0        1
Name: urgntrsn, dtype: int64

In [600]:
sorted(list(post_op_cat_man_Dummies['urgntrsn'].unique()))

[0.0,
 1.0,
 2.0,
 3.0,
 4.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 14.0,
 15.0]

In [601]:
post_op_cat_man_Dummies['urgntrsn'] = post_op_cat_man_Dummies['urgntrsn'].replace({0: 'NONE',
                                                                                   1: 'AMI',
                                                                                   2: 'IABP',
                                                                                   3: 'CP',
                                                                                   4: 'CHF',
                                                                                   5: 'Anatomy',
                                                                                   6: 'USA',
                                                                                   7: 'Rest_Angina',
                                                                                   8: 'Valve_Dysfunctn',
                                                                                   9: 'Ao_Dissect',
                                                                                   10: 'Angio_Accid',
                                                                                   11: 'Trauma',
                                                                                   12: 'Infect_Dev',
                                                                                   13: 'Syncope',
                                                                                   14: 'Hybrid',
                                                                                   15: 'PCI_Fail'})

In [602]:
post_op_cat_man_Dummies['urgntrsn'].value_counts()

NONE               31219
Anatomy             3972
AMI                 3259
USA                 2122
CHF                  670
Valve_Dysfunctn      655
Rest_Angina          295
CP                   208
IABP                 179
Syncope               56
PCI_Fail              42
Hybrid                31
Infect_Dev            17
Angio_Accid           10
Ao_Dissect             4
Trauma                 1
Name: urgntrsn, dtype: int64

### Checking Recoding of `post_op_cat_man_Dummies`

In [603]:
post_op_cat_man_Dummies.head()

Unnamed: 0,aortoccl,asmtaodx,cathbasassistind,cathbasassistwhen,cpbutil,cperftyp,ecmoind,ecmowhen,emergrsn,iabpind,iabpwhen,mtcause,readmrsn,unplproc,urgntrsn
0,AoXC,Normal,NONE,NONE,Full_CPB,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE
1,AoXC,NONE,NONE,NONE,Full_CPB,NONE,NONE,NONE,Evolvg_MI,Angina,Pre_OP,NONE,NONE,NONE,NONE
2,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,AMI
3,AoXC,NONE,NONE,NONE,Full_CPB,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE
4,AoXC,NONE,NONE,NONE,Full_CPB,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,AMI


In [604]:
post_op_cat_man_Dummies.shape

(42740, 15)

### Using `pd.get_dummies()` to create `dummy` variables - `post_op_cat_man_Dummies`

In [605]:
post_op_cat_man_Dummies_df = pd.get_dummies(post_op_cat_man_Dummies.copy())

In [606]:
post_op_cat_man_Dummies_df.head()

Unnamed: 0,aortoccl_AoXC,aortoccl_Balloon,aortoccl_NONE,asmtaodx_NONE,asmtaodx_Normal,asmtaodx_Thickening,asmtaodx_atheroma<5mm,asmtaodx_atheroma>5mm,asmtaodx_mobile_plaques,cathbasassistind_BP_instability,cathbasassistind_CPB_wean,cathbasassistind_NONE,cathbasassistind_PCI_failure,cathbasassistwhen_Intra_OP,cathbasassistwhen_NONE,cathbasassistwhen_Post_OP,cathbasassistwhen_Pre_OP,cpbutil_Combination,cpbutil_Full_CPB,cpbutil_NONE,cperftyp_Antegrade,cperftyp_Both,cperftyp_NONE,cperftyp_Retrograde,ecmoind_Cadiac_Failure,ecmoind_NONE,ecmoind_Rescue_Salvage,ecmoind_Resp_Failure,ecmowhen_Intra_OP,ecmowhen_NONE,ecmowhen_Post_OP,ecmowhen_Pre_OP,emergrsn_Anatomy,emergrsn_Angio_Accident,emergrsn_Ao_Dissect,emergrsn_Evolvg_MI,emergrsn_Hybrid,emergrsn_Infect_Device,emergrsn_Ischemia,emergrsn_NONE,emergrsn_Pulm_Edema,emergrsn_Shock_Circ_Supp,emergrsn_Shock_No_Circ_Supp,emergrsn_Syncope,emergrsn_Trauma,emergrsn_Valve_Dysf,iabpind_Angina,iabpind_CPB_Wean_Failure,iabpind_Hemodyn_Instab,iabpind_NONE,iabpind_Procedure_Support,iabpind_Prophylactic,iabpwhen_Intra_OP,iabpwhen_NONE,iabpwhen_Post_OP,iabpwhen_Pre_OP,mtcause_Cardiac,mtcause_Infection,mtcause_NONE,mtcause_Neuro,mtcause_Pulmonary,mtcause_Renal,mtcause_Vascular,readmrsn_NONE,readmrsn_Stroke,readmrsn_TIA,unplproc_NONE,unplproc_Yes_Complication,unplproc_Yes_Disease,urgntrsn_AMI,urgntrsn_Anatomy,urgntrsn_Angio_Accid,urgntrsn_Ao_Dissect,urgntrsn_CHF,urgntrsn_CP,urgntrsn_Hybrid,urgntrsn_IABP,urgntrsn_Infect_Dev,urgntrsn_NONE,urgntrsn_PCI_Fail,urgntrsn_Rest_Angina,urgntrsn_Syncope,urgntrsn_Trauma,urgntrsn_USA,urgntrsn_Valve_Dysfunctn
0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [607]:
post_op_cat_man_Dummies_df.shape

(42740, 85)

### Getting Rid of Reference Classes

In [608]:
drop_cols = [col for col in post_op_cat_man_Dummies_df.columns if col.endswith('_NONE')]

In [609]:
drop_cols

['aortoccl_NONE',
 'asmtaodx_NONE',
 'cathbasassistind_NONE',
 'cathbasassistwhen_NONE',
 'cpbutil_NONE',
 'cperftyp_NONE',
 'ecmoind_NONE',
 'ecmowhen_NONE',
 'emergrsn_NONE',
 'iabpind_NONE',
 'iabpwhen_NONE',
 'mtcause_NONE',
 'readmrsn_NONE',
 'unplproc_NONE',
 'urgntrsn_NONE']

In [610]:
len(drop_cols)

15

- dropping the reference columns

In [611]:
post_op_cat_man_Dummies_df = post_op_cat_man_Dummies_df.drop(drop_cols, axis=1)

In [612]:
post_op_cat_man_Dummies_df.head()

Unnamed: 0,aortoccl_AoXC,aortoccl_Balloon,asmtaodx_Normal,asmtaodx_Thickening,asmtaodx_atheroma<5mm,asmtaodx_atheroma>5mm,asmtaodx_mobile_plaques,cathbasassistind_BP_instability,cathbasassistind_CPB_wean,cathbasassistind_PCI_failure,cathbasassistwhen_Intra_OP,cathbasassistwhen_Post_OP,cathbasassistwhen_Pre_OP,cpbutil_Combination,cpbutil_Full_CPB,cperftyp_Antegrade,cperftyp_Both,cperftyp_Retrograde,ecmoind_Cadiac_Failure,ecmoind_Rescue_Salvage,ecmoind_Resp_Failure,ecmowhen_Intra_OP,ecmowhen_Post_OP,ecmowhen_Pre_OP,emergrsn_Anatomy,emergrsn_Angio_Accident,emergrsn_Ao_Dissect,emergrsn_Evolvg_MI,emergrsn_Hybrid,emergrsn_Infect_Device,emergrsn_Ischemia,emergrsn_Pulm_Edema,emergrsn_Shock_Circ_Supp,emergrsn_Shock_No_Circ_Supp,emergrsn_Syncope,emergrsn_Trauma,emergrsn_Valve_Dysf,iabpind_Angina,iabpind_CPB_Wean_Failure,iabpind_Hemodyn_Instab,iabpind_Procedure_Support,iabpind_Prophylactic,iabpwhen_Intra_OP,iabpwhen_Post_OP,iabpwhen_Pre_OP,mtcause_Cardiac,mtcause_Infection,mtcause_Neuro,mtcause_Pulmonary,mtcause_Renal,mtcause_Vascular,readmrsn_Stroke,readmrsn_TIA,unplproc_Yes_Complication,unplproc_Yes_Disease,urgntrsn_AMI,urgntrsn_Anatomy,urgntrsn_Angio_Accid,urgntrsn_Ao_Dissect,urgntrsn_CHF,urgntrsn_CP,urgntrsn_Hybrid,urgntrsn_IABP,urgntrsn_Infect_Dev,urgntrsn_PCI_Fail,urgntrsn_Rest_Angina,urgntrsn_Syncope,urgntrsn_Trauma,urgntrsn_USA,urgntrsn_Valve_Dysfunctn
0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [613]:
post_op_cat_man_Dummies_df.shape

(42740, 70)

### Putting Together `post_op_DUMMY_cat_features`

In [614]:
post_op_DUMMY_cat_features = pd.concat((post_op_cat_bulk,
                                        post_op_cat_man_noD,
                                        post_op_cat_man_Dummies_df),
                                       axis=1)

In [615]:
post_op_DUMMY_cat_features.head()

Unnamed: 0,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,ceroxused,circarr,cofirstind,concalc,cperfutil,IABP,ibldprod,imedeaca,imedtran,inoptee,mtopd,ocarasd,ocarvsd,opocard,oponcard,unplao,unplav,unplmv,unplvad,valexp2,vsmvpr,asmtascaa,cotafib,cotarrst,ecmo,mt30stat,opcab,opvalve,vadproc,valexppos2,vsavpr,vsmv,aortoccl_AoXC,aortoccl_Balloon,asmtaodx_Normal,asmtaodx_Thickening,asmtaodx_atheroma<5mm,asmtaodx_atheroma>5mm,asmtaodx_mobile_plaques,cathbasassistind_BP_instability,cathbasassistind_CPB_wean,cathbasassistind_PCI_failure,cathbasassistwhen_Intra_OP,cathbasassistwhen_Post_OP,cathbasassistwhen_Pre_OP,cpbutil_Combination,cpbutil_Full_CPB,cperftyp_Antegrade,cperftyp_Both,cperftyp_Retrograde,ecmoind_Cadiac_Failure,ecmoind_Rescue_Salvage,ecmoind_Resp_Failure,ecmowhen_Intra_OP,ecmowhen_Post_OP,ecmowhen_Pre_OP,emergrsn_Anatomy,emergrsn_Angio_Accident,emergrsn_Ao_Dissect,emergrsn_Evolvg_MI,emergrsn_Hybrid,emergrsn_Infect_Device,emergrsn_Ischemia,emergrsn_Pulm_Edema,emergrsn_Shock_Circ_Supp,emergrsn_Shock_No_Circ_Supp,emergrsn_Syncope,emergrsn_Trauma,emergrsn_Valve_Dysf,iabpind_Angina,iabpind_CPB_Wean_Failure,iabpind_Hemodyn_Instab,iabpind_Procedure_Support,iabpind_Prophylactic,iabpwhen_Intra_OP,iabpwhen_Post_OP,iabpwhen_Pre_OP,mtcause_Cardiac,mtcause_Infection,mtcause_Neuro,mtcause_Pulmonary,mtcause_Renal,mtcause_Vascular,readmrsn_Stroke,readmrsn_TIA,unplproc_Yes_Complication,unplproc_Yes_Disease,urgntrsn_AMI,urgntrsn_Anatomy,urgntrsn_Angio_Accid,urgntrsn_Ao_Dissect,urgntrsn_CHF,urgntrsn_CP,urgntrsn_Hybrid,urgntrsn_IABP,urgntrsn_Infect_Dev,urgntrsn_PCI_Fail,urgntrsn_Rest_Angina,urgntrsn_Syncope,urgntrsn_Trauma,urgntrsn_USA,urgntrsn_Valve_Dysfunctn
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0,0,0.0,1.0,1,1,0.0,0.0,0.0,1.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0,0.0,1.0,1,1,0.0,0.0,0.0,1.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [616]:
post_op_DUMMY_cat_features.shape, post_op_cat_bulk.shape, post_op_cat_man_noD.shape, post_op_cat_man_Dummies_df.shape

((42740, 107), (42740, 26), (42740, 11), (42740, 70))

## Processing Post-OP Numerical Features
### Creating `post_op_num_date_deatures`

In [617]:
post_op_num_date_features = working_data.copy()[post_op_num_date]

In [618]:
post_op_num_date_features.head()

Unnamed: 0,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm,mtdate,orexitdt,orentrydt
0,,,,,,,,,28.0,29.0,150.0,1.2,,,108.0,NaT,2011-07-01,2011-07-01
1,,,,,,,,,29.0,32.0,70.0,1.1,,,40.0,NaT,2011-07-03,2011-07-02
2,,,,,,,,,,,,1.4,,,,NaT,2011-07-04,2011-07-04
3,,,,,0.0,0.0,0.0,2.0,19.0,34.8,73.0,1.2,,,47.0,NaT,2011-07-05,2011-07-05
4,,,,,0.0,0.0,0.0,1.0,19.0,34.6,70.0,0.8,,,40.0,NaT,2011-07-06,2011-07-06


In [619]:
post_op_num_date_features.shape

(42740, 18)

### Processing Post-OP `datetime` features

- `3` `datetime` features

In [620]:
post_op_date_cols = ['mtdate', 'orexitdt', 'orentrydt']

In [621]:
post_op_date_features = post_op_num_date_features.copy()[post_op_date_cols]

In [622]:
post_op_date_features.head()

Unnamed: 0,mtdate,orexitdt,orentrydt
0,NaT,2011-07-01,2011-07-01
1,NaT,2011-07-03,2011-07-02
2,NaT,2011-07-04,2011-07-04
3,NaT,2011-07-05,2011-07-05
4,NaT,2011-07-06,2011-07-06


In [623]:
post_op_date_features.shape, post_op_num_date_features.shape

((42740, 3), (42740, 18))

### Basic EDA on `post_op_date_features`

In [624]:
post_op_date_features.describe()

Unnamed: 0,mtdate,orexitdt,orentrydt
count,1120,42740,42740
unique,853,1957,1957
top,2014-12-16 00:00:00,2015-04-20 00:00:00,2015-04-20 00:00:00
freq,5,49,49
first,2011-07-13 00:00:00,2011-01-03 00:00:00,2011-01-03 00:00:00
last,2017-06-16 00:00:00,2016-12-31 00:00:00,2016-12-31 00:00:00


In [625]:
post_op_date_features.isnull().sum()

mtdate       41620
orexitdt         0
orentrydt        0
dtype: int64

#### Going to Drop `mtdate` because what date do you enter for people who did not die?

In [626]:
post_op_date_features = post_op_date_features.drop('mtdate',
                                                   axis=1)

post_op_date_features.head()

In [627]:
post_op_date_features.shape

(42740, 2)

- since we get `DayOfWeek`, `DayOfMonth` from `surgdt` it is redundant to get it from `orexitdt` and `orentrydt`
- But the `hour` the procedure started and the `duration` of the procedure probably would be useful information

In [628]:
post_op_date_features['OR_entry_Hour'] = post_op_date_features['orentrydt'].apply(lambda x: x.hour)

In [629]:
post_op_date_features.head()

Unnamed: 0,orexitdt,orentrydt,OR_entry_Hour
0,2011-07-01,2011-07-01,0
1,2011-07-03,2011-07-02,0
2,2011-07-04,2011-07-04,0
3,2011-07-05,2011-07-05,0
4,2011-07-06,2011-07-06,0


In [630]:
sum(post_op_date_features['OR_entry_Hour'])

0

- no time information embedded in the `orentrydt` and `orexitdt` features - therefore cannot create desired features from them
- going to drop all `post-op` `datetime` features

### Creating `post_op_numerical_features`

In [631]:
post_op_numerical_features = post_op_num_date_features.drop(post_op_date_cols,
                                                            axis=1)

In [632]:
post_op_numerical_features.head()

Unnamed: 0,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm
0,,,,,,,,,28.0,29.0,150.0,1.2,,,108.0
1,,,,,,,,,29.0,32.0,70.0,1.1,,,40.0
2,,,,,,,,,,,,1.4,,,
3,,,,,0.0,0.0,0.0,2.0,19.0,34.8,73.0,1.2,,,47.0
4,,,,,0.0,0.0,0.0,1.0,19.0,34.6,70.0,0.8,,,40.0


In [633]:
post_op_numerical_features.shape

(42740, 15)

### Basic EDA on `post_op_numerical_features`

In [634]:
post_op_numerical_features.describe()

Unnamed: 0,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm
count,12.0,4390.0,4391.0,46.0,10845.0,10848.0,10848.0,10848.0,41382.0,41281.0,40514.0,42689.0,4659.0,4667.0,39700.0
mean,39.16667,42.50547,39.77841,18.76087,0.12485,0.59154,0.82412,1.75756,25.39662,33.63306,113.07035,1.40144,64.41318,64.66381,85.43705
std,39.59301,173.617,162.3032,16.89337,0.97293,1.2953,1.75908,1.63742,4.95472,1.79123,48.84816,1.15617,10.79663,10.83362,39.18691
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.8,12.0,0.1,1.0,1.0,0.0
25%,14.75,0.0,0.0,3.25,0.0,0.0,0.0,1.0,22.0,32.8,80.0,0.9,58.0,58.0,58.0
50%,26.5,0.0,0.0,16.5,0.0,0.0,0.0,2.0,25.0,34.0,104.0,1.1,65.0,66.0,78.0
75%,44.25,6.75,6.0,24.75,0.0,0.0,1.0,2.0,29.0,34.9,136.0,1.4,72.0,72.0,104.0
max,134.0,3609.0,2783.0,68.0,65.0,24.0,72.0,30.0,92.0,40.0,691.0,29.0,95.0,95.0,419.0


- examining `NaN`s

In [635]:
post_op_numerical_features.isnull().sum()

cperftime      42728
cumulsatlft    38350
cumulsatrt     38349
dhcatm         42694
ibdcryou       31895
ibdffpu        31892
ibdplatu       31892
ibdrbcu        31892
lwsthct         1358
lwsttemp        1459
perfustm        2226
postcreat         51
prerso2lft     38081
prerso2rt      38073
xclamptm        3040
dtype: int64

- per the 11/2 Data Dictionary:
    - for `cumulsatlft` and `cumulsatrt` replace `NaN`s with `0`
    - for all other `numerical` features, replace `NaN` with training set `median`

#### Replacing `cumulsatlft` and `cumulsatrt` `NaN`s with `0`

In [636]:
post_op_numerical_features['cumulsatlft'] = post_op_numerical_features['cumulsatlft'].fillna(0)
post_op_numerical_features['cumulsatrt'] = post_op_numerical_features['cumulsatrt'].fillna(0)

In [637]:
post_op_numerical_features.head()

Unnamed: 0,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm
0,,0.0,0.0,,,,,,28.0,29.0,150.0,1.2,,,108.0
1,,0.0,0.0,,,,,,29.0,32.0,70.0,1.1,,,40.0
2,,0.0,0.0,,,,,,,,,1.4,,,
3,,0.0,0.0,,0.0,0.0,0.0,2.0,19.0,34.8,73.0,1.2,,,47.0
4,,0.0,0.0,,0.0,0.0,0.0,1.0,19.0,34.6,70.0,0.8,,,40.0


In [638]:
post_op_numerical_features['cumulsatlft'].isnull().sum()

0

In [639]:
post_op_numerical_features['cumulsatrt'].isnull().sum()

0

## Assembling Post-Op Dataset

In [640]:
post_op_numerical_features.shape, post_op_DUMMY_cat_features.shape, post_op_TREE_cat_features.shape

((42740, 15), (42740, 107), (42740, 52))

### Creating `POSTOP_dataset`
- contains `dummy` variables

In [641]:
POSTOP_dataset = pd.concat((post_op_numerical_features,
                            post_op_DUMMY_cat_features),
                          axis=1)

In [642]:
POSTOP_dataset.head()

Unnamed: 0,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,ceroxused,circarr,cofirstind,concalc,cperfutil,IABP,ibldprod,imedeaca,imedtran,inoptee,mtopd,ocarasd,ocarvsd,opocard,oponcard,unplao,unplav,unplmv,unplvad,valexp2,vsmvpr,asmtascaa,cotafib,cotarrst,ecmo,mt30stat,opcab,opvalve,vadproc,valexppos2,vsavpr,vsmv,aortoccl_AoXC,aortoccl_Balloon,asmtaodx_Normal,asmtaodx_Thickening,asmtaodx_atheroma<5mm,asmtaodx_atheroma>5mm,asmtaodx_mobile_plaques,cathbasassistind_BP_instability,cathbasassistind_CPB_wean,cathbasassistind_PCI_failure,cathbasassistwhen_Intra_OP,cathbasassistwhen_Post_OP,cathbasassistwhen_Pre_OP,cpbutil_Combination,cpbutil_Full_CPB,cperftyp_Antegrade,cperftyp_Both,cperftyp_Retrograde,ecmoind_Cadiac_Failure,ecmoind_Rescue_Salvage,ecmoind_Resp_Failure,ecmowhen_Intra_OP,ecmowhen_Post_OP,ecmowhen_Pre_OP,emergrsn_Anatomy,emergrsn_Angio_Accident,emergrsn_Ao_Dissect,emergrsn_Evolvg_MI,emergrsn_Hybrid,emergrsn_Infect_Device,emergrsn_Ischemia,emergrsn_Pulm_Edema,emergrsn_Shock_Circ_Supp,emergrsn_Shock_No_Circ_Supp,emergrsn_Syncope,emergrsn_Trauma,emergrsn_Valve_Dysf,iabpind_Angina,iabpind_CPB_Wean_Failure,iabpind_Hemodyn_Instab,iabpind_Procedure_Support,iabpind_Prophylactic,iabpwhen_Intra_OP,iabpwhen_Post_OP,iabpwhen_Pre_OP,mtcause_Cardiac,mtcause_Infection,mtcause_Neuro,mtcause_Pulmonary,mtcause_Renal,mtcause_Vascular,readmrsn_Stroke,readmrsn_TIA,unplproc_Yes_Complication,unplproc_Yes_Disease,urgntrsn_AMI,urgntrsn_Anatomy,urgntrsn_Angio_Accid,urgntrsn_Ao_Dissect,urgntrsn_CHF,urgntrsn_CP,urgntrsn_Hybrid,urgntrsn_IABP,urgntrsn_Infect_Dev,urgntrsn_PCI_Fail,urgntrsn_Rest_Angina,urgntrsn_Syncope,urgntrsn_Trauma,urgntrsn_USA,urgntrsn_Valve_Dysfunctn
0,,0.0,0.0,,,,,,28.0,29.0,150.0,1.2,,,108.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0,0,0.0,1.0,1,1,0.0,0.0,0.0,1.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,,0.0,0.0,,,,,,29.0,32.0,70.0,1.1,,,40.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,,0.0,0.0,,,,,,,,,1.4,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,,0.0,0.0,,0.0,0.0,0.0,2.0,19.0,34.8,73.0,1.2,,,47.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0,0.0,1.0,1,1,0.0,0.0,0.0,1.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,,0.0,0.0,,0.0,0.0,0.0,1.0,19.0,34.6,70.0,0.8,,,40.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [643]:
POSTOP_dataset.shape

(42740, 122)

### Creating `POSTOP_dataset_TREE`
- for `sklearn` `DecisionTrees`

In [644]:
POSTOP_dataset_TREE = pd.concat((post_op_numerical_features,
                                 post_op_TREE_cat_features),
                                axis=1)

In [645]:
POSTOP_dataset_TREE.head()

Unnamed: 0,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,ceroxused,circarr,cofirstind,concalc,cperfutil,IABP,ibldprod,imedeaca,imedtran,inoptee,mtopd,ocarasd,ocarvsd,opocard,oponcard,unplao,unplav,unplmv,unplvad,valexp2,vsmvpr,aortoccl,asmtaodx,asmtascaa,cathbasassistind,cathbasassistwhen,cotafib,cotarrst,cpbutil,cperftyp,ecmo,ecmoind,ecmowhen,emergrsn,iabpind,iabpwhen,mt30stat,mtcause,opcab,opvalve,readmrsn,unplproc,urgntrsn,vadproc,valexppos2,vsavpr,vsmv
0,,0.0,0.0,,,,,,28.0,29.0,150.0,1.2,,,108.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,,0.0,0.0,,,,,,29.0,32.0,70.0,1.1,,,40.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,0,2,0.0,0.0,0.0,0.0,4.0,3.0,1.0,1.0,0.0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,,0.0,0.0,,,,,,,,,1.4,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,,0.0,0.0,,0.0,0.0,0.0,2.0,19.0,34.8,73.0,1.2,,,47.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,,0.0,0.0,,0.0,0.0,0.0,1.0,19.0,34.6,70.0,0.8,,,40.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [646]:
POSTOP_dataset_TREE.shape

(42740, 67)

## Combining `PREOP` and `POSTOP` Datasets

### - Creating `PRE_plus_POST_dataset`
- contains `dummy variables`
- NOTE - `PREOP_dataset` has `outcome` vector and other `misc` columns that may not be included in feature matricies

In [647]:
PREOP_dataset.shape, POSTOP_dataset.shape

((42740, 117), (42740, 122))

In [648]:
PRE_plus_POST_dataset = pd.concat((PREOP_dataset,
                                   POSTOP_dataset),
                                  axis=1)

In [649]:
PRE_plus_POST_dataset.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_STRENUOUS_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_ANGINA_AT_REST,classnyh_SLIGHT_LIMITATION,classnyh_MARKED_LIMITATION,classnyh_ANY_ACTIVITY,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin,strokeBin2,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,ceroxused,circarr,cofirstind,concalc,cperfutil,IABP,ibldprod,imedeaca,imedtran,inoptee,mtopd,ocarasd,ocarvsd,opocard,oponcard,unplao,unplav,unplmv,unplvad,valexp2,vsmvpr,asmtascaa,cotafib,cotarrst,ecmo,mt30stat,opcab,opvalve,vadproc,valexppos2,vsavpr,vsmv,aortoccl_AoXC,aortoccl_Balloon,asmtaodx_Normal,asmtaodx_Thickening,asmtaodx_atheroma<5mm,asmtaodx_atheroma>5mm,asmtaodx_mobile_plaques,cathbasassistind_BP_instability,cathbasassistind_CPB_wean,cathbasassistind_PCI_failure,cathbasassistwhen_Intra_OP,cathbasassistwhen_Post_OP,cathbasassistwhen_Pre_OP,cpbutil_Combination,cpbutil_Full_CPB,cperftyp_Antegrade,cperftyp_Both,cperftyp_Retrograde,ecmoind_Cadiac_Failure,ecmoind_Rescue_Salvage,ecmoind_Resp_Failure,ecmowhen_Intra_OP,ecmowhen_Post_OP,ecmowhen_Pre_OP,emergrsn_Anatomy,emergrsn_Angio_Accident,emergrsn_Ao_Dissect,emergrsn_Evolvg_MI,emergrsn_Hybrid,emergrsn_Infect_Device,emergrsn_Ischemia,emergrsn_Pulm_Edema,emergrsn_Shock_Circ_Supp,emergrsn_Shock_No_Circ_Supp,emergrsn_Syncope,emergrsn_Trauma,emergrsn_Valve_Dysf,iabpind_Angina,iabpind_CPB_Wean_Failure,iabpind_Hemodyn_Instab,iabpind_Procedure_Support,iabpind_Prophylactic,iabpwhen_Intra_OP,iabpwhen_Post_OP,iabpwhen_Pre_OP,mtcause_Cardiac,mtcause_Infection,mtcause_Neuro,mtcause_Pulmonary,mtcause_Renal,mtcause_Vascular,readmrsn_Stroke,readmrsn_TIA,unplproc_Yes_Complication,unplproc_Yes_Disease,urgntrsn_AMI,urgntrsn_Anatomy,urgntrsn_Angio_Accid,urgntrsn_Ao_Dissect,urgntrsn_CHF,urgntrsn_CP,urgntrsn_Hybrid,urgntrsn_IABP,urgntrsn_Infect_Dev,urgntrsn_PCI_Fail,urgntrsn_Rest_Angina,urgntrsn_Syncope,urgntrsn_Trauma,urgntrsn_USA,urgntrsn_Valve_Dysfunctn
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1.0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.014,2,0,2,0,0,,0.0,0.0,,,,,,28.0,29.0,150.0,1.2,,,108.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0,0,0.0,1.0,1,1,0.0,0.0,0.0,1.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2,0.017,2,0,1,0,0,,0.0,0.0,,,,,,29.0,32.0,70.0,1.1,,,40.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,83,162.60001,102.1,38.61754,29.0,1.2,3.3,6.2,8.6,60.0,36.0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1.0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,0.045,2,0,1,0,0,,0.0,0.0,,,,,,,,,1.4,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,59,160.0,127.5,49.80469,35.0,0.9,3.5,7.4,6.4,60.0,35.0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0.013,2,0,1,0,0,,0.0,0.0,,0.0,0.0,0.0,2.0,19.0,34.8,73.0,1.2,,,47.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0,0.0,1.0,1,1,0.0,0.0,0.0,1.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,72,160.0,64.0,25.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,5,0.016,2,0,2,0,0,,0.0,0.0,,0.0,0.0,0.0,1.0,19.0,34.6,70.0,0.8,,,40.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [650]:
PRE_plus_POST_dataset.shape

(42740, 239)

### - Creating `PRE_plus_POST_dataset_TREE`
- for `sklearn` `DecisionTrees`
- NOTE - `PREOP_dataset_sklearn` has `outcome` vector and other `misc` columns that may not be included in feature matricies

In [651]:
PREOP_dataset_sklearn.shape, POSTOP_dataset_TREE.shape

((42740, 79), (42740, 67))

In [652]:
PRE_plus_POST_dataset_TREE = pd.concat((PREOP_dataset_sklearn,
                                        POSTOP_dataset_TREE),
                                       axis=1)

In [653]:
PRE_plus_POST_dataset_TREE.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status,cvdcarsten,cvdstenrt,cvdstenlft,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin,strokeBin2,cperftime,cumulsatlft,cumulsatrt,dhcatm,ibdcryou,ibdffpu,ibdplatu,ibdrbcu,lwsthct,lwsttemp,perfustm,postcreat,prerso2lft,prerso2rt,xclamptm,canartstaort,canartstfem,canartstoth,canartstax,cathbasassist,ceroxused,circarr,cofirstind,concalc,cperfutil,IABP,ibldprod,imedeaca,imedtran,inoptee,mtopd,ocarasd,ocarvsd,opocard,oponcard,unplao,unplav,unplmv,unplvad,valexp2,vsmvpr,aortoccl,asmtaodx,asmtascaa,cathbasassistind,cathbasassistwhen,cotafib,cotarrst,cpbutil,cperftyp,ecmo,ecmoind,ecmowhen,emergrsn,iabpind,iabpwhen,mt30stat,mtcause,opcab,opvalve,readmrsn,unplproc,urgntrsn,vadproc,valexppos2,vsavpr,vsmv
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,7,4,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1.0,3,3,0,4,2,0,0,0,0,0,1,0.014,2,0,2,0,0,,0.0,0.0,,,,,,28.0,29.0,150.0,1.2,,,108.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,7,5,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,3,4,0,3,2,0,2,0,0,0,2,0.017,2,0,1,0,0,,0.0,0.0,,,,,,29.0,32.0,70.0,1.1,,,40.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,0,2,0.0,0.0,0.0,0.0,4.0,3.0,1.0,1.0,0.0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,83,162.60001,102.1,38.61754,29.0,1.2,3.3,6.2,8.6,60.0,36.0,7,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1.0,3,0,0,3,3,0,1,0,0,0,3,0.045,2,0,1,0,0,,0.0,0.0,,,,,,,,,1.4,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,59,160.0,127.5,49.80469,35.0,0.9,3.5,7.4,6.4,60.0,35.0,7,1,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1,0,0,4,2,0,0,0,0,0,4,0.013,2,0,1,0,0,,0.0,0.0,,0.0,0.0,0.0,2.0,19.0,34.8,73.0,1.2,,,47.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,72,160.0,64.0,25.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,7,2,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,3,4,0,0,0,0,1,0,0,0,5,0.016,2,0,2,0,0,,0.0,0.0,,0.0,0.0,0.0,1.0,19.0,34.6,70.0,0.8,,,40.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [654]:
PRE_plus_POST_dataset_TREE.shape

(42740, 146)

## Pickling `PRE_plus_POST` datasets

- with `dummy` variables

In [655]:
#PRE_plus_POST_dataset.to_pickle('PRE_plus_POST_dataset_11_9.pkl')

- for `sklearn` `DecisionTrees`

In [656]:
#PRE_plus_POST_dataset_TREE.to_pickle('PRE_plus_POST_dataset_TREE_11_9.pkl')

### Should Save Key Column Name Lists
- PREOP numerical/categorical -- see line 256 for `DataFrames` to extract column names
- POSTOP numerical/categorical
- Other

- `PREOP_numerical_col_names` -- WORKS FOR `TREE` AS WELL

In [657]:
PREOP_numerical_col_names = numerical_features_df.columns.tolist()

In [658]:
len(PREOP_numerical_col_names), numerical_features_df.shape[1]

(11, 11)

#### How to `pickle` a `list` -- `PREOP_numerical_col_names`

with open('PREOP_numerical_col_names_11_9.pkl', 'wb') as filehandle:
    pickle.dump(PREOP_numerical_col_names, filehandle)

#### How to retrieve a `pickled` `list`

- `PREOP_categorical_col_names`

In [660]:
print (surgdt_dummies.shape,
       yes_no_unc_df.shape,
       compress_to_two_df.shape,
       recode_D_Dummies.shape,
       recode_D_P_Dummies.shape)

(42740, 19) (42740, 36) (42740, 12) (42740, 25) (42740, 7)


In [661]:
PREOP_categorical_col_names = (surgdt_dummies.columns.tolist() + 
                               yes_no_unc_df.columns.tolist() +
                               compress_to_two_df.columns.tolist() +
                               recode_D_Dummies.columns.tolist() +
                               recode_D_P_Dummies.columns.tolist())

In [662]:
len(PREOP_categorical_col_names), type(PREOP_categorical_col_names)

(99, list)

In [663]:
print (surgdt_dummies.shape[1] +
       yes_no_unc_df.shape[1] +
       compress_to_two_df.shape[1] +
       recode_D_Dummies.shape[1] +
       recode_D_P_Dummies.shape[1])

99


- `PREOP_outcome_other_col_names` -- WORKS FOR `TREE` AS WELL

In [665]:
outcome_other_col_names = outcome_other_df.columns.tolist()

In [666]:
len(outcome_other_col_names), outcome_other_df.shape

(7, (42740, 7))

- `PREOP_categorical_TREE_col_names`

In [668]:
print (surgdt_features_sklearn.shape,
       yes_no_unc_df.shape,
       compress_to_two_df.shape,
       recode_D_sklearn.shape,
       recode_D_P_sklearn.shape)

(42740, 3) (42740, 36) (42740, 12) (42740, 7) (42740, 3)


In [669]:
PREOP_categorical_TREE_col_names = (surgdt_features_sklearn.columns.tolist() +
                                    yes_no_unc_df.columns.tolist() +
                                    compress_to_two_df.columns.tolist() +
                                    recode_D_sklearn.columns.tolist() +
                                    recode_D_P_sklearn.columns.tolist())

In [670]:
len(PREOP_categorical_TREE_col_names)

61

In [671]:
print (surgdt_features_sklearn.shape[1] +
       yes_no_unc_df.shape[1] +
       compress_to_two_df.shape[1] +
       recode_D_sklearn.shape[1] +
       recode_D_P_sklearn.shape[1])

61


- `POSTOP_numerical_col_names`

In [673]:
POSTOP_numerical_col_names = post_op_numerical_features.columns.tolist()

In [674]:
len(POSTOP_numerical_col_names), post_op_numerical_features.shape

(15, (42740, 15))

- `POSTOP_categorical_col_names`

In [676]:
POSTOP_categorical_col_names = post_op_DUMMY_cat_features.columns.tolist()

In [677]:
len(POSTOP_categorical_col_names), post_op_DUMMY_cat_features.shape

(107, (42740, 107))

- `POSTOP_categorical_TREE_col_names`

In [679]:
POSTOP_categorical_TREE_col_names = post_op_TREE_cat_features.columns.tolist()

In [680]:
len(POSTOP_categorical_TREE_col_names), post_op_TREE_cat_features.shape

(52, (42740, 52))

#### `PREOP_ALL_col_names`

In [682]:
PREOP_ALL_col_names = PREOP_dataset.columns.tolist()

In [683]:
len(PREOP_ALL_col_names), PREOP_dataset.shape

(117, (42740, 117))

#### `PREOP_TREE_ALL_col_names`

In [685]:
PREOP_TREE_ALL_col_names = PREOP_dataset_sklearn.columns.tolist()

In [686]:
len(PREOP_TREE_ALL_col_names), PREOP_dataset_sklearn.shape

(79, (42740, 79))

#### `POSTOP_ALL_col_names`

In [688]:
POSTOP_ALL_col_names = POSTOP_dataset.columns.tolist()

In [689]:
len(POSTOP_ALL_col_names), POSTOP_dataset.shape

(122, (42740, 122))

#### `POSTOP_TREE_ALL_col_names`

In [691]:
POSTOP_TREE_ALL_col_names = POSTOP_dataset_TREE.columns.tolist()

In [692]:
len(POSTOP_TREE_ALL_col_names), POSTOP_dataset_TREE.shape

(67, (42740, 67))