## Capstone Project

### Pre-Operating Features Cleaning and Encoding 

#### - Combined `2.73` and `2.81`

#### - Dropping Reference Class From All Dummies

#### Importing Libraries

In [1]:
%matplotlib inline

# general libraries
import re
import string
import sys
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing date libraries
import datetime as dt
import dateutil.parser as dparser

# scikit-learn libraries for preprocessing
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# scikit-learn libraries for constructing pipelines
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# scikit-learn libraries for clustering and dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.mixture import GaussianMixture

# scikit-learn libraries for evaluation
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# scikit-learn libraries for feature selection
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import RFECV

# scikit-learn libraries for learning
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# saving models
import pickle
from sklearn.externals import joblib

# setting pandas display options
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 10000)
pd.set_option('display.max_colwidth', 100)
pd.set_option('precision', 5)
pd.options.mode.chained_assignment = None



#### Directory/File Structure

In [2]:
sys.version

'3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) \n[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]'

In [3]:
print ('Running pandas version:', pd.__version__)
print ('Running numpy version:', np.__version__)
print ('Running sklearn version:', sklearn.__version__)

Running pandas version: 0.25.1
Running numpy version: 1.14.2
Running sklearn version: 0.21.3


In [4]:
os.getcwd()

'/Users/nate_velarde/Documents/UC_Berkeley/Courses/W210_Capstone/stroke_project/sandbox/notebooks'

In [5]:
os.chdir('../data')

In [6]:
sorted(os.listdir())

['.DS_Store',
 '273_vs_281_null_count_by_feature.csv',
 '273_vs_281_null_count_by_feature.xlsm',
 'Capstone - Complication list - complete.xlsx',
 'Capstone - STS risk factor list.xlsx',
 'Capstone_Fall_Shannon_Sept2019_request.csv',
 'PREOP_dataset_10_24.pkl',
 'PREOP_dataset_TREE_10_24.pkl',
 'X_A_DREF.pkl',
 'X_A_DREF_TREE_SKLEARN.pkl',
 'X_PREOP_10_24.pkl',
 'X_PREOP_TREE_10_24.pkl',
 'X_dev_A_DREF.pkl',
 'X_dev_A_DREF_TREE_SKLEARN.pkl',
 'X_dev_PREOP_10_24.pkl',
 'X_dev_PREOP_TREE_10_24.pkl',
 'X_dev_PREOP_UNPROC_10_24.pkl',
 'X_test_A_DREF.pkl',
 'X_test_A_DREF_TREE_SKLEARN.pkl',
 'X_test_PREOP_10_24.pkl',
 'X_test_PREOP_TREE_10_24.pkl',
 'X_test_PREOP_UNPROC_10_24.pkl',
 'X_train_A_DREF.pkl',
 'X_train_A_DREF_TREE_SKLEARN.pkl',
 'X_train_PREOP_10_24.pkl',
 'X_train_PREOP_TREE_10_24.pkl',
 'X_train_PREOP_UNPROC_10_24.pkl',
 'capstone_STS_risk_factor_features.xlsx',
 'capstone_cleaned_data.csv',
 'capstone_data-version-2.xlsx',
 'capstone_data.xlsx',
 'capstone_data_binarized_outc

#### Loading Dataset

In [7]:
raw_data = pd.read_pickle('capstone_data_binarized_outcome.pkl')

In [8]:
raw_data.head()

Unnamed: 0,recordId,age,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,raceother,ethnicity,surgdt,dischdt,heightcm,weightkg,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,chrlungd,ChrLungDType,hmo2,bdtx,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,cvd,cva,cvawhen,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,RFHemoglobin,hct,creatlst,totalbumin,a1clvl,hitanti,inr,meldscr,cigsmoker,cigsmokercurr,cvdcoma,cvdrind,cvdninvas,prcvint,prcab,prvalve,prvalveproc1,PrValveProc2,PrValveProc3,PrValveProc4,POC,POCInt1,POCInt2,POCInt3,POCInt4,POCInt5,poarr,poco,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythPPaced,ArrhythVV,ArrhythAFlutter,ArrhythAFib,ArrhythAtrFib,ArrhythAFibDur,arrhythwhen,arrhyafib,arrhyafibty,medacei48,medasa,medgp,medgpmn,medacoag,medacmn,medaplt5days,medcoum,MedCoum5Days,MedCoum5Dis,MedXaInhibitors,MedXa5Days,MedNOAC5Days,MedNOACDisc,MedThrombinIn,MedThromIn5Days,MedThromInDisc,medthrom,medinotr,medlipid,medster,numdisv,PctStenLMain,SyntaxScrKnown,SyntaxScr,hdefd,hdef,DimAvail,lvsd,lvedd,pasysmeas,pasys,vdaort,vdstena,AoHemoDatAvail,VDAoVA,vdgrada,VDAoEt1,VDAoEt2,VDAoEt3,VDAoEt4,VDAoEt5,VDAoPrimEt,VDAoSievers,vdinsufm,vdstenm,vdmva,vdgradm,VDMiEt1,VDMiEt2,VDMiEt3,VDMiPrimEt,VDMiLes1,VDMiLes2,VDMiLes3,VDMiPrimLes,vdinsuft,ADPres,ADLocRoot,ADLocAsc,ADLocArch,ADLocDesThor,ADLocThora,ADLesTAneur,ADLesTCoarcNar,ADLesTRup,ADLesTPseudo,ADLesTPenUlcer,ADLesTIntraHema,ADLesTDis,ADLesTDisTmg,ADLesTDisTy,ADEt1,ADEt2,ADEt3,hdefmeth,vdaoet,vdendab,vdcongent,vdprimao,vdlvoutob,vdaorttumor,vdmitpmr,vdmitet,vdmitdegloc,vdmitandegdis,vdmitisty,vdmittumor,hdpad,hdpamean,incidenc,status,UrgEmergRsn,opapp,robotic,RobotTim,opcab,opvalve,vadproc,opocard,oponcard,orentrydt,orexitdt,GenAnes,ProcSed,Intubate,TempMeas,lwsttemp,LwstTempSrc,lwsthct,HighIntraGlu,cpbutil,cpbcmb,cpbcmbr,canartstfem,canartstaort,canartstax,CanArtStInn,canartstoth,perfustm,circarr,dhcatm,cperfutil,cperftime,cperftyp,TotCircArrTm,aortoccl,xclamptm,cplegiadeliv,cplegiatype,ceroxused,concalc,asmtascaa,AsmtAoDxMeth,asmtaodx,asmtapln,ibldprodref,ibldprod,ibdrbcu,ibdffpu,ibdplatu,ibdcryou,IntraClotFact,IntraopProComCon,imedeaca,imedtran,inoptee,prepar,PRepEF,PPEFMeas,PPEF,CombCardPCI,CombProcs,CombProcsStatus,CombProcsPCI,CombProcsStentTy,PPPlanedPCI,ValExpPos,ValExpTyp,ValExpDev,valexp2,valexppos2,ValExpTyp2,valexpdev2,urgntrsn,emergrsn,unplproc,unplav,unplmv,unplao,unplvad,unploth,prerso2lft,prerso2rt,cumulsatlft,cumulsatrt,cofirstind,ibdfactorvii,vad,imedaprot,imedaprotd,imeddesmo,SIStartT,sistopt,afibproc,IABP,iabpwhen,iabpind,inother,opaortic,opmitral,CABHybrPCI,vsavpr,vstcv,vstcvr,VSAVSurgRep,VSAVSurgType,VSAVSurgBioT,VSAVRoot,VSAVRootOReimp,VSAVRootOReimpTy,VSAVRepBioTy,vsmv,vsmvpr,VSTCVMit,MitralImplant,vsmiim,VSTV,VSTrRepair,vsaoimty,cathbasassist,CathBasAssistTy,cathbasassistwhen,cathbasassistind,ecmo,ecmowhen,ecmoind,CompMAD,CompMAD1,CompMAD2,CompMAD3,OCarASDPFO,OCarASDSec,OCarAAProc,OCarAAMeth,OCarAAModel,OCarAAUDI,ocaracd,ocarlva,ocarsvr,ocarvsd,AortProcRoot,AortProcAsc,AortProcHemi,AortProcTotArch,AortProcDesProx,SynthGftEleph,ocarasd,ocarasdty,ocarafibsur,ocarafibsurloc,ocarafibsurlaa,EndovasTAVR,mt30stat,mt30statmeth,mtdate,mtcause,mtopd,mtdcstat,mortalty,mtlocatn,disloctn,dcasa,DCOthAntiplat,dcdirthromin,dccoum,DCFactorXa,DCNovOrAnti,DCOthAnticoag,PostOpPeakGlu,postcreat,PostopHemoglobin,PostopHct,reintub,PostopIntub,popttech,popefd,SurSInf,complics,coprebld,coprebldtim,copregft,CReintMI,CReintMIVes,CReintMIIntTy,CAortReint,CAortReintTy,copreoth,coprenon,csepsis,csepsispbc,cnstrokp,cnstrokttia,cncomaenceph,CNEnceph,cncoma,cnparal,cnparesis,CNParesisTy,cpvntlng,cppneum,cvte,pulmemb,dvt,crenfail,crendial,dialdur,DialStat,cultrafil,cotarrst,CVaAoDisTy,cotcoag,cottamp,cotgi,COtLiver,cotmsf,cotafib,cotother,Readmit,ReadmitDt,readmrsn,cnstroktrind,CNStrokT,drgnum,BldRBC,predstro,predrenf,strokeBin
0,1,54,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-01,2011-07-06,180.0,117.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,43.0,0.9,3.8,7.2,3.0,1.0,6.5,1.0,1.0,,,,2.0,,,,,,,,,,,,,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,47.0,,45.0,50.0,1.0,42.0,1.0,2.0,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,1.0,,,,,,,1.0,,1.0,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-01,2011-07-01,,,,,29.0,,28.0,,3,,,2.0,1.0,2.0,,2.0,150.0,2.0,,,,,,2.0,108.0,2.0,1.0,2.0,2.0,1.0,,6.0,2.0,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.014,0.048,0
1,2,65,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-02,2011-07-09,175.3,79.4,2.0,,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,45.0,1.2,,,3.0,1.0,,2.0,,,,,1.0,2.0,2.0,,,,,,,,,,,,2.0,,,5.0,2.0,,2.0,2.0,2.0,,,,,,,,3.0,2.0,,2.0,1.0,2,,1.0,2.0,2,2.0,,,,,,,,,,1.0,2.0,2.0,2.0,4.0,,,,1.0,55.0,,44.0,32.0,1.0,40.0,1.0,2.0,,,,,,,,,,,3.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,5.0,10.0,,,,,,,10.0,,,,,,,1.0,3.0,,1.0,2.0,,1,2,1.0,2.0,2.0,2011-07-02,2011-07-03,,,,,32.0,,29.0,,3,,,2.0,1.0,2.0,,2.0,70.0,2.0,,,,,,2.0,40.0,2.0,1.0,2.0,2.0,2.0,,,,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,4.0,1.0,,,,,,,,,,,,,,,,,,,1.0,1.0,3.0,,,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,1.0,2.0,,,,,1.1,,,2.0,,2.0,2.0,2.0,1.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2.0,,NaT,,,,,,0.017,0.069,0
2,3,83,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-04,2011-07-12,162.60001,102.1,2.0,,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,,1.0,2.0,1.0,1.0,2.0,1.0,1.0,,,2.0,,29.0,1.2,3.3,6.2,3.0,1.0,8.6,2.0,,,,,1.0,2.0,2.0,,,,,,,,,,,,2.0,,,1.0,2.0,,2.0,2.0,2.0,,,,,,,,3.0,2.0,,1.0,1.0,2,,2.0,,1,2.0,,,,,,,,,,2.0,1.0,1.0,2.0,4.0,,,,1.0,60.0,,31.0,50.0,1.0,36.0,1.0,1.0,,1.5,16.0,,,,,,,,3.0,2.0,,,,,,,,,,,3.0,,,,,,,,,,,,,,,,,,,5.0,,,,,,,,,,,,,,,1.0,2.0,,1.0,2.0,,1,2,1.0,2.0,2.0,2011-07-04,2011-07-04,,,,,,,,,1,,,,,,,,,2.0,,,,,,5.0,,1.0,,2.0,2.0,2.0,,,,2.0,2.0,,,,,,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,1.0,,1.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,2.0,1.0,,2.0,2.0,,,,,1.4,,,2.0,,2.0,2.0,2.0,1.0,2,2,2,2,2,2,2,2,1,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2.0,,NaT,,,,,,0.045,0.148,0
3,4,59,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-05,2011-07-09,160.0,127.5,1.0,4.0,2.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,1.0,,2.0,2.0,2.0,,,,,,,,,35.0,0.9,3.5,7.4,3.0,1.0,6.4,2.0,,,,,2.0,,,,,,,,,,,,,,,,,1.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,1.0,2.0,2,,2.0,,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,2.0,,,,1.0,60.0,,33.0,51.0,1.0,35.0,2.0,,,,,,,,,,,,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,1.0,1.0,,1.0,2.0,,1,1,1.0,2.0,2.0,2011-07-05,2011-07-05,,,,,34.8,,19.0,,3,,,2.0,1.0,2.0,,2.0,73.0,2.0,,,,,,2.0,47.0,3.0,1.0,2.0,2.0,2.0,,,,,1.0,2.0,0.0,0.0,0.0,,,2.0,2.0,1.0,1.0,,,,,,,,,,,,,2.0,,,,,,1.0,,,,,,,,,,,0.0,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,1.0,1.0,,,329.0,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,1.2,,,2.0,,2.0,2.0,2.0,1.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.013,0.074,0
4,5,72,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2011-07-06,2011-07-10,160.0,64.0,2.0,,1.0,2.0,1,2.0,,,1.0,,2.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,,2.0,2.0,2.0,,,,,,,,,37.0,0.9,3.8,5.7,3.0,1.0,6.4,2.0,,,,,2.0,,,,,,,,,,,,,,,,,5.0,2.0,,2.0,2.0,2.0,,,,,,,,1.0,,,2.0,1.0,2,,1.0,1.0,2,2.0,,,,,,,,,,2.0,2.0,1.0,2.0,4.0,,,,1.0,60.0,,21.0,40.0,1.0,40.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,1.0,2.0,,1.0,2.0,,1,2,1.0,2.0,2.0,2011-07-06,2011-07-06,,,,,34.6,,19.0,,3,,,2.0,1.0,2.0,,2.0,70.0,2.0,,,,,,2.0,40.0,2.0,1.0,2.0,2.0,2.0,,,,,1.0,1.0,0.0,0.0,0.0,,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,1.0,,1.0,,,,,,,,,,,0.0,,,,,,,,2.0,,,,,,2.0,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,NaT,,,1,2.0,,1.0,1.0,,2.0,2.0,,,,,0.8,,,2.0,,2.0,2.0,,2.0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2.0,,NaT,,,,,,0.016,0.019,0


In [9]:
raw_data.shape

(42746, 409)

In [10]:
col_names = raw_data.columns.tolist()

In [11]:
col_names.sort()

In [12]:
col_names[0:5]

['ADEt1', 'ADEt2', 'ADEt3', 'ADLesTAneur', 'ADLesTCoarcNar']

#### Categorical Features
- Working list prior to 10/22/19

#### Pre-Op Categorical Features 
- as of 10/23/19

In [13]:
cat_features = ['gender',
                'racecaucasian',
                'raceblack',
                'raceasian',
                'racenativeam',
                'racnativepacific',
                'ethnicity',

                'diabetes',
                'diabctrl', 
                'dyslip',
                'dialysis',
                'hypertn', 
                'infendo',
                'infendty',

                'TobaccoUse', #combine with `cigsmoker` - PER 10-24
                'cigsmoker',  #combine with `TobaccoUse` - PER 10-24

                'chrlungd', 

                'hmo2', #CHANGING CODING PER 10-26 GUIDE
                'slpapn', 
                'ivdrugab', 
                'alcohol', 
                'liverdis',  
                'immsupp', 
                'mediastrad',  
                'cancer', 
                'pvd',  

                'syncope', 
                'unrespstat',  
                'cvd', 
                'cva', 

                'cvdtia', 

                'cvdcarsten', 

                'cvdstenrt', #H - RECODED PER 10-26 CODING GUIDE
                'cvdstenlft', #H - RECODED PER 10-26 CODING GUIDE
                'cvdpcarsurg', 
                'hitanti',
                'prcvint', 
                'prcab',
                'prvalve',

                'CardSympTimeOfAdm', #H -- BUT ALL NaNs in 2.73 -- DELETED 10-24
                'CardSympTimeOfSurg', #H -- BUT ALL NaNs in 2.73 -- DELETED 10-24

                'anginalclass', # RECODED PER 10-26 CODING GUIDE
                'chf', 
                'classnyh', # RECODED PER 10-26 CODING GUIDE
                'priorhf', 
                'carshock', 
                'resusc', 

                'Arrhythmia', ## -- NO ANALOG IN 2.73 -- ALL NaNs -- DELETED 10-24

                'ArrhythAFlutter', ## does not make sense to combine with 'ArrhythAFib' -- DELETED 10-24
                'ArrhythAFib', ## does not make sense to combine with 'ArrhythAFlutter' -- DELETED 10-24

                'ArrhythAFibDur', #GOING TO DELETE PER 10-26 GUIDE
                'arrhythwhen',    #GOING TO DELETE PER 10-26 GUIDE

                'arrhyafib',

                'medasa', 
                'medaplt5days', 
                'medinotr',
                'medlipid', 
                'numdisv', # CHANGING CODING TO MULTI-LEVEL PER 10-26 CODING GUIDE
                'hdefd',

                'vdaort',
                'vdstena',
                'vdinsufm', 
                'vdstenm', 
                'vdinsuft', 
                'incidenc', 
                'status']

In [14]:
len(cat_features)

66

#### Pre-Op Numerical Features
- as of 10/23/19

In [15]:
num_features = ['age',
                'heightcm',
                'weightkg',
                'hct',
                'creatlst',
                'totalbumin',
                'a1clvl',
                'meldscr',
                'hdef',
                'pasys']

In [16]:
len(num_features)

10

#### Pre-Op Date Features
- as of 10/23/19

In [17]:
date_features = ['surgdt']

In [18]:
len(date_features)

1

#### Outcome and Other Features
- as of 10/23/19

In [19]:
outcome_other = ['recordId', # keeping for now for auditing purposes
                 'predstro', # STS predicted probability of stroke
                 'cnstrokp',
                 'cnstrokttia',
                 #'cnstroktrind', # FOUND TO BE ALL `NaN`
                 'cncomaenceph',
                 'strokeBin'] # adding strokeBin to compare STS model prediction to actual outcome

In [20]:
len(outcome_other)

6

## Data Cleaning

### Step 1. Deleting Rows with `NaN`s in Essential Columns

In [21]:
working_data = raw_data.copy()[(raw_data['gender'].notnull()) & 
                               (raw_data['heightcm'].notnull()) & 
                               (raw_data['weightkg'].notnull())]

In [22]:
working_data.shape

(42740, 409)

- saving `working_data` as a `.pkl` File

In [23]:
# working_data.to_pickle('capstone_data_key_variable_nulls_cleaned_10_23.pkl')

### Step 2. Creating `pre_op_data`

In [24]:
pre_op_cols = num_features + date_features + cat_features + outcome_other

In [25]:
len(pre_op_cols)

83

In [26]:
pre_op_data = working_data.copy()[pre_op_cols]

In [27]:
pre_op_data.head()

Unnamed: 0,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,cigsmoker,chrlungd,hmo2,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen,arrhyafib,medasa,medaplt5days,medinotr,medlipid,numdisv,hdefd,vdaort,vdstena,vdinsufm,vdstenm,vdinsuft,incidenc,status,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin
0,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0,2011-07-01,1.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,2.0,1,2.0,,,1.0,1.0,2.0,2.0,2.0,3.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,,,,,,,3.0,2.0,,,,,4.0,2.0,,2.0,2.0,2.0,,,,,1.0,,1.0,2,2.0,1.0,4.0,1.0,1.0,2.0,4.0,2.0,2.0,1.0,1.0,1,0.014,2,2,2,0
1,65,175.3,79.4,45.0,1.2,,,,55.0,40.0,2011-07-02,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,,1.0,2.0,1,2.0,,,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,,,,,,,3.0,1.0,2.0,2.0,,,5.0,2.0,,2.0,2.0,2.0,,,,,3.0,2.0,1.0,2,2.0,2.0,4.0,1.0,1.0,2.0,3.0,2.0,2.0,1.0,3.0,2,0.017,2,2,1,0
2,83,162.60001,102.1,29.0,1.2,3.3,6.2,8.6,60.0,36.0,2011-07-04,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,,1.0,2.0,1,2.0,,,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,,,2.0,3.0,1.0,2.0,2.0,,,1.0,2.0,,2.0,2.0,2.0,,,,,3.0,2.0,1.0,1,1.0,1.0,4.0,1.0,1.0,1.0,3.0,2.0,3.0,1.0,2.0,3,0.045,2,2,1,0
3,59,160.0,127.5,35.0,0.9,3.5,7.4,6.4,60.0,35.0,2011-07-05,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,4.0,2.0,2.0,1,2.0,,,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,1.0,2.0,2.0,2.0,,,,,,,3.0,2.0,,,,,1.0,2.0,,2.0,2.0,2.0,,,,,1.0,,2.0,2,2.0,1.0,2.0,1.0,2.0,,4.0,2.0,2.0,1.0,1.0,4,0.013,2,2,1,0
4,72,160.0,64.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,2011-07-06,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,,1.0,2.0,1,2.0,,,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,,,,,,,3.0,2.0,,,,,5.0,2.0,,2.0,2.0,2.0,,,,,1.0,,1.0,2,2.0,1.0,4.0,1.0,2.0,,,,,1.0,2.0,5,0.016,2,2,2,0


In [28]:
pre_op_data.shape

(42740, 83)

- making a copy of `pre_op_df` for check if recoding was done correctly

In [29]:
orig_pre = pre_op_data.copy()

In [30]:
orig_pre.shape, pre_op_data.shape

((42740, 83), (42740, 83))

### Step 3. Recoding Columns per Data Dictionary (as of 10/22/19)

#### - recoding `yes_no_unc` columns

In [31]:
yes_no_unc = ['gender',
              'racecaucasian',
              'raceblack',
              'raceasian',
              'racenativeam',
              'racnativepacific',
              'ethnicity',
              'diabetes',
              'dyslip',
              'dialysis',
              'hypertn',
              'infendo',
              'slpapn',
              'liverdis',
              'immsupp',
              'mediastrad',
              'cancer',
              'pvd',
              
              'syncope',
              'unrespstat',
              'cvd',
              'cva',
              'cvdtia',
              'cvdpcarsurg',
              'hitanti',
              'cigsmoker',
             
              'prcvint',
              'prcab',
              'prvalve',
              'chf',
              'priorhf',
              #'Arrhythmia', ## Removing because all `NaN` for 2.73 - no way to harmonize with `2.81`
              'arrhyafib',
              'medinotr',
              'hdefd',
              'vdaort',
              'vdstena',
              'vdstenm']

In [32]:
len(yes_no_unc)

37

In [33]:
for column in yes_no_unc:
    pre_op_data[column] = pre_op_data[column].replace({1: 1, 2: 0, 3: 0, np.nan: 0})

- there should be no `NaN`s

In [34]:
pre_op_data[yes_no_unc].isnull().sum()

gender              0
racecaucasian       0
raceblack           0
raceasian           0
racenativeam        0
racnativepacific    0
ethnicity           0
diabetes            0
dyslip              0
dialysis            0
hypertn             0
infendo             0
slpapn              0
liverdis            0
immsupp             0
mediastrad          0
cancer              0
pvd                 0
syncope             0
unrespstat          0
cvd                 0
cva                 0
cvdtia              0
cvdpcarsurg         0
hitanti             0
cigsmoker           0
prcvint             0
prcab               0
prvalve             0
chf                 0
priorhf             0
arrhyafib           0
medinotr            0
hdefd               0
vdaort              0
vdstena             0
vdstenm             0
dtype: int64

- checking recoding against original in `orig_pre`

In [35]:
feature_name = []
orig_coding = []
new_coding = []
dtypes = []

for column in yes_no_unc:
    feature_name.append(column)
    orig_coding.append(sorted(orig_pre[column].unique()))
    new_coding.append(sorted(pre_op_data[column].unique()))
    dtypes.append(pre_op_data[column].dtype)

In [36]:
pd.DataFrame(list(zip(feature_name, orig_coding, new_coding, dtypes)),
             columns = ['feature', 'original_levels', 'new_levels', 'data_type'])

Unnamed: 0,feature,original_levels,new_levels,data_type
0,gender,"[1.0, 2.0]","[0.0, 1.0]",float64
1,racecaucasian,"[1.0, 2.0, nan]","[0.0, 1.0]",float64
2,raceblack,"[1.0, 2.0, nan]","[0.0, 1.0]",float64
3,raceasian,"[1.0, 2.0, nan]","[0.0, 1.0]",float64
4,racenativeam,"[1.0, 2.0, nan]","[0.0, 1.0]",float64
5,racnativepacific,"[2.0, nan, 1.0]","[0.0, 1.0]",float64
6,ethnicity,"[1.0, 2.0, nan, 3.0]","[0.0, 1.0]",float64
7,diabetes,"[1.0, 2.0, nan, 3.0]","[0.0, 1.0]",float64
8,dyslip,"[1.0, 2.0, nan, 3.0]","[0.0, 1.0]",float64
9,dialysis,"[1.0, 2.0, nan, 3.0]","[0.0, 1.0]",float64


#### - recoding `compress_to_two` columns
- compressing from > 3 original categories to `Yes`/`No`, `1`/`0`

In [37]:
compress_to_two = ['diabctrl',
                   'infendty',
                   'TobaccoUse',
                   'chrlungd',
                   'hmo2', # going to be recoded per 10-26
                   'ivdrugab',
                   'alcohol',
                   #'cvawhen', # deleted per 10/22 changes - 0.95 correlation to `cva`
                   'carshock',
                   'resusc',
                   'medasa',
                   'medaplt5days',
                   'medlipid']
                   #'numdisv' # recoded from binary variable to multi-level per 10-26 GUIDE

In [38]:
len(compress_to_two)

12

- creating `list` of `replacement_dicts`

In [39]:
replacement_dicts = [{1: 0, 2: 0, 3: 0, 4: 1, 5: 0, 6: 0, 7: 0, np.nan: 0}, #diabctrl
                     {1: 0, 2: 1, 3: 0, np.nan: 0}, #infendty
                     {1: 0, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, np.nan: 0}, #TobaccoUse
                     {1: 0, 2: 0, 3: 1, 4: 1, 5: 0, 6: 0, np.nan: 0}, #chrlungd
                     {1: 1, 2: 0, 3: 1, 4: 1, 5: 0, np.nan: 0}, #hmo2 - CHANGED PER 10-26 GUIDE, 1 now maps to 1
                     {1: 0, 2: 0, 3: 0, 4: 1, 5: 0, np.nan: 0}, #ivdrugab
                     {1: 0, 2: 0, 3: 1, 4: 0, 5: 0, np.nan: 0}, #alcohol
                     #{1: 0, 2: 1, 3: 1, 4: 1, np.nan: 0}, #cvawhen
                     {1: 0, 2: 0, 3: 1, 4: 1, np.nan: 0}, #carshock -- RENAME to `carshock24`
                     {1: 0, 2: 0, 3: 1, 4: 1, np.nan: 0}, #resusc -- RENAME to `resusc24`
                     {1: 1, 2: 0, 3: 0, 4: 0, np.nan: 0}, #medasa
                     {1: 1, 2: 0, 3: 0, 4: 0, np.nan: 0}, #medaplt5days
                     {1: 1, 2: 0, 3: 0, 4: 0, np.nan: 0}] #medlipid
                     #{1: 0, 2: 1, 3: 1, 4: 1, np.nan: 0}] #numdisv - going to go from binary to multi-level

In [40]:
print (len(compress_to_two))
print (len(replacement_dicts))

12
12


- since we need to rename columns, before recoding, want to keep orignal coding for auditing purposes

In [41]:
pre_op_data.shape

(42740, 83)

In [42]:
pre_op_data['carshock_orig'] = pre_op_data['carshock']
pre_op_data['resusc_orig'] = pre_op_data['resusc']

In [43]:
# added two columns
pre_op_data.shape

(42740, 85)

- now recoding the features in `compress_to_two`

In [44]:
name_replacement_zip = list(zip(compress_to_two, replacement_dicts))

- iterate through `name_replacement_zip` and apply `replacement_dicts` to features in `compress_to_two`

In [45]:
for column, dictionary in name_replacement_zip:
    pre_op_data[column] = pre_op_data[column].replace(dictionary)

- checking recoding against original in `orig_pre`

In [46]:
feature_name = []
orig_coding = []
new_coding = []
dtypes = []

for column in compress_to_two:
    feature_name.append(column)
    orig_coding.append(sorted(orig_pre[column].unique()))
    new_coding.append(sorted(pre_op_data[column].unique()))
    dtypes.append(pre_op_data[column].dtype)

In [47]:
pd.DataFrame(list(zip(feature_name, orig_coding, new_coding, dtypes)),
             columns = ['feature', 'original_levels', 'new_levels', 'dtypes'])

Unnamed: 0,feature,original_levels,new_levels,dtypes
0,diabctrl,"[1.0, nan, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]","[0.0, 1.0]",float64
1,infendty,"[nan, 1.0, 2.0]","[0.0, 1.0]",float64
2,TobaccoUse,"[nan, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]","[0.0, 1.0]",float64
3,chrlungd,"[1.0, 2.0, 3.0, 4.0, nan, 5.0, 6.0]","[0.0, 1.0]",float64
4,hmo2,"[1.0, 2.0, 3.0, 4.0, nan, 5.0]","[0.0, 1.0]",float64
5,ivdrugab,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","[0.0, 1.0]",float64
6,alcohol,"[1.0, 2.0, 3.0, nan, 4.0, 5.0]","[0.0, 1.0]",float64
7,carshock,"[1.0, 2.0, 3.0, 4.0, nan]","[0.0, 1.0]",float64
8,resusc,"[1.0, 2.0, 3.0, 4.0, nan]","[0.0, 1.0]",float64
9,medasa,"[1.0, 2.0, 3.0, 4.0, nan]","[0.0, 1.0]",float64


### Combining `TobaccoUse` and `cigsmoker`
- `TobaccoUse` is all `NaN` for `2.73` observations

In [48]:
pre_op_data['TobaccoUse'].value_counts()

0.0    30135
1.0    12605
Name: TobaccoUse, dtype: int64

In [49]:
pre_op_data['cigsmoker'].value_counts()

0.0    36074
1.0     6666
Name: cigsmoker, dtype: int64

- `TobaccoUse` and `cigsmoker` do not overlap so it makes sense to combine

In [50]:
pre_op_data[(pre_op_data['TobaccoUse'] == 1) & (pre_op_data['cigsmoker'] == 1)] 

Unnamed: 0,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,cigsmoker,chrlungd,hmo2,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen,arrhyafib,medasa,medaplt5days,medinotr,medlipid,numdisv,hdefd,vdaort,vdstena,vdinsufm,vdstenm,vdinsuft,incidenc,status,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin,carshock_orig,resusc_orig


#### Creating `Tobacco_smoker_combo`

In [51]:
pre_op_data['Tobacco_Combined'] = pre_op_data['TobaccoUse'] + pre_op_data['cigsmoker']

In [52]:
pre_op_data['Tobacco_Combined'].value_counts()

0.0    23469
1.0    19271
Name: Tobacco_Combined, dtype: int64

#### Editing `yes_no_unc` and `compress_to_two` feature lists to reflect new feature combination - `Tobacco_Combined`

In [53]:
yes_no_unc = ['gender',
              'racecaucasian',
              'raceblack',
              'raceasian',
              'racenativeam',
              'racnativepacific',
              'ethnicity',
              'diabetes',
              'dyslip',
              'dialysis',
              'hypertn',
              'infendo',
              'slpapn',
              'liverdis',
              'immsupp',
              'mediastrad',
              'cancer',
              'pvd',
              
              'syncope',
              'unrespstat',
              'cvd',
              'cva',
              'cvdtia',
              'cvdpcarsurg',
              'hitanti',
              #'cigsmoker', # combined with `TobaccoUse` in `compress_to_two`
             
              'prcvint',
              'prcab',
              'prvalve',
              'chf',
              'priorhf',
              #'Arrhythmia', ## Removing because all `NaN` for 2.73 - no way to harmonize with `2.81`
              'arrhyafib',
              'medinotr',
              'hdefd',
              'vdaort',
              'vdstena',
              'vdstenm']

In [54]:
len(yes_no_unc)

36

In [55]:
compress_to_two = ['diabctrl',
                   'infendty',
                   'Tobacco_Combined',
                   #'TobaccoUse', # combined with `cigsmoker` in `yes_no_unc`
                   'chrlungd',
                   'hmo2',
                   'ivdrugab',
                   'alcohol',
                   #'cvawhen', # deleted per 10/22 changes - 0.95 correlation to `cva`
                   'carshock',
                   'resusc',
                   'medasa',
                   'medaplt5days',
                   'medlipid']
                   # 'numdisv'# recoding from binary to multi-level per 10-26 GUIDE

In [56]:
len(compress_to_two)

12

In [57]:
len(yes_no_unc) + len(compress_to_two)

48

### Recoding `recode_D` Features - Will Need to Specify a Reference Class when Create Dummies

In [58]:
recode_D = ['numdisv', # recoding from binary to multi-level per 10-26 GUIDE
            'anginalclass',
            'classnyh',
            'vdinsufm',
            'vdinsuft',
            'incidenc',
            'status'] # got rid of 'CardSympTimeOfAdm' and 'CardSympTimeOfSurg' b/c not applicable to 2.73

In [59]:
replacement_dicts_alpha = [{1: 'NONE',
                            2: '1_CORONARY',
                            3: '2_CORONARIES',
                            4: '3_CORONARIES',
                            np.nan: 'NONE'}, #numdisv recooded from binary to multi-level per 10-26 GUIDE
                                             #will now result in 4 dummies (3 + 1 reference class)
                                             # +3 net features in ALL vs 10-24, TREE feature count unchanged
    
                           {1: 'NONE', 
                            2: 'STRENUOUS_ACTIVITY', # RECODED FROM `SLIGHT` PER 10-26
                            3: 'SLIGHT_LIMITATION_ACTIVITY', # RECODED FROM `SLIGHT` PER 10-26
                            4: 'MARKED_LIMITATION_ACTIVITY', # RECODED FROM `REST` PER 10-26
                            5: 'ANGINA_AT_REST', # RECODED FROM `REST` PER 10-26
                            np.nan: 'NONE'}, #anginalclass -- will now result in 5 dummies (4 + 1 ref class)
                                             # +2 net features in ALL vs 10-24, TREE feature count unchanged
                           
                           {1: 'NONE', 
                            2: 'SLIGHT_LIMITATION', # RECODED FROM `SLIGHT` PER 10-26 
                            3: 'MARKED_LIMITATION', # RECODED FROM `SLIGHT` PER 10-26 
                            4: 'ANY_ACTIVITY', # RECODED FROM `REST` PER 10-26
                            np.nan: 'NONE'}, #classnyh - will now result in 4 dummies (3 + 1 ref class)
                                             # +1 net features in ALL vs 10-24, TREE feature count unchanged
                           
                           {0: 'NONE', 
                            1: 'TRIVIAL', 
                            2: 'MILD', 
                            3: 'MODERATE',
                            4: 'SEVERE',
                            5: 'NONE',
                            np.nan: 'NONE'}, #vdinsufm
                           
                           {0: 'NONE', 
                            1: 'TRIVIAL', 
                            2: 'MILD', 
                            3: 'MODERATE',
                            4: 'SEVERE',
                            5: 'NONE',
                            np.nan: 'NONE'}, #vdinsuft
                           
                           {1: 'NONE', 
                            2: 'FIRST', 
                            3: 'SECOND', 
                            4: 'THIRD',
                            5: 'FOURTH', 
                            np.nan: 'NONE'}, #incidenc -- NEED TO RENAME incidence_REOP
                           
                           {1: 'NONE', 
                            2: 'URGENT', 
                            3: 'EMERGENCY', 
                            4: 'SALVAGE',
                            np.nan: 'NONE'}] #status

- going to work on a subset of `pre_op_data`

In [60]:
pre_op_data.head(1)

Unnamed: 0,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,cigsmoker,chrlungd,hmo2,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen,arrhyafib,medasa,medaplt5days,medinotr,medlipid,numdisv,hdefd,vdaort,vdstena,vdinsufm,vdstenm,vdinsuft,incidenc,status,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin,carshock_orig,resusc_orig,Tobacco_Combined
0,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0,2011-07-01,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,,,4.0,0.0,,0.0,0.0,0.0,,,,,1.0,0.0,1.0,0,0.0,1.0,4.0,1.0,1.0,0.0,4.0,0.0,2.0,1.0,1.0,1,0.014,2,2,2,0,2.0,2.0,1.0


In [61]:
pre_op_data.shape

(42740, 86)

In [62]:
recode_D_df = pre_op_data.copy()[recode_D]

In [63]:
recode_D_df.shape

(42740, 7)

In [64]:
name_replacement_zip = list(zip(recode_D, replacement_dicts_alpha))

- iterate through `name_replacement_zip` and apply `replacement_dicts_alpha` to features in `recode_D`

In [65]:
for column, dictionary in name_replacement_zip:
    recode_D_df[column] = recode_D_df[column].replace(dictionary)

- checking recoding against original in `orig_pre`

In [66]:
feature_name = []
orig_coding = []
new_coding = []
dtypes = []

for column in recode_D:
    feature_name.append(column)
    orig_coding.append(sorted(orig_pre[column].unique()))
    new_coding.append(sorted(recode_D_df[column].unique()))
    dtypes.append(recode_D_df[column].dtype)

In [67]:
pd.DataFrame(list(zip(feature_name, orig_coding, new_coding, dtypes)),
             columns = ['feature', 'original_levels', 'new_levels', 'dtype'])

Unnamed: 0,feature,original_levels,new_levels,dtype
0,numdisv,"[1.0, 2.0, 3.0, 4.0, nan]","[1_CORONARY, 2_CORONARIES, 3_CORONARIES, NONE]",object
1,anginalclass,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","[ANGINA_AT_REST, MARKED_LIMITATION_ACTIVITY, NONE, SLIGHT_LIMITATION_ACTIVITY, STRENUOUS_ACTIVITY]",object
2,classnyh,"[nan, 1.0, 2.0, 3.0, 4.0]","[ANY_ACTIVITY, MARKED_LIMITATION, NONE, SLIGHT_LIMITATION]",object
3,vdinsufm,"[0.0, 1.0, 2.0, 3.0, 4.0, nan, 5.0]","[MILD, MODERATE, NONE, SEVERE, TRIVIAL]",object
4,vdinsuft,"[2.0, 3.0, nan, 0.0, 1.0, 4.0, 5.0]","[MILD, MODERATE, NONE, SEVERE, TRIVIAL]",object
5,incidenc,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","[FIRST, FOURTH, NONE, SECOND, THIRD]",object
6,status,"[1.0, 2.0, 3.0, nan, 4.0]","[EMERGENCY, NONE, SALVAGE, URGENT]",object


#### Validating `recode_D_df`

In [68]:
recode_D_df.head()

Unnamed: 0,numdisv,anginalclass,classnyh,vdinsufm,vdinsuft,incidenc,status
0,3_CORONARIES,MARKED_LIMITATION_ACTIVITY,NONE,SEVERE,MILD,NONE,NONE
1,3_CORONARIES,ANGINA_AT_REST,NONE,MODERATE,MILD,NONE,EMERGENCY
2,3_CORONARIES,NONE,NONE,MODERATE,MODERATE,NONE,URGENT
3,1_CORONARY,NONE,NONE,SEVERE,MILD,NONE,NONE
4,3_CORONARIES,ANGINA_AT_REST,NONE,NONE,NONE,NONE,URGENT


In [69]:
print (len(recode_D))
print (recode_D_df.shape)
print (pre_op_data.shape)

7
(42740, 7)
(42740, 86)


- renaming `incidenc` to `incidencREOP` for the `recode_D_df` for use in `Decision Trees`

In [70]:
recode_D_df = recode_D_df.rename(columns={'incidenc': 'incidencREOP'})

In [71]:
recode_D_df.head(1)

Unnamed: 0,numdisv,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status
0,3_CORONARIES,MARKED_LIMITATION_ACTIVITY,NONE,SEVERE,MILD,NONE,NONE


### Now creating `recode_D_sklearn` for use in `sklearn` `Decision Trees`
- transform `text` codes to `numeric` for use in `sklearn` ML algos

In [72]:
recode_D_sklearn = recode_D_df.copy()

In [73]:
recode_D_df.shape, recode_D_sklearn.shape

((42740, 7), (42740, 7))

In [74]:
recode_D_sklearn.head(2)

Unnamed: 0,numdisv,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status
0,3_CORONARIES,MARKED_LIMITATION_ACTIVITY,NONE,SEVERE,MILD,NONE,NONE
1,3_CORONARIES,ANGINA_AT_REST,NONE,MODERATE,MILD,NONE,EMERGENCY


- recoding `numdisv`

In [75]:
recode_D_sklearn['numdisv'].value_counts()

3_CORONARIES    25071
2_CORONARIES     7608
NONE             6828
1_CORONARY       3233
Name: numdisv, dtype: int64

In [76]:
recode_D_sklearn = recode_D_sklearn.replace({'numdisv': {'NONE': 0,
                                                         '1_CORONARY': 1,
                                                         '2_CORONARIES': 2,
                                                         '3_CORONARIES': 3}})

In [77]:
recode_D_sklearn['numdisv'].value_counts()

3    25071
2     7608
0     6828
1     3233
Name: numdisv, dtype: int64

- recoding `anginalclass`

In [78]:
recode_D_sklearn['anginalclass'].value_counts()

NONE                          13470
MARKED_LIMITATION_ACTIVITY    11988
ANGINA_AT_REST                 9915
SLIGHT_LIMITATION_ACTIVITY     5806
STRENUOUS_ACTIVITY             1561
Name: anginalclass, dtype: int64

In [79]:
recode_D_sklearn = recode_D_sklearn.replace({'anginalclass': {'NONE': 0, 
                                                              'STRENUOUS_ACTIVITY': 1, 
                                                              'SLIGHT_LIMITATION_ACTIVITY': 2,
                                                              'MARKED_LIMITATION_ACTIVITY': 3, 
                                                              'ANGINA_AT_REST': 4}})

In [80]:
recode_D_sklearn['anginalclass'].value_counts()

0    13470
3    11988
4     9915
2     5806
1     1561
Name: anginalclass, dtype: int64

- recoding `classnyh`

In [81]:
recode_D_sklearn['classnyh'].value_counts()

NONE                 34835
MARKED_LIMITATION     3604
ANY_ACTIVITY          2258
SLIGHT_LIMITATION     2043
Name: classnyh, dtype: int64

In [82]:
recode_D_sklearn = recode_D_sklearn.replace({'classnyh': {'NONE': 0, 
                                                          'SLIGHT_LIMITATION': 1, 
                                                          'MARKED_LIMITATION': 2, 
                                                          'ANY_ACTIVITY': 3}})

In [83]:
recode_D_sklearn['classnyh'].value_counts()

0    34835
2     3604
3     2258
1     2043
Name: classnyh, dtype: int64

- recoding `vdinsufm`

In [84]:
recode_D_sklearn['vdinsufm'].value_counts()

NONE        15876
MILD        10695
TRIVIAL      9276
SEVERE       3528
MODERATE     3365
Name: vdinsufm, dtype: int64

In [85]:
recode_D_sklearn = recode_D_sklearn.replace({'vdinsufm': {'NONE': 0,
                                                          'TRIVIAL': 1,
                                                          'MILD': 2,
                                                          'MODERATE': 3,
                                                          'SEVERE': 4}})

In [86]:
recode_D_sklearn['vdinsufm'].value_counts()

0    15876
2    10695
1     9276
4     3528
3     3365
Name: vdinsufm, dtype: int64

- recoding `vdinsuft`

In [87]:
recode_D_sklearn['vdinsuft'].value_counts()

NONE        18166
TRIVIAL     11418
MILD        10705
MODERATE     2212
SEVERE        239
Name: vdinsuft, dtype: int64

In [88]:
recode_D_sklearn = recode_D_sklearn.replace({'vdinsuft': {'NONE': 0,
                                                          'TRIVIAL': 1,
                                                          'MILD': 2,
                                                          'MODERATE': 3,
                                                          'SEVERE': 4}})

In [89]:
recode_D_sklearn['vdinsuft'].value_counts()

0    18166
1    11418
2    10705
3     2212
4      239
Name: vdinsuft, dtype: int64

- recoding `incidencREOP`

In [90]:
recode_D_sklearn['incidencREOP'].value_counts()

NONE      40408
FIRST      2174
SECOND      130
THIRD        19
FOURTH        9
Name: incidencREOP, dtype: int64

In [91]:
recode_D_sklearn = recode_D_sklearn.replace({'incidencREOP': {'NONE': 0,
                                                              'FIRST': 1,
                                                              'SECOND': 2,
                                                              'THIRD': 3,
                                                              'FOURTH': 4}})

In [92]:
recode_D_sklearn['incidencREOP'].value_counts()

0    40408
1     2174
2      130
3       19
4        9
Name: incidencREOP, dtype: int64

- recoding `status`

In [93]:
recode_D_sklearn['status'].value_counts()

NONE         21016
URGENT       20731
EMERGENCY      966
SALVAGE         27
Name: status, dtype: int64

In [94]:
recode_D_sklearn = recode_D_sklearn.replace({'status': {'NONE': 0,
                                                        'URGENT': 1,
                                                        'EMERGENCY': 2,
                                                        'SALVAGE': 3}})

In [95]:
recode_D_sklearn['status'].value_counts()

0    21016
1    20731
2      966
3       27
Name: status, dtype: int64

- checking `dtypes`

In [96]:
recode_D_sklearn.dtypes

numdisv         int64
anginalclass    int64
classnyh        int64
vdinsufm        int64
vdinsuft        int64
incidencREOP    int64
status          int64
dtype: object

In [97]:
recode_D_sklearn.head()

Unnamed: 0,numdisv,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status
0,3,3,0,4,2,0,0
1,3,4,0,3,2,0,2
2,3,0,0,3,3,0,1
3,1,0,0,4,2,0,0
4,3,4,0,0,0,0,1


In [98]:
pre_op_data.shape, recode_D_df.shape, recode_D_sklearn.shape

((42740, 86), (42740, 7), (42740, 7))

### Creating Dummy Variables from Recoded Features - `recode_D_Dummies`

#### Now creating `dummies`

- applying `pd.get_dummies()`

In [99]:
recode_D_Dummies = pd.get_dummies(recode_D_df.copy())

In [100]:
recode_D_Dummies.head()

Unnamed: 0,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,numdisv_NONE,anginalclass_ANGINA_AT_REST,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_NONE,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_STRENUOUS_ACTIVITY,classnyh_ANY_ACTIVITY,classnyh_MARKED_LIMITATION,classnyh_NONE,classnyh_SLIGHT_LIMITATION,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_NONE,vdinsufm_SEVERE,vdinsufm_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_NONE,vdinsuft_SEVERE,vdinsuft_TRIVIAL,incidencREOP_FIRST,incidencREOP_FOURTH,incidencREOP_NONE,incidencREOP_SECOND,incidencREOP_THIRD,status_EMERGENCY,status_NONE,status_SALVAGE,status_URGENT
0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0
1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0
2,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1
3,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0
4,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1


In [101]:
print (recode_D_df.shape)
print (recode_D_Dummies.shape)

(42740, 7)
(42740, 32)


- now need to eliminate reference classes
- identifying nan columns to drop

In [102]:
drop_cols = [col for col in recode_D_Dummies.columns if col.endswith('_NONE')]

In [103]:
drop_cols

['numdisv_NONE',
 'anginalclass_NONE',
 'classnyh_NONE',
 'vdinsufm_NONE',
 'vdinsuft_NONE',
 'incidencREOP_NONE',
 'status_NONE']

In [104]:
len(drop_cols)

7

- dropping the columns

In [105]:
recode_D_Dummies = recode_D_Dummies.drop(drop_cols, axis=1)

In [106]:
recode_D_Dummies.head()

Unnamed: 0,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_ANGINA_AT_REST,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_STRENUOUS_ACTIVITY,classnyh_ANY_ACTIVITY,classnyh_MARKED_LIMITATION,classnyh_SLIGHT_LIMITATION,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsufm_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,vdinsuft_TRIVIAL,incidencREOP_FIRST,incidencREOP_FOURTH,incidencREOP_SECOND,incidencREOP_THIRD,status_EMERGENCY,status_SALVAGE,status_URGENT
0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1
3,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [107]:
recode_D_Dummies.shape

(42740, 25)

- reordering the columns for readability

In [108]:
new_col_order = ['numdisv_1_CORONARY',
                 'numdisv_2_CORONARIES',
                 'numdisv_3_CORONARIES',
                 
                 'anginalclass_STRENUOUS_ACTIVITY',
                 'anginalclass_SLIGHT_LIMITATION_ACTIVITY',
                 'anginalclass_MARKED_LIMITATION_ACTIVITY',
                 'anginalclass_ANGINA_AT_REST',
                 
                 'classnyh_SLIGHT_LIMITATION',
                 'classnyh_MARKED_LIMITATION',
                 'classnyh_ANY_ACTIVITY',
    
                 'vdinsufm_TRIVIAL',
                 'vdinsufm_MILD',
                 'vdinsufm_MODERATE',
                 'vdinsufm_SEVERE',

                 'vdinsuft_TRIVIAL',
                 'vdinsuft_MILD',
                 'vdinsuft_MODERATE',
                 'vdinsuft_SEVERE',

                 'incidencREOP_FIRST',
                 'incidencREOP_SECOND',
                 'incidencREOP_THIRD',
                 'incidencREOP_FOURTH',

                 'status_URGENT',
                 'status_EMERGENCY',
                 'status_SALVAGE']

In [109]:
len(new_col_order)

25

- reordering columns
- syntax tip: if manually specifically column order instead of passing a list `df[['col_a', 'col_c', 'col_b']]`

In [110]:
recode_D_Dummies = recode_D_Dummies[new_col_order]

In [111]:
recode_D_Dummies.head()

Unnamed: 0,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_STRENUOUS_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_ANGINA_AT_REST,classnyh_SLIGHT_LIMITATION,classnyh_MARKED_LIMITATION,classnyh_ANY_ACTIVITY,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE
0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0
2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [112]:
recode_D_Dummies.shape

(42740, 25)

### Recoding `recode_D_P` Features - Dropping Reference Class when Create Dummies

In [113]:
recode_D_P = ['cvdcarsten',
              'cvdstenrt',   # will be recoded per 10-26 GUIDE
              'cvdstenlft']  # will be recoded per 10-26 GUIDE
              #'arrhythwhen' # prior version combined `ArrhythAFibDur` and `arrhythwhen`
                             # 10-26 GUIDE discarded both features

- did not recode the following features in this iteration due to all `NaN`s in `2.73` and no way to harmonize them between `2.73` and `2.81`
- the following features were discarded as a result as of the 10-26 GUIDE
- `ArrhythAFlutter`
- `ArrhythAFib`
- `ArrhythAFibDur`
- `arrhythwhen`

#### Defining `replacement_dicts` for `recode_D_P` Features

In [114]:
replacement_dicts_alpha = [{1: 'NONE', 
                            2: 'RIGHT', 
                            3: 'LEFT', 
                            4: 'BOTH',
                            np.nan: 'NONE'}, #cvdcarsten
                           
                           {1: '80-99%', 
                            2: '100%', 
                            #3: '50%-79%', # `3` NOW MAPS TO `NONE` PER 10-26, was `50%-79%` in 10-24
                            3: 'NONE', # NEW CODING PER 10-26 GUIDE
                            4: 'NONE',
                            np.nan: 'NONE'}, #cvdstenrt - net change of 1 feature in ALL vs 24, no change TREE
                           
                           {1: '80-99%', 
                            2: '100%', 
                            #3: '50%-79%', # `3` NOW MAPS TO `NONE` PER 10-26, was `50%-79%` in 10-24
                            3: 'NONE', # NEW CODING PER 10-26 GUIDE
                            4: 'NONE',
                            np.nan: 'NONE'}] #cvdstenlft - net change of 1 feature in ALL vs 24, no change TREE
                           
                           #{1: 'SHORT', 
                           #2: 'LONG', 
                           #3: 'NONE', 
                           #0: 'NONE'}] #ArrhythDur_when_Combo - DISCARDED PER 10-26 GUIDE
                                        #reduction in 3 dummies (2 + 1 ref class)
                                        # net -2 features in ALL vs 10-24, TREE version -1

In [115]:
pre_op_data.head(1)

Unnamed: 0,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,TobaccoUse,cigsmoker,chrlungd,hmo2,slpapn,ivdrugab,alcohol,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdcarsten,cvdstenrt,cvdstenlft,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,chf,classnyh,priorhf,carshock,resusc,Arrhythmia,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen,arrhyafib,medasa,medaplt5days,medinotr,medlipid,numdisv,hdefd,vdaort,vdstena,vdinsufm,vdstenm,vdinsuft,incidenc,status,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin,carshock_orig,resusc_orig,Tobacco_Combined
0,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0,2011-07-01,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,,,4.0,0.0,,0.0,0.0,0.0,,,,,1.0,0.0,1.0,0,0.0,1.0,4.0,1.0,1.0,0.0,4.0,0.0,2.0,1.0,1.0,1,0.014,2,2,2,0,2.0,2.0,1.0


In [116]:
pre_op_data.shape

(42740, 86)

#### Replacing `arrhythwhen` with `ArrhythDur_when_Combo` in `recode_D_P`

In [117]:
recode_D_P = ['cvdcarsten',
              'cvdstenrt',
              'cvdstenlft']
              #'ArrhythDur_when_Combo'] # DISCARDED PER 10-26 GUIDE

- creating subset of features `recode_D_P`

In [118]:
recode_D_P_df = pre_op_data.copy()[recode_D_P]

In [119]:
recode_D_P_df.head()

Unnamed: 0,cvdcarsten,cvdstenrt,cvdstenlft
0,,,
1,,,
2,1.0,,
3,,,
4,,,


In [120]:
recode_D_P_df.shape

(42740, 3)

- `zip`ping together `recode_D_P` feature list and `replacement_dicts_alpha`

In [121]:
name_replacement_zip = list(zip(recode_D_P, replacement_dicts_alpha))

- iterate through `name_replacement_zip` and apply `replacement_dicts_alpha` to features in `recode_D_P`

In [122]:
for column, dictionary in name_replacement_zip:
    recode_D_P_df[column] = recode_D_P_df[column].replace(dictionary)

#### Validating `recode_D_P_df`

In [123]:
recode_D_P_df.head()

Unnamed: 0,cvdcarsten,cvdstenrt,cvdstenlft
0,NONE,NONE,NONE
1,NONE,NONE,NONE
2,NONE,NONE,NONE
3,NONE,NONE,NONE
4,NONE,NONE,NONE


In [124]:
print (len(recode_D_P))
print (recode_D_P_df.shape)
print (pre_op_data.shape)

3
(42740, 3)
(42740, 86)


### Now creating `recode_D_P_sklearn` for use in `sklearn` `Decision Trees`
- transform `text` codes to `numeric` for use in `sklearn` ML algos

In [125]:
recode_D_P_sklearn = recode_D_P_df.copy()

In [126]:
recode_D_P_df.shape, recode_D_P_sklearn.shape

((42740, 3), (42740, 3))

In [127]:
recode_D_P_sklearn.head(2)

Unnamed: 0,cvdcarsten,cvdstenrt,cvdstenlft
0,NONE,NONE,NONE
1,NONE,NONE,NONE


- recoding `cvdcarsten`

In [128]:
recode_D_P_sklearn['cvdcarsten'].value_counts()

NONE     39666
RIGHT     1116
LEFT      1056
BOTH       902
Name: cvdcarsten, dtype: int64

In [129]:
recode_D_P_sklearn = recode_D_P_sklearn.replace({'cvdcarsten': {'NONE': 0,
                                                                'RIGHT': 1,
                                                                'LEFT': 2,
                                                                'BOTH': 3}})

In [130]:
recode_D_P_sklearn['cvdcarsten'].value_counts()

0    39666
1     1116
2     1056
3      902
Name: cvdcarsten, dtype: int64

- recoding `cvdstenrt`

In [131]:
recode_D_P_sklearn['cvdstenrt'].value_counts()

NONE      42134
80-99%      403
100%        203
Name: cvdstenrt, dtype: int64

In [132]:
recode_D_P_sklearn = recode_D_P_sklearn.replace({'cvdstenrt': {'NONE': 0,
                                                               '80-99%': 1,
                                                               '100%': 2}})

In [133]:
recode_D_P_sklearn['cvdstenrt'].value_counts()

0    42134
1      403
2      203
Name: cvdstenrt, dtype: int64

- recoding `cvdstenlft`

In [134]:
recode_D_P_sklearn['cvdstenlft'].value_counts()

NONE      42209
80-99%      363
100%        168
Name: cvdstenlft, dtype: int64

In [135]:
recode_D_P_sklearn = recode_D_P_sklearn.replace({'cvdstenlft': {'NONE': 0,
                                                                '80-99%': 1,
                                                                '100%': 2}})

In [136]:
recode_D_P_sklearn['cvdstenlft'].value_counts()

0    42209
1      363
2      168
Name: cvdstenlft, dtype: int64

- checking `dtypes`

In [137]:
recode_D_P_sklearn.dtypes

cvdcarsten    int64
cvdstenrt     int64
cvdstenlft    int64
dtype: object

In [138]:
recode_D_P_sklearn.head()

Unnamed: 0,cvdcarsten,cvdstenrt,cvdstenlft
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0


In [139]:
pre_op_data.shape, recode_D_P_df.shape, recode_D_P_sklearn.shape

((42740, 86), (42740, 3), (42740, 3))

### Creating Dummy Variables from Recoded Features - `recode_D_P_Dummies`

#### Now creating `dummies`
- applying `pd.get_dummies()`

In [140]:
recode_D_P_Dummies = pd.get_dummies(recode_D_P_df.copy())

In [141]:
recode_D_P_Dummies.head()

Unnamed: 0,cvdcarsten_BOTH,cvdcarsten_LEFT,cvdcarsten_NONE,cvdcarsten_RIGHT,cvdstenrt_100%,cvdstenrt_80-99%,cvdstenrt_NONE,cvdstenlft_100%,cvdstenlft_80-99%,cvdstenlft_NONE
0,0,0,1,0,0,0,1,0,0,1
1,0,0,1,0,0,0,1,0,0,1
2,0,0,1,0,0,0,1,0,0,1
3,0,0,1,0,0,0,1,0,0,1
4,0,0,1,0,0,0,1,0,0,1


In [142]:
recode_D_P_Dummies.shape

(42740, 10)

#### Getting Rid of Reference Classes
- identifying `NaN` or `NONE` columns to drop

In [143]:
drop_cols = [col for col in recode_D_P_Dummies.columns if col.endswith('_NONE')]

In [144]:
drop_cols

['cvdcarsten_NONE', 'cvdstenrt_NONE', 'cvdstenlft_NONE']

In [145]:
len(drop_cols)

3

In [146]:
recode_D_P_Dummies.shape

(42740, 10)

- dropping the columns

In [147]:
recode_D_P_Dummies = recode_D_P_Dummies.drop(drop_cols, axis=1)

In [148]:
recode_D_P_Dummies.shape

(42740, 7)

- reordering columns for readability

In [149]:
new_col_order = ['cvdcarsten_RIGHT',
                 'cvdcarsten_LEFT', 
                 'cvdcarsten_BOTH',
                 
                 #'cvdstenrt_50%-79%', # RECODED TO `NONE` OR REFERENCE CLASS PER 10-26 GUIDE
                 'cvdstenrt_80-99%',
                 'cvdstenrt_100%',
                 
                 #'cvdstenlft_50%-79%', # RECODED TO `NONE` OR REFERENCE CLASS PER 10-26 GUIDE
                 'cvdstenlft_80-99%',
                 'cvdstenlft_100%']

                 #'ArrhythDur_when_Combo_SHORT', # DISCARDED PER 10-26 GUIDE
                 #'ArrhythDur_when_Combo_LONG']  # DISCARDED PER 10-26 GUIDE

In [150]:
len(new_col_order)

7

- reordering columns

In [151]:
recode_D_P_Dummies = recode_D_P_Dummies[new_col_order]

In [152]:
recode_D_P_Dummies.head()

Unnamed: 0,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0


In [153]:
recode_D_P_Dummies.shape

(42740, 7)

### `datetime` Features

In [154]:
date_features

['surgdt']

In [155]:
dates_df = pre_op_data.copy()[date_features]

In [156]:
dates_df.head()

Unnamed: 0,surgdt
0,2011-07-01
1,2011-07-02
2,2011-07-04
3,2011-07-05
4,2011-07-06


In [157]:
dates_df.shape

(42740, 1)

#### Extracting additional features from `surgdt`

In [158]:
def date_components(data, col_labels):
    '''this function extracts date components from datetime objecr and recenters them
       where appropriate
    '''
    dates_frame = data.apply(lambda x: pd.Series([x.month,
                                                  x.day,
                                                  x.weekday()]))
    dates_frame.columns = col_labels
        
    return dates_frame

In [159]:
surgdt_col_labels = ['surgdt_month',
                     'surgdt_DayOfMonth',
                     'surgdt_DayOfWeek']

In [160]:
surgdt_features = date_components(dates_df['surgdt'], surgdt_col_labels)

In [161]:
surgdt_features.head()

Unnamed: 0,surgdt_month,surgdt_DayOfMonth,surgdt_DayOfWeek
0,7,1,4
1,7,2,5
2,7,4,0
3,7,5,1
4,7,6,2


- checking unique values created by `date_components` function

In [162]:
sorted(surgdt_features['surgdt_month'].unique())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [163]:
sorted(surgdt_features['surgdt_DayOfMonth'].unique())

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31]

- according to the `datetime` documentation `Monday` is coded as `0` and `Sunday` as a `6`

In [164]:
sorted(surgdt_features['surgdt_DayOfWeek'].unique())

[0, 1, 2, 3, 4, 5, 6]

- going to `bin` `surgdt_DayOfMonth`

In [165]:
bins = [0, 10, 20, np.inf]
names = [1, 2, 3]

In [166]:
surgdt_features['surgdt_PartOfMonth'] = pd.cut(surgdt_features['surgdt_DayOfMonth'],
                                               bins,
                                               labels=names)

In [167]:
surgdt_features.head()

Unnamed: 0,surgdt_month,surgdt_DayOfMonth,surgdt_DayOfWeek,surgdt_PartOfMonth
0,7,1,4,1
1,7,2,5,1
2,7,4,0,1
3,7,5,1,1
4,7,6,2,1


In [168]:
surgdt_features.shape

(42740, 4)

- dropping `surgdt_DayOfMonth` since we recoded by binning to create `surgdt_PartOfMonth`

In [169]:
surgdt_features = surgdt_features.drop('surgdt_DayOfMonth', axis=1)

In [170]:
surgdt_features.shape

(42740, 3)

#### Creating `surgdt_features_sklearn` 
- `surgdt_features` is currently in a format that can be used by `sklearn` in `Decision Trees`
- making a copy for use by `sklearn` `Decision Trees`

In [171]:
surgdt_features_sklearn = surgdt_features.copy()

In [172]:
surgdt_features_sklearn.head()

Unnamed: 0,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth
0,7,4,1
1,7,5,1
2,7,0,1
3,7,1,1
4,7,2,1


In [173]:
surgdt_features_sklearn.shape

(42740, 3)

#### Now recoding `surgdt_features` in preparation to use `pd.get_dummies()`

In [174]:
surgdt_features.head(2)

Unnamed: 0,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth
0,7,4,1
1,7,5,1


In [175]:
surgdt_features.shape

(42740, 3)

In [176]:
weekday_dict = {0: "Mon",
                1: "Tues",
                2: "Wed",
                3: "Thurs",
                4: "Fri",
                5: "Sat",
                6: "Sun"}

- going to `dummy` code `DayOfWeek`
- replacing numerical values with text using `df.replace({'col_name': replacement_dict})`

In [177]:
surgdt_features = surgdt_features.replace({'surgdt_DayOfWeek': weekday_dict})

In [178]:
surgdt_features.head()

Unnamed: 0,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth
0,7,Fri,1
1,7,Sat,1
2,7,Mon,1
3,7,Tues,1
4,7,Wed,1


In [179]:
sorted(surgdt_features['surgdt_month'].unique())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [180]:
month_dict = {1: "Jan",
              2: "Feb",
              3: "Mar",
              4: "Apr",
              5: "May",
              6: "Jun",
              7: "Jul",
              8: "Aug",
              9: "Sep",
              10: "Oct",
              11: "Nov",
              12: "Dec"}

- going to `dummy` code `surgdt_month`
- replacing numerical values with text using `df.replace({'col_name': replacement_dict})`

In [181]:
surgdt_features = surgdt_features.replace({'surgdt_month': month_dict})

In [182]:
surgdt_features.head()

Unnamed: 0,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth
0,Jul,Fri,1
1,Jul,Sat,1
2,Jul,Mon,1
3,Jul,Tues,1
4,Jul,Wed,1


In [183]:
print (surgdt_features['surgdt_DayOfWeek'].unique())
print (surgdt_features['surgdt_month'].unique())

['Fri' 'Sat' 'Mon' 'Tues' 'Wed' 'Thurs' 'Sun']
['Jul' 'Aug' 'Sep' 'Oct' 'Nov' 'Dec' 'Jan' 'Feb' 'Mar' 'Apr' 'May' 'Jun']


- recoding `surgdt_PartOfMonth` back to text for `pd.get_dummies()`

In [184]:
surgdt_features = surgdt_features.replace({'surgdt_PartOfMonth': {1: 'Beg',
                                                                  2: 'Mid',
                                                                  3: 'End'}})

In [185]:
print (surgdt_features['surgdt_PartOfMonth'].unique())

['Beg' 'Mid' 'End']


In [186]:
surgdt_features.head()

Unnamed: 0,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth
0,Jul,Fri,Beg
1,Jul,Sat,Beg
2,Jul,Mon,Beg
3,Jul,Tues,Beg
4,Jul,Wed,Beg


In [187]:
surgdt_features.shape

(42740, 3)

#### Applying `pd.get_dummies` to `surgdt_features`

In [188]:
surgdt_dummies = pd.get_dummies(surgdt_features.copy())

In [189]:
surgdt_dummies.head()

Unnamed: 0,surgdt_month_Apr,surgdt_month_Aug,surgdt_month_Dec,surgdt_month_Feb,surgdt_month_Jan,surgdt_month_Jul,surgdt_month_Jun,surgdt_month_Mar,surgdt_month_May,surgdt_month_Nov,surgdt_month_Oct,surgdt_month_Sep,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Wed,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,surgdt_PartOfMonth_Mid
0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0


In [190]:
surgdt_dummies.shape

(42740, 22)

- need to drop reference classes for `surgdt_month`, `surgdt_DayOfWeek` and `surgdt_PartOfMonth`
- can pick any month, day of week or Part of Month as the reference class
- chose to pick the middle month (June), day of week (Wed) and Part of Month (Mid) given a working hypothesis that most action is around the beginning and end of time periods

In [191]:
drop_cols = ['surgdt_month_Jun', 'surgdt_DayOfWeek_Wed', 'surgdt_PartOfMonth_Mid']

In [192]:
len(drop_cols)

3

In [193]:
surgdt_dummies = surgdt_dummies.drop(drop_cols, axis=1)

In [194]:
surgdt_dummies.head()

Unnamed: 0,surgdt_month_Apr,surgdt_month_Aug,surgdt_month_Dec,surgdt_month_Feb,surgdt_month_Jan,surgdt_month_Jul,surgdt_month_Mar,surgdt_month_May,surgdt_month_Nov,surgdt_month_Oct,surgdt_month_Sep,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Tues,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End
0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
2,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0


In [195]:
surgdt_dummies.shape

(42740, 19)

- reordering columns for readability

In [196]:
new_col_order = ['surgdt_month_Jan',
                 'surgdt_month_Feb',
                 'surgdt_month_Mar',
                 'surgdt_month_Apr',
                 'surgdt_month_May',
                 'surgdt_month_Jul',
                 'surgdt_month_Aug',
                 'surgdt_month_Sep',
                 'surgdt_month_Oct',
                 'surgdt_month_Nov',
                 'surgdt_month_Dec',

                 'surgdt_DayOfWeek_Mon',
                 'surgdt_DayOfWeek_Tues',
                 'surgdt_DayOfWeek_Thurs',
                 'surgdt_DayOfWeek_Fri',
                 'surgdt_DayOfWeek_Sat',
                 'surgdt_DayOfWeek_Sun',

                 'surgdt_PartOfMonth_Beg',
                 'surgdt_PartOfMonth_End']

In [197]:
surgdt_dummies = surgdt_dummies[new_col_order]

In [198]:
surgdt_dummies.head()

Unnamed: 0,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0
2,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0
3,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0


In [199]:
surgdt_dummies.shape

(42740, 19)

### Numerical Features
- creating `numerical_feature_df`

In [200]:
numerical_features_df = pre_op_data.copy()[num_features]

In [201]:
numerical_features_df.head()

Unnamed: 0,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
0,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0
1,65,175.3,79.4,45.0,1.2,,,,55.0,40.0
2,83,162.60001,102.1,29.0,1.2,3.3,6.2,8.6,60.0,36.0
3,59,160.0,127.5,35.0,0.9,3.5,7.4,6.4,60.0,35.0
4,72,160.0,64.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0


In [202]:
numerical_features_df.shape

(42740, 10)

- creating `bmi` numerical feature
- BMI is weight in kilograms (`weightkg`) divided by height in meters squared `(heightcm/100)^2)`

In [203]:
numerical_features_df['bmi'] = numerical_features_df['weightkg'] / np.power((numerical_features_df['heightcm']/100), 
                                                                            2)

In [204]:
numerical_features_df.head()

Unnamed: 0,age,heightcm,weightkg,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,bmi
0,54,180.0,117.0,43.0,0.9,3.8,7.2,6.5,47.0,42.0,36.11111
1,65,175.3,79.4,45.0,1.2,,,,55.0,40.0,25.83787
2,83,162.60001,102.1,29.0,1.2,3.3,6.2,8.6,60.0,36.0,38.61754
3,59,160.0,127.5,35.0,0.9,3.5,7.4,6.4,60.0,35.0,49.80469
4,72,160.0,64.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,25.0


In [205]:
numerical_features_df.shape

(42740, 11)

- reordering columns for readability

In [206]:
new_col_order = ['age',
                 'heightcm',
                 'weightkg',
                 'bmi',
                 'hct',
                 'creatlst',
                 'totalbumin',
                 'a1clvl',
                 'meldscr',
                 'hdef',
                 'pasys']

In [207]:
numerical_features_df = numerical_features_df[new_col_order]

In [208]:
numerical_features_df.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0
2,83,162.60001,102.1,38.61754,29.0,1.2,3.3,6.2,8.6,60.0,36.0
3,59,160.0,127.5,49.80469,35.0,0.9,3.5,7.4,6.4,60.0,35.0
4,72,160.0,64.0,25.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0


In [209]:
numerical_features_df.shape

(42740, 11)

In [210]:
numerical_features_df.shape

(42740, 11)

### `outcome_other`
- creating `outcome_other_df`

In [211]:
outcome_other_df = pre_op_data.copy()[outcome_other]

In [212]:
outcome_other_df.head()

Unnamed: 0,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin
0,1,0.014,2,2,2,0
1,2,0.017,2,2,1,0
2,3,0.045,2,2,1,0
3,4,0.013,2,2,1,0
4,5,0.016,2,2,2,0


- creating `strokeBin2` which is a more inclusive definition of stroke that incorporates `cnstrokttia`

In [213]:
outcome_other_df['cnstrokttia'].unique()

array([2, 1])

- checking for `NaN`s

In [214]:
outcome_other_df['cnstrokttia'].isnull().sum()

0

- recoding `cnstrokttia`, where `1 == 1` and `2 == 0`

In [215]:
outcome_other_df['cnstrokttia'] = outcome_other_df['cnstrokttia'].replace({1: 1, 2: 0})

- validating

In [216]:
outcome_other_df.head()

Unnamed: 0,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin
0,1,0.014,2,0,2,0
1,2,0.017,2,0,1,0
2,3,0.045,2,0,1,0
3,4,0.013,2,0,1,0
4,5,0.016,2,0,2,0


In [217]:
outcome_other_df['cnstrokttia'].unique()

array([0, 1])

In [218]:
outcome_other_df['cnstrokttia'].isnull().sum()

0

- doing some analysis on `strokeBin` and `cnstrokttia`

- stroke incidence rate

In [219]:
outcome_other_df['strokeBin'].sum()

617

In [220]:
outcome_other_df['strokeBin'].sum() / outcome_other_df.shape[0] * 100

1.4436125409452505

- ttia incidence rate

In [221]:
outcome_other_df['cnstrokttia'].sum()

116

In [222]:
outcome_other_df['cnstrokttia'].sum() / outcome_other_df.shape[0] * 100

0.271408516612073

#### How Often Do `strokeBin` and `cnstrokttia` Overlap?

In [223]:
outcome_other_df[(outcome_other_df['cnstrokttia'] == 1) & (outcome_other_df['strokeBin'] == 1)].shape

(5, 6)

In [224]:
outcome_other_df[(outcome_other_df['cnstrokttia'] == 0) & (outcome_other_df['strokeBin'] == 0)].shape

(42012, 6)

In [225]:
outcome_other_df[(outcome_other_df['cnstrokttia'] == 1) & (outcome_other_df['strokeBin'] == 0)].shape

(111, 6)

In [226]:
outcome_other_df[(outcome_other_df['cnstrokttia'] == 0) & (outcome_other_df['strokeBin'] == 1)].shape

(612, 6)

- `strokeBin2`

In [227]:
617 + 111

728

In [228]:
outcome_other_df['strokeBin2'] = outcome_other_df['strokeBin'] + outcome_other_df['cnstrokttia']

In [229]:
outcome_other_df['strokeBin2'].unique()

array([0, 1, 2])

In [230]:
outcome_other_df['strokeBin2'].value_counts()

0    42012
1      723
2        5
Name: strokeBin2, dtype: int64

- recoding cases where `strokeBin` and `cnstrokttia` are both equal to `1`

In [231]:
outcome_other_df['strokeBin2'] = outcome_other_df['strokeBin2'].replace({0: 0,
                                                                         1: 1, 
                                                                         2: 1})

- validating

In [232]:
outcome_other_df['strokeBin2'].unique()

array([0, 1])

In [233]:
outcome_other_df['strokeBin2'].value_counts()

0    42012
1      728
Name: strokeBin2, dtype: int64

- `strokeBin2` incidence rate

In [234]:
outcome_other_df['strokeBin2'].sum() / outcome_other_df.shape[0] * 100

1.7033224145999064

- final validation of `outcome_other_df`

In [235]:
outcome_other_df.head()

Unnamed: 0,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin,strokeBin2
0,1,0.014,2,0,2,0,0
1,2,0.017,2,0,1,0,0
2,3,0.045,2,0,1,0,0
3,4,0.013,2,0,1,0,0
4,5,0.016,2,0,2,0,0


In [236]:
outcome_other_df.shape

(42740, 7)

## Assembling the Pre-Op Data Set
- `numerical_features_df`

In [237]:
numerical_features_df.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0


In [238]:
numerical_features_df.shape

(42740, 11)

- `surgdt_features_df`

In [239]:
surgdt_dummies.head(2)

Unnamed: 0,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0


In [240]:
surgdt_dummies.shape

(42740, 19)

- `yes_no_unc_df`

In [241]:
yes_no_unc_df = pre_op_data.copy()[yes_no_unc]

In [242]:
yes_no_unc_df.head(2)

Unnamed: 0,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [243]:
print (yes_no_unc_df.shape)
print (len(yes_no_unc))

(42740, 36)
36


- `compress_to_two_df`

In [244]:
compress_to_two_df = pre_op_data.copy()[compress_to_two]

In [245]:
compress_to_two_df.head(2)

Unnamed: 0,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock,resusc,medasa,medaplt5days,medlipid
0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0


In [246]:
compress_to_two_df.shape

(42740, 12)

- need to rename `carshock` and `resusc` to `carshock24` and `resusc24`

In [247]:
compress_to_two_df = compress_to_two_df.rename(columns={'carshock': 'carshock24',
                                                        'resusc': 'resusc24'})

In [248]:
compress_to_two_df.head(2)

Unnamed: 0,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid
0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0


In [249]:
compress_to_two_df.shape

(42740, 12)

- `recode_D_Dummies`

In [250]:
recode_D_Dummies.head(2)

Unnamed: 0,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_STRENUOUS_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_ANGINA_AT_REST,classnyh_SLIGHT_LIMITATION,classnyh_MARKED_LIMITATION,classnyh_ANY_ACTIVITY,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE
0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0


In [251]:
recode_D_Dummies.shape

(42740, 25)

- `recode_D_P_Dummies`

In [252]:
recode_D_P_Dummies.head(2)

Unnamed: 0,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0


In [253]:
recode_D_P_Dummies.shape

(42740, 7)

- `outcome_other_df`

In [254]:
outcome_other_df.head(2)

Unnamed: 0,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin,strokeBin2
0,1,0.014,2,0,2,0,0
1,2,0.017,2,0,1,0,0


In [255]:
outcome_other_df.shape

(42740, 7)

## Concatenating Pre-Op Data Set Components
### - Dataset with Dummies where reference class is dropped

In [256]:
PREOP_dataset = pd.concat((numerical_features_df,
                           surgdt_dummies,
                           yes_no_unc_df,
                           compress_to_two_df,
                           recode_D_Dummies,
                           recode_D_P_Dummies,
                           outcome_other_df),
                           axis=1)

In [257]:
PREOP_dataset.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv_1_CORONARY,numdisv_2_CORONARIES,numdisv_3_CORONARIES,anginalclass_STRENUOUS_ACTIVITY,anginalclass_SLIGHT_LIMITATION_ACTIVITY,anginalclass_MARKED_LIMITATION_ACTIVITY,anginalclass_ANGINA_AT_REST,classnyh_SLIGHT_LIMITATION,classnyh_MARKED_LIMITATION,classnyh_ANY_ACTIVITY,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_80-99%,cvdstenlft_100%,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin,strokeBin2
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1.0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.014,2,0,2,0,0
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2,0.017,2,0,1,0,0


#### Versus 10-24 Dataset Added `+1` Net New Columns (10/24 had `116` columns)

In [258]:
PREOP_dataset.shape

(42740, 117)

- validating

In [259]:
print (numerical_features_df.shape, 
       surgdt_dummies.shape,
       yes_no_unc_df.shape,
       compress_to_two_df.shape,
       recode_D_Dummies.shape,
       recode_D_P_Dummies.shape,
       outcome_other_df.shape)

(42740, 11) (42740, 19) (42740, 36) (42740, 12) (42740, 25) (42740, 7) (42740, 7)


In [260]:
numerical_features_df.shape[1] + surgdt_dummies.shape[1] + yes_no_unc_df.shape[1] + compress_to_two_df.shape[1] + recode_D_Dummies.shape[1] + recode_D_P_Dummies.shape[1] + outcome_other_df.shape[1]

117

### - `sklearn` `Decision Trees` Dataset

In [261]:
PREOP_dataset_sklearn = pd.concat((numerical_features_df,
                           surgdt_features_sklearn,
                           yes_no_unc_df,
                           compress_to_two_df,
                           recode_D_sklearn,
                           recode_D_P_sklearn,
                           outcome_other_df),
                           axis=1)

In [262]:
PREOP_dataset_sklearn.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status,cvdcarsten,cvdstenrt,cvdstenlft,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin,strokeBin2
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,7,4,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1.0,3,3,0,4,2,0,0,0,0,0,1,0.014,2,0,2,0,0
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,7,5,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,3,4,0,3,2,0,2,0,0,0,2,0.017,2,0,1,0,0


#### Versus 10-24 TREE Dataset Net `-1` Columns (10/24 had `80` columns)

In [263]:
PREOP_dataset_sklearn.shape

(42740, 79)

- validating

In [264]:
print (numerical_features_df.shape, 
       surgdt_features_sklearn.shape,
       yes_no_unc_df.shape,
       compress_to_two_df.shape,
       recode_D_sklearn.shape,
       recode_D_P_sklearn.shape,
       outcome_other_df.shape)

(42740, 11) (42740, 3) (42740, 36) (42740, 12) (42740, 7) (42740, 3) (42740, 7)


In [265]:
numerical_features_df.shape[1] + surgdt_features_sklearn.shape[1] + yes_no_unc_df.shape[1] + compress_to_two_df.shape[1] + recode_D_sklearn.shape[1] + recode_D_P_sklearn.shape[1] + outcome_other_df.shape[1]

79

### Pickling Final Files

In [266]:
#PREOP_dataset.to_pickle('PREOP_dataset_10_27.pkl')

#### For `sklearn` `DecisionTrees`

In [267]:
#PREOP_dataset_sklearn.to_pickle('PREOP_dataset_TREE_10_27.pkl')