## Capstone Project

### Pre-Processing Notebook - `PRE-OP TREE DATASET` from `10/27/19`

#### Importing Libraries

In [1]:
%matplotlib inline

# general libraries
import re
import string
import sys
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing date libraries
import datetime as dt
import dateutil.parser as dparser

# scikit-learn libraries for preprocessing
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# scikit-learn libraries for constructing pipelines
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# scikit-learn libraries for clustering and dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.mixture import GaussianMixture

# scikit-learn libraries for evaluation
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# saving models
import pickle
from sklearn.externals import joblib

# setting pandas display options
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 10000)
pd.set_option('display.max_colwidth', 100)
pd.set_option('precision', 5)
pd.options.mode.chained_assignment = None



#### Directory/File Structure

In [2]:
sys.version

'3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) \n[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]'

In [3]:
print ('Running pandas version:', pd.__version__)
print ('Running numpy version:', np.__version__)
print ('Running sklearn version:', sklearn.__version__)

Running pandas version: 0.25.1
Running numpy version: 1.14.2
Running sklearn version: 0.21.3


In [4]:
os.getcwd()

'/Users/nate_velarde/Documents/UC_Berkeley/Courses/W210_Capstone/stroke_project/sandbox/notebooks'

In [5]:
os.chdir('../data')

In [6]:
sorted(os.listdir())

['.DS_Store',
 '273_vs_281_null_count_by_feature.csv',
 '273_vs_281_null_count_by_feature.xlsm',
 'Capstone - Complication list - complete.xlsx',
 'Capstone - STS risk factor list.xlsx',
 'Capstone_Fall_Shannon_Sept2019_request.csv',
 'PREOP_dataset_10_24.pkl',
 'PREOP_dataset_10_27.pkl',
 'PREOP_dataset_TREE_10_24.pkl',
 'PREOP_dataset_TREE_10_27.pkl',
 'X_A_DREF.pkl',
 'X_A_DREF_TREE_SKLEARN.pkl',
 'X_PREOP_10_24.pkl',
 'X_PREOP_10_27.pkl',
 'X_PREOP_TREE_10_24.pkl',
 'X_dev_A_DREF.pkl',
 'X_dev_A_DREF_TREE_SKLEARN.pkl',
 'X_dev_PREOP_10_24.pkl',
 'X_dev_PREOP_10_27.pkl',
 'X_dev_PREOP_TREE_10_24.pkl',
 'X_dev_PREOP_TREE_UNPROC_10_27.pkl',
 'X_dev_PREOP_UNPROC_10_24.pkl',
 'X_dev_PREOP_UNPROC_10_27.pkl',
 'X_test_A_DREF.pkl',
 'X_test_A_DREF_TREE_SKLEARN.pkl',
 'X_test_PREOP_10_24.pkl',
 'X_test_PREOP_10_27.pkl',
 'X_test_PREOP_TREE_10_24.pkl',
 'X_test_PREOP_TREE_UNPROC_10_27.pkl',
 'X_test_PREOP_UNPROC_10_24.pkl',
 'X_test_PREOP_UNPROC_10_27.pkl',
 'X_train_A_DREF.pkl',
 'X_train_A

### Loading Dataset

#### `COMBINED DATASET` from `10/27/19`

In [7]:
data = pd.read_pickle('PREOP_dataset_TREE_10_27.pkl')

In [8]:
data.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status,cvdcarsten,cvdstenrt,cvdstenlft,recordId,predstro,cnstrokp,cnstrokttia,cncomaenceph,strokeBin,strokeBin2
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,7,4,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1.0,3,3,0,4,2,0,0,0,0,0,1,0.014,2,0,2,0,0
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,7,5,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,3,4,0,3,2,0,2,0,0,0,2,0.017,2,0,1,0,0


In [9]:
data.shape

(42740, 79)

### Dropping Irrelevant Columns

- `recordId`
- `predstro`
- `cnstrokp`
- `cnstrokttia`
- `cncomaenceph`
- `strokeBin` as we are going to use the more inclusive `strokeBin2` as our outcome variable

In [10]:
cols_to_drop = ['recordId',
                'predstro',
                'cnstrokp',
                'cnstrokttia',
                'cncomaenceph',
                'strokeBin']

In [11]:
len(cols_to_drop)

6

- dropping columns

In [12]:
data = data.drop(cols_to_drop, axis=1)

In [13]:
data.shape

(42740, 73)

- resetting `DataFrame` `index` 

In [14]:
data = data.reset_index(drop=True)

In [15]:
data.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status,cvdcarsten,cvdstenrt,cvdstenlft,strokeBin2
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,7,4,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1.0,3,3,0,4,2,0,0,0,0,0,0
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,7,5,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,3,4,0,3,2,0,2,0,0,0,0
2,83,162.60001,102.1,38.61754,29.0,1.2,3.3,6.2,8.6,60.0,36.0,7,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1.0,3,0,0,3,3,0,1,0,0,0,0
3,59,160.0,127.5,49.80469,35.0,0.9,3.5,7.4,6.4,60.0,35.0,7,1,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1,0,0,4,2,0,0,0,0,0,0
4,72,160.0,64.0,25.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,7,2,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,3,4,0,0,0,0,1,0,0,0,0


In [16]:
data.tail()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status,cvdcarsten,cvdstenrt,cvdstenlft,strokeBin2
42735,62,182.89999,98.3,29.38503,43.3,0.9,4.1,5.5,6.4,50.0,33.0,12,0,2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,3,3,0,0,1,0,1,0,0,0,0
42736,82,165.10001,74.9,27.47816,31.3,1.59,4.0,7.8,10.85,30.0,,12,1,2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,3,0,0,2,1,0,0,0,0,0,0
42737,66,175.3,75.3,24.50367,46.2,0.83,3.8,5.3,7.47,58.0,,12,2,3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,2,3,0,0,0,0,1,0,0,0,0
42738,62,165.10001,107.5,39.43794,46.1,0.77,3.8,5.3,7.47,55.0,,12,4,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,3,2,0,1,0,0,1,0,0,0,0
42739,78,180.3,87.5,26.91638,40.4,1.14,3.8,5.8,8.73,50.0,33.0,12,4,3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,3,3,0,2,2,0,0,0,0,0,1


In [17]:
data.shape

(42740, 73)

### Feature Matrix `X`
- in the next cell you can select which outcome vector you want to use: `strokeBin` or `strokeBin2`

In [18]:
X = data.copy().drop('strokeBin2', axis=1)

In [19]:
X = X.reset_index(drop=True)

In [20]:
X.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status,cvdcarsten,cvdstenrt,cvdstenlft
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,7,4,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1.0,3,3,0,4,2,0,0,0,0,0
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,7,5,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,3,4,0,3,2,0,2,0,0,0


In [21]:
X.shape

(42740, 72)

### Target Vector `y`

In [22]:
y = data.copy()['strokeBin2']

In [23]:
type(y)

pandas.core.series.Series

In [24]:
y.head(2)

0    0
1    0
Name: strokeBin2, dtype: int64

In [25]:
y.shape

(42740,)

In [26]:
y.unique()

array([0, 1])

In [27]:
y.value_counts()

0    42012
1      728
Name: strokeBin2, dtype: int64

### `X_train`, `X_devtest`, `y_train`, `y_devtest`
- using `train_test_split` with `stratify` parameter to ensure relative proportion of outcome classes are the same in `train`, `dev` and `test` sets
- observation split will be `80/10/10` between `train`, `dev`, `test`

In [28]:
X_train, X_devtest, y_train, y_devtest = train_test_split(X,
                                                          y,
                                                          test_size=0.2,
                                                          random_state=0,
                                                          stratify=y)

#### validating `train_test_split`

In [29]:
X.shape

(42740, 72)

In [30]:
np.rint(X.shape[0] * 0.20)

8548.0

In [31]:
X_train.shape, X_devtest.shape, y_train.shape, y_devtest.shape

((34192, 72), (8548, 72), (34192,), (8548,))

In [32]:
X.shape[0] - X_train.shape[0] - X_devtest.shape[0]

0

In [33]:
y.shape[0] - y_train.shape[0] - y_devtest.shape[0]

0

#### validating `stratify` worked
- relative proportion of classes in `y`

In [34]:
print (np.round(y.value_counts()[0] / y.shape[0], 4))
print (np.round(y.value_counts()[1] / y.shape[0], 4))

0.983
0.017


- relative proportion of classes in `y_train`

In [35]:
print (np.round(y_train.value_counts()[0] / y_train.shape[0], 4))
print (np.round(y_train.value_counts()[1] / y_train.shape[0], 4))

0.983
0.017


- relative proportion of classes in `y_devtest`

In [36]:
print (np.round(y_devtest.value_counts()[0] / y_devtest.shape[0], 4))
print (np.round(y_devtest.value_counts()[1] / y_devtest.shape[0], 4))

0.9829
0.0171


### `X_dev`, `X_test`, `y_dev`, `y_test`

In [37]:
X_dev, X_test, y_dev, y_test = train_test_split(X_devtest,
                                                y_devtest,
                                                test_size=0.5,
                                                random_state=0,
                                                stratify=y_devtest)

- validating `train_test_split`

In [38]:
X_devtest.shape

(8548, 72)

In [39]:
np.rint(X_devtest.shape[0] * 0.50)

4274.0

In [40]:
X_dev.shape, X_test.shape, y_dev.shape, y_dev.shape

((4274, 72), (4274, 72), (4274,), (4274,))

In [41]:
X_devtest.shape[0] - X_dev.shape[0] - X_test.shape[0]

0

In [42]:
y_devtest.shape[0] - y_dev.shape[0] - y_test.shape[0]

0

#### validating `stratify` worked
- relative proportion of classes in `y`

In [43]:
print (np.round(y.value_counts()[0] / y.shape[0], 4))
print (np.round(y.value_counts()[1] / y.shape[0], 4))

0.983
0.017


- relative proportion of classes in `y_devtest`

In [44]:
print (np.round(y_devtest.value_counts()[0] / y_devtest.shape[0], 4))
print (np.round(y_devtest.value_counts()[1] / y_devtest.shape[0], 4))

0.9829
0.0171


- relative proportion of classes in `y_dev`

In [45]:
print (np.round(y_dev.value_counts()[0] / y_dev.shape[0], 4))
print (np.round(y_dev.value_counts()[1] / y_dev.shape[0], 4))

0.9829
0.0171


- relative proportion of classes in `y_test`

In [46]:
print (np.round(y_test.value_counts()[0] / y_test.shape[0], 4))
print (np.round(y_test.value_counts()[1] / y_test.shape[0], 4))

0.9829
0.0171


### Resetting Indicies

In [47]:
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

In [48]:
X_dev = X_dev.reset_index(drop=True)
y_dev = y_dev.reset_index(drop=True)

In [49]:
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

### Saving Unprocessed `X_train`, `y_train`, `X_dev`, `y_dev`, `X_test`, `y_test`
- can be used in notebooks than incorporate `sklearn` `Pipelines` to more efficiently pre-process data than this notebook

- `train`

In [50]:
#X_train.to_pickle('X_train_PREOP_TREE_UNPROC_10_27.pkl')

In [51]:
#y_train.to_pickle('y_train_PREOP_TREE_UNPROC_10_27.pkl')

- `dev`

In [52]:
#X_dev.to_pickle('X_dev_PREOP_TREE_UNPROC_10_27.pkl')

In [53]:
#y_dev.to_pickle('y_dev_PREOP_TREE_UNPROC_10_27.pkl')

- `test`

In [54]:
#X_test.to_pickle('X_test_PREOP_TREE_UNPROC_10_27.pkl')

In [55]:
#y_test.to_pickle('y_test_PREOP_TREE_UNPROC_10_27.pkl')

### Preprocessing Numerical Features
- need to replace `NaN`s with `train` medians, so as not to leak `dev` or `test` information to `train` set
- `StandardScaler()` on numeric features

In [56]:
X_train.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status,cvdcarsten,cvdstenrt,cvdstenlft
0,43,172.7,96.2,32.25451,24.4,0.81,2.6,4.1,15.42,65.0,71.0,10,2,2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,1.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,3,1,1,0,0,0,0,0
1,78,162.5,69.9,26.47101,38.2,1.01,,6.7,,57.0,41.0,10,0,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0.0,0.0,0.0,0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,3,4,2,2,1,0,0,0,0,0
2,64,188.0,121.5,34.37641,42.0,0.9,3.6,6.8,6.4,60.0,,10,1,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,2,2,0,0,0,0,0,0,0,0
3,71,168.0,85.0,30.11621,42.0,0.9,,5.3,,55.0,30.0,11,3,3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,3,2,0,2,2,0,0,0,0,0
4,58,160.0,93.4,36.48438,27.0,0.8,,6.9,,60.0,29.9,6,4,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,3,3,0,1,1,0,1,2,0,2


In [57]:
numerical_features = ['age',
                      'heightcm',
                      'weightkg',
                      'bmi',
                      'hct',
                      'creatlst',
                      'totalbumin',
                      'a1clvl',
                      'meldscr',
                      'hdef',
                      'pasys']

In [58]:
len(numerical_features)

11

#### Need to split `X_train` into `X_train_numeric` and `X_train_categorical`
- seems like a hack, but it will make using `StandardScaler()` on `X_dev` and `X_test` much easier

In [59]:
X_train_numeric = X_train[numerical_features]

In [60]:
X_train_categorical = X_train.drop(numerical_features, axis=1)

In [61]:
X_train.shape, X_train_numeric.shape, X_train_categorical.shape

((34192, 72), (34192, 11), (34192, 61))

#### Need to replace `X_train_numeric` features `NaN`s with their respective medians and store the medians for `X_dev` and `X_test`
- first step: determining which features have `NaN`s

In [62]:
for column in X_train_numeric.columns:
    print('{0:20} - {1:10d}'.format(column, X_train_numeric[column].isnull().sum()))
    
print('\r')
print('{0:20} - {1:10d}'.format('Total Row Count', X_train_numeric.shape[0]))

age                  -          0
heightcm             -          0
weightkg             -          0
bmi                  -          0
hct                  -         24
creatlst             -         36
totalbumin           -       3098
a1clvl               -       3717
meldscr              -       4570
hdef                 -        422
pasys                -      15772

Total Row Count      -      34192


- now calculating the respective medians of the features

In [63]:
for column in X_train_numeric.columns:
    print('{0:20} - {1:10f}'.format(column, X_train_numeric[column].median()))

age                  -  67.000000
heightcm             - 172.700000
weightkg             -  87.199997
bmi                  -  29.404575
hct                  -  39.100000
creatlst             -   1.000000
totalbumin           -   3.800000
a1clvl               -   5.900000
meldscr              -   7.500000
hdef                 -  55.000000
pasys                -  35.000000


- checking rows where `NaN`s

In [64]:
X_train_numeric.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
0,43,172.7,96.2,32.25451,24.4,0.81,2.6,4.1,15.42,65.0,71.0
1,78,162.5,69.9,26.47101,38.2,1.01,,6.7,,57.0,41.0
2,64,188.0,121.5,34.37641,42.0,0.9,3.6,6.8,6.4,60.0,
3,71,168.0,85.0,30.11621,42.0,0.9,,5.3,,55.0,30.0
4,58,160.0,93.4,36.48438,27.0,0.8,,6.9,,60.0,29.9


- spotchecking rows with `NaN`s for validation purposes later

In [65]:
X_train_numeric[X_train_numeric['hct'].isnull()].head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
146,79,66.0,163.0,374.19651,,,,,,40.0,
285,59,173.0,147.7,49.35013,,,,,,,
1531,59,172.0,106.0,35.83018,,1.8,,11.4,,45.0,33.6
2016,71,165.10001,99.8,36.61309,,,,,,62.5,24.0
3261,52,178.0,87.0,27.45865,,1.3,,,,50.0,


In [66]:
X_train_numeric[X_train_numeric['hdef'].isnull()].head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
285,59,173.0,147.7,49.35013,,,,,,,
359,83,177.8,90.7,28.69087,36.0,1.0,,6.2,,,
437,78,147.3,63.5,29.26633,31.5,0.93,,7.4,,,
557,72,163.0,65.4,24.61515,41.8,0.8,3.9,7.6,6.4,,
586,73,168.0,94.0,33.30499,43.6,0.87,4.1,7.5,6.4,,


In [67]:
validation_rows = [146, 285]

In [68]:
X_train_numeric.iloc[validation_rows, :]

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
146,79,66.0,163.0,374.19651,,,,,,40.0,
285,59,173.0,147.7,49.35013,,,,,,,


- post-cleanup, the `NaN`s above should be replaced with the respective sector median
- `DataFrame` should not have any `NaN`s

#### Creating a `list` of `X_train_numeric` feature medians

In [69]:
train_numeric_medians = []

for column in X_train_numeric.columns.tolist():
    train_numeric_medians.append(X_train_numeric[column].median())

In [70]:
train_numeric_median_df = pd.DataFrame(list(zip(X_train_numeric.columns.tolist(), train_numeric_medians)),
                                       columns=['numeric_feature', 'median'])

In [71]:
train_numeric_median_df

Unnamed: 0,numeric_feature,median
0,age,67.0
1,heightcm,172.7
2,weightkg,87.2
3,bmi,29.40457
4,hct,39.1
5,creatlst,1.0
6,totalbumin,3.8
7,a1clvl,5.9
8,meldscr,7.5
9,hdef,55.0


### Saving `train_numeric_median_df` for later use in `Pipelines` in Future Notebooks

In [72]:
#train_numeric_median_df.to_pickle('train_numeric_medians_10_24.pkl')

#### Now create a function that replaces `NaN`s with corresponding `train` feature `median`

In [73]:
def null_replace_median(df, medians_list):
    
    for column, median in list(zip(df.columns.tolist(), medians_list)):
            df[column] = df[column].replace({np.nan: median})
    
    return df

- applying `null_replace_median` to `X_train_numeric`

In [74]:
X_train_numeric = null_replace_median(X_train_numeric, train_numeric_medians)

- checking if all of the `NaN`s are gone

In [75]:
X_train_numeric.isnull().sum()

age           0
heightcm      0
weightkg      0
bmi           0
hct           0
creatlst      0
totalbumin    0
a1clvl        0
meldscr       0
hdef          0
pasys         0
dtype: int64

- checking to see if `NaN`s replaced with correct `median`

In [76]:
X_train_numeric.iloc[validation_rows, :]

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
146,79,66.0,163.0,374.19651,39.1,1.0,3.8,5.9,7.5,40.0,35.0
285,59,173.0,147.7,49.35013,39.1,1.0,3.8,5.9,7.5,55.0,35.0


In [77]:
train_numeric_median_df.iloc[4:, ]

Unnamed: 0,numeric_feature,median
4,hct,39.1
5,creatlst,1.0
6,totalbumin,3.8
7,a1clvl,5.9
8,meldscr,7.5
9,hdef,55.0
10,pasys,35.0


### Using the same code pattern to replace `X_dev_numeric` `NaN`s with `X_train_numeric` `medians`
- creating `X_dev_numeric`

In [78]:
X_dev_numeric = X_dev[numerical_features]

In [79]:
X_dev_categorical = X_dev.drop(numerical_features, axis=1)

In [80]:
X_dev.shape, X_dev_numeric.shape, X_dev_categorical.shape

((4274, 72), (4274, 11), (4274, 61))

In [81]:
X_dev_numeric.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
0,51,170.2,92.9,32.06983,44.2,1.05,3.6,8.1,6.87,65.0,
1,66,175.0,105.0,34.28571,41.0,0.9,3.9,5.7,7.5,50.0,


- determining features with `NaN`s

In [82]:
for column in X_dev_numeric.columns:
    print('{0:20} - {1:10d}'.format(column, X_dev_numeric[column].isnull().sum()))
    
print('\r')
print('{0:20} - {1:10d}'.format('Total Row Count', X_dev_numeric.shape[0]))

age                  -          0
heightcm             -          0
weightkg             -          0
bmi                  -          0
hct                  -          2
creatlst             -          3
totalbumin           -        356
a1clvl               -        457
meldscr              -        527
hdef                 -         46
pasys                -       1962

Total Row Count      -       4274


- spotchecking rows with `NaN`s for validation purposes later

In [83]:
X_dev_numeric[(X_dev_numeric['hct'].isnull()) | (X_dev_numeric['creatlst'].isnull())]

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
639,51,160.0,71.2,27.8125,,0.55,3.8,9.3,6.4,55.0,21.0
743,54,162.60001,93.9,35.51603,,0.78,,6.3,,61.0,41.0
1443,63,158.0,130.0,52.07499,29.0,,3.5,5.9,,,
1527,48,170.0,82.0,28.3737,37.0,,,,,30.0,44.0
3840,68,180.3,99.8,30.70005,46.8,,,,,64.0,


In [84]:
X_dev_numeric[X_dev_numeric['hdef'].isnull()].head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
76,63,163.0,79.5,29.92209,42.0,1.29,4.7,6.1,8.84,,
104,56,180.3,88.5,27.22399,42.1,0.7,4.3,5.5,,,
293,55,188.0,111.0,31.40561,40.0,1.2,4.5,6.4,,,
431,83,180.0,77.0,23.76543,29.0,1.83,2.1,7.3,13.27,,
526,81,152.39999,43.9,18.90143,40.0,0.8,4.5,4.9,6.4,,40.0


In [85]:
validation_rows = [743, 1443, 1527, 3840]

In [86]:
X_dev_numeric.iloc[validation_rows, :]

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
743,54,162.60001,93.9,35.51603,,0.78,,6.3,,61.0,41.0
1443,63,158.0,130.0,52.07499,29.0,,3.5,5.9,,,
1527,48,170.0,82.0,28.3737,37.0,,,,,30.0,44.0
3840,68,180.3,99.8,30.70005,46.8,,,,,64.0,


- applying `null_replace_median` function to `X_dev_numeric`

In [87]:
X_dev_numeric = null_replace_median(X_dev_numeric, train_numeric_medians)

- checking if all of the `NaN`s are gone

In [88]:
X_dev_numeric.isnull().sum()

age           0
heightcm      0
weightkg      0
bmi           0
hct           0
creatlst      0
totalbumin    0
a1clvl        0
meldscr       0
hdef          0
pasys         0
dtype: int64

- checking to see if `NaN`s replaced with correct `median`

In [89]:
X_dev_numeric.iloc[validation_rows, :]

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
743,54,162.60001,93.9,35.51603,39.1,0.78,3.8,6.3,7.5,61.0,41.0
1443,63,158.0,130.0,52.07499,29.0,1.0,3.5,5.9,7.5,55.0,35.0
1527,48,170.0,82.0,28.3737,37.0,1.0,3.8,5.9,7.5,30.0,44.0
3840,68,180.3,99.8,30.70005,46.8,1.0,3.8,5.9,7.5,64.0,35.0


In [90]:
train_numeric_median_df.iloc[4:, ]

Unnamed: 0,numeric_feature,median
4,hct,39.1
5,creatlst,1.0
6,totalbumin,3.8
7,a1clvl,5.9
8,meldscr,7.5
9,hdef,55.0
10,pasys,35.0


### Using the same code pattern to replace `X_test_numeric` `NaN`s with `X_train_numeric` `medians`
- creating `X_test_numeric` and `X_test_categorical`

In [91]:
X_test_numeric = X_test[numerical_features]

In [92]:
X_test_categorical = X_test.drop(numerical_features, axis=1)

In [93]:
X_test.shape, X_test_numeric.shape, X_test_categorical.shape

((4274, 72), (4274, 11), (4274, 61))

- determining features with `NaN`s

In [94]:
for column in X_test_numeric.columns:
    print('{0:20} - {1:10d}'.format(column, X_test_numeric[column].isnull().sum()))
    
print('\r')
print('{0:20} - {1:10d}'.format('Total Row Count', X_test_numeric.shape[0]))

age                  -          0
heightcm             -          0
weightkg             -          0
bmi                  -          0
hct                  -          0
creatlst             -          6
totalbumin           -        393
a1clvl               -        444
meldscr              -        556
hdef                 -         62
pasys                -       1903

Total Row Count      -       4274


- spotchecking rows with `NaN`s for validation purposes later

In [95]:
X_test_numeric[(X_test_numeric['hdef'].isnull()) | (X_test_numeric['a1clvl'].isnull())].head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
2,76,190.5,77.1,21.24538,41.2,0.7,,5.4,,,
5,57,157.5,64.4,25.9612,39.0,0.7,4.4,,6.76,58.0,
6,52,177.8,52.2,16.51228,45.2,,,,,67.0,
33,54,173.0,87.0,29.0688,38.9,0.7,,,,,
48,70,170.2,85.8,29.61885,36.8,1.1,2.7,,7.31,50.0,36.2


In [96]:
X_test_numeric[X_test_numeric['hdef'].isnull()].head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
2,76,190.5,77.1,21.24538,41.2,0.7,,5.4,,,
33,54,173.0,87.0,29.0688,38.9,0.7,,,,,
116,62,170.0,59.0,20.41522,42.6,0.88,,,,,
128,81,190.0,98.0,27.14681,40.6,0.85,3.9,5.6,6.4,,
132,65,180.0,100.0,30.8642,44.0,1.03,3.5,5.3,6.68,,


In [97]:
X_test_numeric[X_test_numeric['hct'].isnull()].head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys


In [98]:
validation_rows = [2, 6]

In [99]:
X_test_numeric.iloc[validation_rows, :]

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
2,76,190.5,77.1,21.24538,41.2,0.7,,5.4,,,
6,52,177.8,52.2,16.51228,45.2,,,,,67.0,


- applying `null_replace_median` function to `X_test_numeric`

In [100]:
X_test_numeric = null_replace_median(X_test_numeric, train_numeric_medians)

- checking if all of the `NaN`s are gone

In [101]:
X_test_numeric.isnull().sum()

age           0
heightcm      0
weightkg      0
bmi           0
hct           0
creatlst      0
totalbumin    0
a1clvl        0
meldscr       0
hdef          0
pasys         0
dtype: int64

- checking to see if `NaN`s replaced with correct `median`

In [102]:
X_test_numeric.iloc[validation_rows, :]

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
2,76,190.5,77.1,21.24538,41.2,0.7,3.8,5.4,7.5,55.0,35.0
6,52,177.8,52.2,16.51228,45.2,1.0,3.8,5.9,7.5,67.0,35.0


In [103]:
train_numeric_median_df

Unnamed: 0,numeric_feature,median
0,age,67.0
1,heightcm,172.7
2,weightkg,87.2
3,bmi,29.40457
4,hct,39.1
5,creatlst,1.0
6,totalbumin,3.8
7,a1clvl,5.9
8,meldscr,7.5
9,hdef,55.0


### Applying `StandardScaler()` to `X_train_numeric`, `X_dev_numeric`, `X_test_numeric`
- `fit_transform` on `X_train_numeric`, `transform` on `X_dev_numeric`, `X_test_numeric`

In [104]:
X_train_numeric.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
0,43,172.7,96.2,32.25451,24.4,0.81,2.6,4.1,15.42,65.0,71.0
1,78,162.5,69.9,26.47101,38.2,1.01,3.8,6.7,7.5,57.0,41.0
2,64,188.0,121.5,34.37641,42.0,0.9,3.6,6.8,6.4,60.0,35.0
3,71,168.0,85.0,30.11621,42.0,0.9,3.8,5.3,7.5,55.0,30.0
4,58,160.0,93.4,36.48438,27.0,0.8,3.8,6.9,7.5,60.0,29.9


In [105]:
X_train.shape

(34192, 72)

- creating copy of `X_train_numeric` for validation purposes

In [106]:
train_ss_df = X_train_numeric.copy()

In [107]:
train_ss_df['age'][0], train_ss_df['age'].mean(), train_ss_df['age'].std()

(43, 66.18679808142255, 10.913798229791013)

- creating `scaler`

In [108]:
scaler = StandardScaler()

- fitting and transforming `X_train_numeric`

In [109]:
X_train_numeric = scaler.fit_transform(X_train_numeric)

In [110]:
type(X_train_numeric)

numpy.ndarray

- `StandardScaler()` returns a `numpy.ndarry` so we will turn it back to a `DataFrame`

In [111]:
X_train_numeric = pd.DataFrame(X_train_numeric,
                               columns=numerical_features)

In [112]:
X_train_numeric.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
0,-2.12457,0.11594,0.32942,0.157,-2.65268,-0.35221,-2.27155,-1.62927,2.47597,0.98148,3.58585
1,1.08243,-0.82771,-0.95497,-0.39563,-0.10617,-0.13691,0.04597,0.25206,-0.29493,0.33103,0.49141
2,-0.20037,1.5314,1.56497,0.35976,0.59504,-0.25533,-0.34028,0.32442,-0.67978,0.57495,-0.12748
3,0.44103,-0.31888,-0.21754,-0.04732,0.59504,-0.25533,0.04597,-0.76096,-0.29493,0.16842,-0.64322
4,-0.75014,-1.059,0.19268,0.56118,-2.1729,-0.36298,0.04597,0.39678,-0.29493,0.57495,-0.65353


In [113]:
X_train_numeric.shape

(34192, 11)

- validating scaling

In [114]:
(train_ss_df['age'][0] - train_ss_df['age'].mean()) / train_ss_df['age'].std()

-2.124539742555471

#### Applying `StandardScaler` to `X_dev_numeric`
- Remember `transform` only

In [115]:
X_dev_numeric.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
0,51,170.2,92.9,32.06983,44.2,1.05,3.6,8.1,6.87,65.0,35.0
1,66,175.0,105.0,34.28571,41.0,0.9,3.9,5.7,7.5,50.0,35.0
2,63,173.0,105.7,35.31692,39.0,1.3,4.2,6.5,9.1,33.0,45.0
3,73,175.0,91.7,29.94286,38.5,0.8,3.5,5.2,6.4,58.0,30.0
4,70,182.89999,86.2,25.76795,42.5,1.2,3.3,5.9,11.95,48.0,35.0


In [116]:
X_dev_numeric.shape

(4274, 11)

- creating copy of `X_dev_numeric` for validation purposes

In [117]:
dev_ss_df = X_dev_numeric.copy()

In [118]:
dev_ss_df['age'][0], train_ss_df['age'].mean(), train_ss_df['age'].std()

(51, 66.18679808142255, 10.913798229791013)

- transforming `X_dev_numeric`

In [119]:
X_dev_numeric = scaler.transform(X_dev_numeric)

In [120]:
X_dev_numeric = pd.DataFrame(X_dev_numeric,
                             columns=numerical_features)

In [121]:
X_dev_numeric.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
0,-1.39154,-0.11535,0.16826,0.13936,1.00101,-0.09385,-0.34028,1.26509,-0.51535,0.98148,-0.12748
1,-0.01712,0.32872,0.75918,0.35109,0.41051,-0.25533,0.2391,-0.47153,-0.29493,-0.23811,-0.12748
2,-0.292,0.14369,0.79336,0.44963,0.04145,0.17527,0.81848,0.10734,0.26484,-1.62031,0.904
3,0.62428,0.32872,0.10966,-0.06388,-0.05081,-0.36298,-0.53341,-0.83332,-0.67978,0.41234,-0.64322
4,0.3494,1.05958,-0.15894,-0.46281,0.68731,0.06762,-0.91966,-0.32681,1.26195,-0.40072,-0.12748


In [122]:
X_dev_numeric.shape

(4274, 11)

- validating scaling

In [123]:
(dev_ss_df['age'][0] - train_ss_df['age'].mean()) / train_ss_df['age'].std()

-1.3915227092954385

#### Applying `StandardScaler` to `X_test_numeric`
- Remember `transform` only

In [124]:
X_test_numeric.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
0,81,170.0,97.7,33.80623,37.0,0.9,4.6,6.2,6.4,60.0,38.0
1,65,180.3,77.1,23.71717,44.2,1.02,3.8,5.2,7.5,60.0,29.3
2,76,190.5,77.1,21.24538,41.2,0.7,3.8,5.4,7.5,55.0,35.0
3,80,183.0,114.0,34.04103,37.0,0.9,3.7,5.8,7.5,75.0,33.0
4,70,190.0,108.0,29.9169,45.0,1.2,3.2,8.2,8.2,80.0,34.0


In [125]:
X_test_numeric.shape

(4274, 11)

- creating copy of `X_test_numeric` for validation purposes

In [126]:
test_ss_df = X_test_numeric.copy()

In [127]:
test_ss_df['age'][0], train_ss_df['age'].mean(), train_ss_df['age'].std()

(81, 66.18679808142255, 10.913798229791013)

- transforming `X_test_numeric`

In [128]:
X_test_numeric = scaler.transform(X_test_numeric)

In [129]:
X_test_numeric = pd.DataFrame(X_test_numeric,
                              columns=numerical_features)

In [130]:
X_test_numeric.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
0,1.35731,-0.13385,0.40267,0.30528,-0.32761,-0.25533,1.59099,-0.10973,-0.67978,0.57495,0.18197
1,-0.10874,0.81904,-0.60335,-0.65877,1.00101,-0.12615,0.04597,-0.83332,-0.29493,0.57495,-0.71542
2,0.89917,1.76269,-0.60335,-0.89496,0.44742,-0.47063,0.04597,-0.6886,-0.29493,0.16842,-0.12748
3,1.26568,1.06883,1.1987,0.32771,-0.32761,-0.25533,-0.14715,-0.39917,-0.29493,1.79453,-0.33377
4,0.3494,1.71643,0.90568,-0.06636,1.14863,0.06762,-1.11279,1.33745,-0.05003,2.20106,-0.23063


In [131]:
X_test_numeric.shape

(4274, 11)

- validating scaling

In [132]:
(test_ss_df['age'][0] - train_ss_df['age'].mean()) / train_ss_df['age'].std()

1.3572911654296824

### Re-Assembling Feature Matricies

#### `X_train` = `X_train_numeric` + `X_train_categorical`

In [133]:
X_train_numeric.shape, X_train_categorical.shape

((34192, 11), (34192, 61))

In [134]:
X_train = pd.concat([X_train_numeric, 
                    X_train_categorical],
                    axis=1)

In [135]:
X_train.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status,cvdcarsten,cvdstenrt,cvdstenlft
0,-2.12457,0.11594,0.32942,0.157,-2.65268,-0.35221,-2.27155,-1.62927,2.47597,0.98148,3.58585,10,2,2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,1.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,3,1,1,0,0,0,0,0
1,1.08243,-0.82771,-0.95497,-0.39563,-0.10617,-0.13691,0.04597,0.25206,-0.29493,0.33103,0.49141,10,0,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0.0,0.0,0.0,0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,3,4,2,2,1,0,0,0,0,0
2,-0.20037,1.5314,1.56497,0.35976,0.59504,-0.25533,-0.34028,0.32442,-0.67978,0.57495,-0.12748,10,1,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,2,2,0,0,0,0,0,0,0,0
3,0.44103,-0.31888,-0.21754,-0.04732,0.59504,-0.25533,0.04597,-0.76096,-0.29493,0.16842,-0.64322,11,3,3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,3,2,0,2,2,0,0,0,0,0
4,-0.75014,-1.059,0.19268,0.56118,-2.1729,-0.36298,0.04597,0.39678,-0.29493,0.57495,-0.65353,6,4,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,3,3,0,1,1,0,1,2,0,2


In [136]:
X_train.shape

(34192, 72)

#### `X_dev` = `X_dev_numeric` + `X_dev_categorical`

In [137]:
X_dev_numeric.shape, X_dev_categorical.shape

((4274, 11), (4274, 61))

In [138]:
X_dev = pd.concat([X_dev_numeric, 
                   X_dev_categorical],
                  axis=1)

In [139]:
X_dev.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status,cvdcarsten,cvdstenrt,cvdstenlft
0,-1.39154,-0.11535,0.16826,0.13936,1.00101,-0.09385,-0.34028,1.26509,-0.51535,0.98148,-0.12748,8,1,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,3,4,0,1,1,0,1,0,0,0
1,-0.01712,0.32872,0.75918,0.35109,0.41051,-0.25533,0.2391,-0.47153,-0.29493,-0.23811,-0.12748,1,2,2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,3,4,0,1,1,0,1,0,0,0
2,-0.292,0.14369,0.79336,0.44963,0.04145,0.17527,0.81848,0.10734,0.26484,-1.62031,0.904,5,0,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,1.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,2,0,3,4,3,0,0,0,0,0
3,0.62428,0.32872,0.10966,-0.06388,-0.05081,-0.36298,-0.53341,-0.83332,-0.67978,0.41234,-0.64322,9,3,3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,3,4,0,1,1,0,1,1,0,0
4,0.3494,1.05958,-0.15894,-0.46281,0.68731,0.06762,-0.91966,-0.32681,1.26195,-0.40072,-0.12748,3,5,2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,3,3,0,3,1,0,1,0,0,0


In [140]:
X_dev.shape

(4274, 72)

#### `X_test` = `X_test_numeric` + `X_test_categorical`

In [141]:
X_test_numeric.shape, X_test_categorical.shape

((4274, 11), (4274, 61))

In [142]:
X_test = pd.concat([X_test_numeric, 
                    X_test_categorical],
                   axis=1)

In [143]:
X_test.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status,cvdcarsten,cvdstenrt,cvdstenlft
0,1.35731,-0.13385,0.40267,0.30528,-0.32761,-0.25533,1.59099,-0.10973,-0.67978,0.57495,0.18197,7,1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,2,0,0,0,0,0,0,0,0,0
1,-0.10874,0.81904,-0.60335,-0.65877,1.00101,-0.12615,0.04597,-0.83332,-0.29493,0.57495,-0.71542,7,1,3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,2,1,0,2,0,0,0,0,0,0
2,0.89917,1.76269,-0.60335,-0.89496,0.44742,-0.47063,0.04597,-0.6886,-0.29493,0.16842,-0.12748,6,1,3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,1.0,0.0,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,0,0,0,0,0,0,0,3,0,0
3,1.26568,1.06883,1.1987,0.32771,-0.32761,-0.25533,-0.14715,-0.39917,-0.29493,1.79453,-0.33377,8,3,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0,1,0,0,0,1,0,0,0,0
4,0.3494,1.71643,0.90568,-0.06636,1.14863,0.06762,-1.11279,1.33745,-0.05003,2.20106,-0.23063,6,4,3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,1,0,2,1,2,0,1,0,0,0


In [144]:
X_test.shape

(4274, 72)

- final validations

In [145]:
X.shape, y.shape

((42740, 72), (42740,))

In [146]:
X_train.shape, y_train.shape

((34192, 72), (34192,))

In [147]:
X_dev.shape, y_dev.shape

((4274, 72), (4274,))

In [148]:
X_test.shape, y_test.shape

((4274, 72), (4274,))

In [149]:
X.shape[0] - X_train.shape[0] - X_dev.shape[0] - X_test.shape[0]

0

In [150]:
y.shape[0] - y_train.shape[0] - y_dev.shape[0] - y_test.shape[0]

0

### Pickle `DataFrames` for Use in Feature Selection and Modeling Notebooks

#### Pre-split `X` and `y`

In [151]:
X.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status,cvdcarsten,cvdstenrt,cvdstenlft
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,7,4,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1.0,3,3,0,4,2,0,0,0,0,0
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,7,5,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,3,4,0,3,2,0,2,0,0,0
2,83,162.60001,102.1,38.61754,29.0,1.2,3.3,6.2,8.6,60.0,36.0,7,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1.0,3,0,0,3,3,0,1,0,0,0
3,59,160.0,127.5,49.80469,35.0,0.9,3.5,7.4,6.4,60.0,35.0,7,1,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1,0,0,4,2,0,0,0,0,0
4,72,160.0,64.0,25.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,7,2,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,3,4,0,0,0,0,1,0,0,0


#### Note - numerical features in `X` still have `NaN`s at this point and is Unscaled

In [152]:
#X.to_pickle('X_PREOP_TREE_10_27.pkl')

In [153]:
#y.to_pickle('y_PREOP_TREE_10_27.pkl')

#### `train`

In [154]:
#X_train.to_pickle('X_train_PREOP_TREE_10_27.pkl')

In [155]:
#y_train.to_pickle('y_train_PREOP_TREE_10_27.pkl')

#### `dev`

In [156]:
#X_dev.to_pickle('X_dev_PREOP_TREE_10_27.pkl')

In [157]:
#y_dev.to_pickle('y_dev_PREOP_TREE_10_27.pkl')

#### `test`

In [158]:
#X_test.to_pickle('X_test_PREOP_TREE_10_27.pkl')

In [159]:
#y_test.to_pickle('y_test_PREOP_TREE_10_27.pkl')