## Capstone Project

### Feature Selection

#### Importing Libraries

In [1]:
%matplotlib inline

# general libraries
import re
import string
import sys
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing date libraries
import datetime as dt
import dateutil.parser as dparser

# scikit-learn libraries for preprocessing
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# scikit-learn libraries for constructing pipelines
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# scikit-learn libraries for clustering and dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.mixture import GaussianMixture

# scikit-learn libraries for evaluation
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# scikit-learn libraries for feature selection
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import RFECV

# scikit-learn libraries for learning
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# saving models
import pickle
from sklearn.externals import joblib

# setting pandas display options
# Option keys are spelled out in full: set_option resolves keys by pattern match,
# and the bare 'precision' pattern is ambiguous on newer pandas releases (which
# define more than one *precision option) and raises OptionError there.
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 10000)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.precision', 5)
# silence SettingWithCopyWarning for the column assignments made later in the notebook
pd.options.mode.chained_assignment = None

#### Directory/File Structure

In [2]:
sys.version

'3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) \n[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]'

In [3]:
os.getcwd()

'/Users/nate_velarde/Documents/UC_Berkeley/Courses/W210_Capstone/stroke_project/sandbox/notebooks'

In [4]:
# switches the process-wide working directory; the later cells that list and
# load files by bare file name rely on this having been run
os.chdir('../data')

In [5]:
sorted(os.listdir())

['.DS_Store',
 '273_vs_281_null_count_by_feature.csv',
 '273_vs_281_null_count_by_feature.xlsm',
 'Capstone - Complication list - complete.xlsx',
 'Capstone - STS risk factor list.xlsx',
 'Capstone_Fall_Shannon_Sept2019_request.csv',
 'capstone_STS_risk_factor_features.xlsx',
 'capstone_cleaned_data.csv',
 'capstone_data-version-2.xlsx',
 'capstone_data.xlsx',
 'capstone_data_binarized_outcome.pkl',
 'capstone_data_binarized_outcome.xlsx',
 'capstone_data_binarized_outcome_compressed.pkl',
 'capstone_data_filled_in_complication_data.xlsx',
 'capstone_data_key_variable_nulls_cleaned.pkl',
 'capstone_data_key_variable_nulls_cleaned.xlsx',
 'capstone_data_key_variable_nulls_cleaned_REF.pkl',
 'capstone_data_key_variable_nulls_cleaned_compressed.pkl',
 'pre_op_features.pkl',
 'pre_op_features_A.pkl',
 'pre_op_features_A_DREF.pkl',
 'pre_op_features_B.pkl',
 'pre_op_features_B_DREF.pkl',
 'pre_op_features_DREF.pkl',
 'pre_op_features_tree.pkl',
 'pre_op_features_tree_A.pkl',
 'pre_op_featur

#### Loading Dataset

In [6]:
# read the pickled DataFrame by bare file name, i.e. relative to the current working directory
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source
data = pd.read_pickle('pre_op_features_A_DREF.pkl')

In [7]:
data.head()

Unnamed: 0,recordId,surgdt,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,predstro,strokeBin,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,length_stay,dischdt_DayOfWeek_Mon,dischdt_DayOfWeek_Tues,dischdt_DayOfWeek_Thurs,dischdt_DayOfWeek_Fri,dischdt_DayOfWeek_Sat,dischdt_DayOfWeek_Sun,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,hitanti,TobaccoUse,cigsmoker,cigsmokercurr,chrlungd,prcvint,prcab,prvalve,chf,priorhf,medinotr,hdefd,vdaort,vdstena,vdstenm,hmo2,ivdrugab,alcohol,cvawhen,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,CardSympTimeOfAdm_ANGINA,CardSympTimeOfAdm_STEMI,CardSympTimeOfSurg_ANGINA,CardSympTimeOfSurg_STEMI,anginalclass_SLIGHT,anginalclass_REST,classnyh_SLIGHT,classnyh_REST,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvd,cva,cvdtia,cvdpcarsurg,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_50%-79%,cvdstenlft_80-99%,cvdstenlft_100%,Arrhythmia,arrhyafib,ArrhythAFlutter_REMOTE,ArrhythAFlutter_RECENT,ArrhythAFib_PAROXYSMAL,ArrhythAFib_CONTINOUS,ArrhythAFibDur_SHORT,ArrhythAFibDur_LONG,arrhythwhen_SHORT,arrhythwhen_LONG
0,1,2011-07-01,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0.014,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,5 days,0,0,0,0,0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0,1.0,1.0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,1,0
1,2,2011-07-02,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,0.017,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,7 days,0,0,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,1.0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0
2,3,2011-07-04,83,162.60001,102.1,38.61754,29.0,1.2,3.3,6.2,8.6,60.0,36.0,0.045,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,8 days,0,1,0,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1.0,1.0,1.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0
3,4,2011-07-05,59,160.0,127.5,49.80469,35.0,0.9,3.5,7.4,6.4,60.0,35.0,0.013,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,4 days,0,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,1,0
4,5,2011-07-06,72,160.0,64.0,25.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,0.016,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,4 days,0,0,0,0,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,1,0


In [8]:
data.shape

(19756, 135)

#### Preprocessing

- dropping columns that you don't need in this analysis
- `recordId`, `surgdt`, `predstro`

In [9]:
# columns excluded from this analysis
drop_cols = [
    'recordId',
    'surgdt',
    'predstro',
]

In [10]:
# remove the columns listed in `drop_cols`; errors='ignore' keeps this cell safe
# to re-run after the columns are already gone (a plain drop would raise KeyError
# on the second execution because `data` is reassigned in place)
data = data.drop(columns=drop_cols, errors='ignore')

In [11]:
data.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,strokeBin,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,length_stay,dischdt_DayOfWeek_Mon,dischdt_DayOfWeek_Tues,dischdt_DayOfWeek_Thurs,dischdt_DayOfWeek_Fri,dischdt_DayOfWeek_Sat,dischdt_DayOfWeek_Sun,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,hitanti,TobaccoUse,cigsmoker,cigsmokercurr,chrlungd,prcvint,prcab,prvalve,chf,priorhf,medinotr,hdefd,vdaort,vdstena,vdstenm,hmo2,ivdrugab,alcohol,cvawhen,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,CardSympTimeOfAdm_ANGINA,CardSympTimeOfAdm_STEMI,CardSympTimeOfSurg_ANGINA,CardSympTimeOfSurg_STEMI,anginalclass_SLIGHT,anginalclass_REST,classnyh_SLIGHT,classnyh_REST,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvd,cva,cvdtia,cvdpcarsurg,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_50%-79%,cvdstenlft_80-99%,cvdstenlft_100%,Arrhythmia,arrhyafib,ArrhythAFlutter_REMOTE,ArrhythAFlutter_RECENT,ArrhythAFib_PAROXYSMAL,ArrhythAFib_CONTINOUS,ArrhythAFibDur_SHORT,ArrhythAFibDur_LONG,arrhythwhen_SHORT,arrhythwhen_LONG
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,5 days,0,0,0,0,0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0,1.0,1.0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,1,0
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,7 days,0,0,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,1.0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0


In [12]:
data.shape

(19756, 132)

#### Creating `feature_matrix`

In [13]:
# every column except `strokeBin`; drop() already returns a new frame, so `data` is left untouched
feature_matrix = data.drop(columns=['strokeBin'])

In [14]:
feature_matrix.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,length_stay,dischdt_DayOfWeek_Mon,dischdt_DayOfWeek_Tues,dischdt_DayOfWeek_Thurs,dischdt_DayOfWeek_Fri,dischdt_DayOfWeek_Sat,dischdt_DayOfWeek_Sun,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,hitanti,TobaccoUse,cigsmoker,cigsmokercurr,chrlungd,prcvint,prcab,prvalve,chf,priorhf,medinotr,hdefd,vdaort,vdstena,vdstenm,hmo2,ivdrugab,alcohol,cvawhen,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,CardSympTimeOfAdm_ANGINA,CardSympTimeOfAdm_STEMI,CardSympTimeOfSurg_ANGINA,CardSympTimeOfSurg_STEMI,anginalclass_SLIGHT,anginalclass_REST,classnyh_SLIGHT,classnyh_REST,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvd,cva,cvdtia,cvdpcarsurg,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_50%-79%,cvdstenlft_80-99%,cvdstenlft_100%,Arrhythmia,arrhyafib,ArrhythAFlutter_REMOTE,ArrhythAFlutter_RECENT,ArrhythAFib_PAROXYSMAL,ArrhythAFib_CONTINOUS,ArrhythAFibDur_SHORT,ArrhythAFibDur_LONG,arrhythwhen_SHORT,arrhythwhen_LONG
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,5 days,0,0,0,0,0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0,1.0,1.0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,1,0
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,7 days,0,0,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,1.0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0


In [15]:
feature_matrix.shape

(19756, 131)

### Thresholding Feature Variance
- Albon 10.1, 10.2 pages 170-172

#### Variance Thresholding Numerical Features
- need to subset numerical features - but VT will not work when feature sets contain different units (page 171) as is the case here
- replace `NaN`s with `median`
- could use `sklearn` `VarianceThreshold`, but let's manually calculate variances and standard deviations

In [16]:
# names of the columns treated as numerical (everything else is handled as categorical)
numerical_features = [
    'age', 'heightcm', 'weightkg', 'bmi',
    'hct', 'creatlst', 'totalbumin', 'a1clvl',
    'meldscr', 'hdef', 'pasys',
    'length_stay',  # shown as "N days" in the frame, but still a numeric feature
]

In [17]:
len(numerical_features)

12

In [18]:
# independent copy of just the numerical columns, so later edits do not touch `feature_matrix`
numerical_df = feature_matrix[numerical_features].copy()

In [19]:
numerical_df.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,length_stay
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,5 days
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,7 days


In [20]:
numerical_df.shape

(19756, 12)

- replacing `NaN`s with `median`

In [21]:
numerical_df['a1clvl'].median()

5.9000001

In [22]:
numerical_df['meldscr'].median()

7.5

In [23]:
# per-column medians, used as the fill value for that column's NaNs
column_medians = numerical_df.median()
numerical_df = numerical_df.fillna(column_medians)

In [24]:
numerical_df.isnull().sum()

age            0
heightcm       0
weightkg       0
bmi            0
hct            0
creatlst       0
totalbumin     0
a1clvl         0
meldscr        0
hdef           0
pasys          0
length_stay    0
dtype: int64

In [25]:
numerical_df.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,length_stay
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,5 days
1,65,175.3,79.4,25.83787,45.0,1.2,3.8,5.9,7.5,55.0,40.0,7 days


- some additional pre-processing on `length_stay` 
- need to convert `timedelta` to `int`

In [26]:
# `length_stay` starts out as a timedelta; reduce it to whole days stored as int16.
# The dtype guard makes the cell idempotent: on a re-run the column is already
# an integer and the `.dt` accessor would otherwise raise AttributeError.
if pd.api.types.is_timedelta64_dtype(numerical_df['length_stay']):
    numerical_df['length_stay'] = numerical_df['length_stay'].dt.days.astype('int16')

### Calculating Variance of the Numerical Features
- per Albon page 171, `Variance Thresholding` will not work when feature sets contain different units (e.g., one feature is in years while a different feature is in kg)
- the `standard deviation` of the numerical features is calculated as well

In [27]:
# per-column variance, shown as a one-column table
numerical_df.var(axis=0).to_frame(name='variance')

Unnamed: 0,variance
age,122.44711
heightcm,117.83292
weightkg,417.78352
bmi,124.06981
hct,28.49372
creatlst,0.87786
totalbumin,0.27108
a1clvl,1.87235
meldscr,8.31259
hdef,150.6819


In [28]:
# per-column standard deviation, shown as a one-column table
numerical_df.std(axis=0).to_frame(name='standard_deviation')

Unnamed: 0,standard_deviation
age,11.06558
heightcm,10.85509
weightkg,20.43975
bmi,11.13866
hct,5.33795
creatlst,0.93694
totalbumin,0.52065
a1clvl,1.36834
meldscr,2.88316
hdef,12.27526


#### Variance Thresholding for categorical features

- how to select columns excluding a subset

In [29]:
# independent copy of the full feature matrix; the starting point for the categorical subset
cat_features_df = feature_matrix.copy()

In [30]:
# keep only the columns that are not listed in `numerical_features`
is_categorical = ~cat_features_df.columns.isin(numerical_features)
cat_features_df = cat_features_df.loc[:, is_categorical]

In [31]:
cat_features_df.head(2)

Unnamed: 0,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,dischdt_DayOfWeek_Mon,dischdt_DayOfWeek_Tues,dischdt_DayOfWeek_Thurs,dischdt_DayOfWeek_Fri,dischdt_DayOfWeek_Sat,dischdt_DayOfWeek_Sun,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,hitanti,TobaccoUse,cigsmoker,cigsmokercurr,chrlungd,prcvint,prcab,prvalve,chf,priorhf,medinotr,hdefd,vdaort,vdstena,vdstenm,hmo2,ivdrugab,alcohol,cvawhen,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,CardSympTimeOfAdm_ANGINA,CardSympTimeOfAdm_STEMI,CardSympTimeOfSurg_ANGINA,CardSympTimeOfSurg_STEMI,anginalclass_SLIGHT,anginalclass_REST,classnyh_SLIGHT,classnyh_REST,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvd,cva,cvdtia,cvdpcarsurg,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_50%-79%,cvdstenlft_80-99%,cvdstenlft_100%,Arrhythmia,arrhyafib,ArrhythAFlutter_REMOTE,ArrhythAFlutter_RECENT,ArrhythAFib_PAROXYSMAL,ArrhythAFib_CONTINOUS,ArrhythAFibDur_SHORT,ArrhythAFibDur_LONG,arrhythwhen_SHORT,arrhythwhen_LONG
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0,1.0,1.0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,1.0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0


In [32]:
cat_features_df.shape

(19756, 119)

- the idea is to eliminate features where the vast majority of observations fall into one class - say > 75%
- Albon uses the threshold of 75% of observations in one class (page 172)
- for binary features (i.e., Bernoulli random variables), variance is calculated as:
- `Var(x) = p(1 - p)`
- where `p` is the proportion of observations of class 1
- so if 75% is defined as the threshold, we want to remove features with `Variance < 0.1875` (i.e., `0.75 * 0.25`)

In [33]:
# variance of every categorical column, as a one-column table ordered largest first
cat_variance = (
    cat_features_df.var()
    .to_frame(name='variance')
    .sort_values('variance', ascending=False)
)

In [34]:
cat_variance.head()

Unnamed: 0,variance
status_URGENT,0.24992
anginalclass_REST,0.2498
diabetes,0.24245
vdaort,0.23348
surgdt_PartOfMonth_End,0.22194


- categorical features with `variance >= 0.15` (a looser cut-off than the `0.1875` derived above)

In [35]:
# rows whose variance is at least 0.15
cat_variance.loc[cat_variance['variance'].ge(0.15)]

Unnamed: 0,variance
status_URGENT,0.24992
anginalclass_REST,0.2498
diabetes,0.24245
vdaort,0.23348
surgdt_PartOfMonth_End,0.22194
prcvint,0.22064
surgdt_PartOfMonth_Beg,0.21877
gender,0.21058
cigsmoker,0.20434
vdstena,0.17324


In [36]:
# (rows, columns) of the table of features whose variance is at least 0.15
cat_variance.loc[cat_variance['variance'].ge(0.15)].shape

(20, 1)

### Handling Highly Correlated Features

- Albon 10.3, pages 172-174
- checking to see if features are highly correlated
- creating correlation matrix

In [37]:
# Absolute pairwise correlations between the features.
# `length_stay` is still a timedelta in `feature_matrix`, and the original result
# only left it out because older pandas silently dropped non-numeric columns in
# corr(). From pandas 2.0 `numeric_only` defaults to False, so the timedelta
# column is excluded explicitly to keep the same matrix on any version.
timedelta_cols = feature_matrix.select_dtypes(include=['timedelta']).columns
corr_matrix = feature_matrix.drop(columns=timedelta_cols).corr().abs()

In [38]:
type(corr_matrix)

pandas.core.frame.DataFrame

In [39]:
corr_matrix.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,dischdt_DayOfWeek_Mon,dischdt_DayOfWeek_Tues,dischdt_DayOfWeek_Thurs,dischdt_DayOfWeek_Fri,dischdt_DayOfWeek_Sat,dischdt_DayOfWeek_Sun,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,hitanti,TobaccoUse,cigsmoker,cigsmokercurr,chrlungd,prcvint,prcab,prvalve,chf,priorhf,medinotr,hdefd,vdaort,vdstena,vdstenm,hmo2,ivdrugab,alcohol,cvawhen,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,CardSympTimeOfAdm_ANGINA,CardSympTimeOfAdm_STEMI,CardSympTimeOfSurg_ANGINA,CardSympTimeOfSurg_STEMI,anginalclass_SLIGHT,anginalclass_REST,classnyh_SLIGHT,classnyh_REST,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvd,cva,cvdtia,cvdpcarsurg,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_50%-79%,cvdstenlft_80-99%,cvdstenlft_100%,Arrhythmia,arrhyafib,ArrhythAFlutter_REMOTE,ArrhythAFlutter_RECENT,ArrhythAFib_PAROXYSMAL,ArrhythAFib_CONTINOUS,ArrhythAFibDur_SHORT,ArrhythAFibDur_LONG,arrhythwhen_SHORT,arrhythwhen_LONG
age,1.0,0.14509,0.2109,0.08379,0.15212,0.00928,0.05879,0.12195,0.08528,0.0611,0.09755,0.00976,0.01644,0.00102,0.00104,0.00349,0.01144,0.00341,0.00368,0.00294,0.01497,0.01592,0.00993,0.00426,0.00945,0.00748,0.0176,0.01046,0.00293,0.00187,0.00391,0.01026,0.04483,0.00367,0.04098,0.06269,0.0818,0.12554,0.11827,0.02001,0.02239,0.00525,0.0249,0.00542,0.06931,0.09663,0.05713,0.13403,0.10125,0.08125,0.03213,0.05562,0.00508,0.01141,0.10655,0.08643,,0.05567,0.00895,0.01844,,0.28893,0.31577,0.02991,0.025,0.08903,0.03328,0.07504,0.08177,0.01182,0.00266,0.2389,0.26099,0.00442,,,0.06919,0.04995,,,0.00633,0.00567,0.0562,0.05707,,,,,0.00673,0.06428,0.0677,0.02333,0.03481,0.09803,0.07211,0.05838,0.04465,0.07756,0.08816,0.02145,0.05383,0.00078,0.01406,0.02082,0.06387,0.03578,0.01069,0.12378,0.04268,0.09039,0.1083,0.03445,0.01897,0.00894,,0.03194,0.01309,,0.02408,0.00014,,0.1433,,,,,,,0.16834,0.09795
heightcm,0.14509,1.0,0.37941,0.31306,0.22087,0.04963,0.05448,0.01277,0.06236,0.08756,0.05554,0.0134,0.00825,0.01442,0.006,0.01101,0.00897,0.00328,0.01661,0.00651,0.00277,0.01491,0.00676,0.01193,0.00056,0.00462,0.00391,0.00947,0.01442,0.00494,0.00692,0.01672,0.00969,0.01081,0.0204,0.02235,0.64529,0.03509,0.00883,0.05379,0.00325,0.00589,0.04607,0.03263,0.02212,0.00816,0.01334,0.03271,0.01785,0.0132,0.04985,0.00919,0.03412,0.04433,0.01034,0.02078,,0.00264,0.00044,0.01073,,0.04253,0.0363,0.00652,0.02056,0.0106,0.00777,0.05811,0.06314,0.00396,0.00971,0.07501,0.08268,0.06763,,,0.08735,0.03246,,,0.0489,0.00558,0.01938,0.06383,,,,,0.02476,0.01103,0.04701,0.0268,0.02129,0.04164,0.0416,0.033,0.02951,0.04912,0.06908,0.03151,0.00343,0.00323,0.00274,0.00172,0.00261,0.00595,0.00645,0.06136,0.03036,0.05316,0.0322,0.00444,0.02382,0.01867,,0.01974,0.00827,,0.02352,0.01586,,0.01388,,,,,,,0.02791,0.00428
weightkg,0.2109,0.37941,1.0,0.57497,0.15229,0.04192,0.01629,0.18166,0.0525,0.04337,0.11256,0.00543,0.00685,0.00482,0.00309,0.01007,0.00379,0.01044,0.00084,0.0033,0.00202,0.00889,0.0153,0.0167,0.00243,0.00172,0.00117,0.00051,0.00062,0.00179,0.00961,0.0114,0.01148,0.00101,0.00642,0.00592,0.29723,0.03339,0.00288,0.07703,0.0044,0.0117,0.02118,0.21678,0.1587,0.09357,0.00495,0.09741,0.01837,0.01602,0.26805,0.01177,0.03111,0.03921,0.01703,0.0542,,0.03907,0.0033,0.00901,,0.0525,0.06341,0.02094,0.04819,0.00388,0.01677,0.01559,0.00131,0.00422,0.00891,0.06318,0.00947,0.03464,,,0.0019,0.03137,,,0.05488,0.00593,0.06173,0.05917,,,,,0.02825,0.03383,0.01008,0.00962,0.02701,0.01763,0.05385,0.10889,0.01956,0.05435,0.0695,0.03351,0.00061,0.00803,0.00459,0.00786,0.00296,0.01545,0.00256,0.06,0.03298,0.02907,0.04504,0.0079,0.02192,0.01751,,0.01487,0.00249,,0.02429,0.0085,,0.02289,,,,,,,0.02237,0.00419
bmi,0.08379,0.31306,0.57497,1.0,0.01705,0.01291,0.02283,0.11373,0.01234,0.00207,0.10996,0.0078,0.0067,0.01719,0.00448,0.00279,0.00904,0.00708,0.00909,0.00429,0.01192,0.01364,0.01119,0.00478,0.00412,0.00061,0.00171,0.00497,0.00959,0.00175,0.00015,0.00919,0.0078,0.00065,0.00265,0.00382,0.03868,0.00571,0.00741,0.03571,0.00037,0.00562,0.00166,0.14885,0.11097,0.06198,0.00245,0.07165,0.01522,0.0167,0.14615,0.00678,0.00681,0.01256,0.01578,0.02633,,0.02824,0.00346,0.00254,,0.04583,0.05476,0.01127,0.0169,0.00291,0.01171,0.01405,0.0208,0.00188,0.01173,0.01924,0.02054,0.00135,,,0.02813,0.01001,,,0.01937,0.0089,0.03152,0.01697,,,,,0.01552,0.01833,0.0088,0.00885,0.01113,0.00322,0.0185,0.05778,0.00165,0.01731,0.02553,0.01265,0.00413,0.00751,0.00208,0.00581,0.00932,0.01644,0.00426,0.01641,0.0118,0.00169,0.01417,0.00933,0.0037,0.00538,,0.01315,0.00657,,0.00899,0.00586,,0.01504,,,,,,,0.0061,0.00214
hct,0.15212,0.22087,0.15229,0.01705,1.0,0.22058,0.38003,0.06385,0.27816,0.04424,0.17431,0.01661,0.00342,0.01177,0.00401,0.00091,0.00058,0.0041,0.00417,0.00856,5e-05,0.00091,0.00223,0.00632,0.00784,0.00919,0.01944,0.00861,0.00616,0.01336,0.02669,0.00295,0.03407,0.03052,0.02834,0.05614,0.2828,0.13059,0.14078,0.00086,0.0023,0.003,0.02196,0.15522,0.1637,0.00297,0.20992,0.07359,0.12778,0.14278,0.02261,0.03576,0.09469,0.03257,0.05495,0.10317,,0.03971,0.02227,0.02222,,0.04185,0.05594,0.09869,0.03799,0.03411,0.07416,0.18237,0.14565,0.05719,0.00901,0.06491,0.0194,0.05016,,,0.05648,0.08822,,,0.02257,0.01835,0.02251,0.01555,,,,,0.06742,0.05249,0.10015,0.15831,0.01628,0.05507,0.10978,0.02151,0.00372,0.07131,0.10176,0.04964,0.05073,0.0225,0.03373,0.01488,0.20957,0.01246,0.00361,0.12138,0.09697,0.05373,0.06678,0.02214,0.03541,0.02041,,0.0304,0.00461,,0.04587,0.0041,,0.06016,,,,,,,0.07108,0.016


- selecting upper triangle of correlation matrix

In [40]:
# Boolean mask that is True strictly above the main diagonal (k=1), so every
# feature pair is kept once and the self-correlations on the diagonal are masked.
# The builtin `bool` replaces the `np.bool` alias, which NumPy deprecated in 1.20
# and removed in 1.24.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
# entries outside the mask become NaN
upper = corr_matrix.where(upper_mask)

In [41]:
type(upper)

pandas.core.frame.DataFrame

In [42]:
upper.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,dischdt_DayOfWeek_Mon,dischdt_DayOfWeek_Tues,dischdt_DayOfWeek_Thurs,dischdt_DayOfWeek_Fri,dischdt_DayOfWeek_Sat,dischdt_DayOfWeek_Sun,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,hitanti,TobaccoUse,cigsmoker,cigsmokercurr,chrlungd,prcvint,prcab,prvalve,chf,priorhf,medinotr,hdefd,vdaort,vdstena,vdstenm,hmo2,ivdrugab,alcohol,cvawhen,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,CardSympTimeOfAdm_ANGINA,CardSympTimeOfAdm_STEMI,CardSympTimeOfSurg_ANGINA,CardSympTimeOfSurg_STEMI,anginalclass_SLIGHT,anginalclass_REST,classnyh_SLIGHT,classnyh_REST,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvd,cva,cvdtia,cvdpcarsurg,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_50%-79%,cvdstenlft_80-99%,cvdstenlft_100%,Arrhythmia,arrhyafib,ArrhythAFlutter_REMOTE,ArrhythAFlutter_RECENT,ArrhythAFib_PAROXYSMAL,ArrhythAFib_CONTINOUS,ArrhythAFibDur_SHORT,ArrhythAFibDur_LONG,arrhythwhen_SHORT,arrhythwhen_LONG
age,,0.14509,0.2109,0.08379,0.15212,0.00928,0.05879,0.12195,0.08528,0.0611,0.09755,0.00976,0.01644,0.00102,0.00104,0.00349,0.01144,0.00341,0.00368,0.00294,0.01497,0.01592,0.00993,0.00426,0.00945,0.00748,0.0176,0.01046,0.00293,0.00187,0.00391,0.01026,0.04483,0.00367,0.04098,0.06269,0.0818,0.12554,0.11827,0.02001,0.02239,0.00525,0.0249,0.00542,0.06931,0.09663,0.05713,0.13403,0.10125,0.08125,0.03213,0.05562,0.00508,0.01141,0.10655,0.08643,,0.05567,0.00895,0.01844,,0.28893,0.31577,0.02991,0.025,0.08903,0.03328,0.07504,0.08177,0.01182,0.00266,0.2389,0.26099,0.00442,,,0.06919,0.04995,,,0.00633,0.00567,0.0562,0.05707,,,,,0.00673,0.06428,0.0677,0.02333,0.03481,0.09803,0.07211,0.05838,0.04465,0.07756,0.08816,0.02145,0.05383,0.00078,0.01406,0.02082,0.06387,0.03578,0.01069,0.12378,0.04268,0.09039,0.1083,0.03445,0.01897,0.00894,,0.03194,0.01309,,0.02408,0.00014,,0.1433,,,,,,,0.16834,0.09795
heightcm,,,0.37941,0.31306,0.22087,0.04963,0.05448,0.01277,0.06236,0.08756,0.05554,0.0134,0.00825,0.01442,0.006,0.01101,0.00897,0.00328,0.01661,0.00651,0.00277,0.01491,0.00676,0.01193,0.00056,0.00462,0.00391,0.00947,0.01442,0.00494,0.00692,0.01672,0.00969,0.01081,0.0204,0.02235,0.64529,0.03509,0.00883,0.05379,0.00325,0.00589,0.04607,0.03263,0.02212,0.00816,0.01334,0.03271,0.01785,0.0132,0.04985,0.00919,0.03412,0.04433,0.01034,0.02078,,0.00264,0.00044,0.01073,,0.04253,0.0363,0.00652,0.02056,0.0106,0.00777,0.05811,0.06314,0.00396,0.00971,0.07501,0.08268,0.06763,,,0.08735,0.03246,,,0.0489,0.00558,0.01938,0.06383,,,,,0.02476,0.01103,0.04701,0.0268,0.02129,0.04164,0.0416,0.033,0.02951,0.04912,0.06908,0.03151,0.00343,0.00323,0.00274,0.00172,0.00261,0.00595,0.00645,0.06136,0.03036,0.05316,0.0322,0.00444,0.02382,0.01867,,0.01974,0.00827,,0.02352,0.01586,,0.01388,,,,,,,0.02791,0.00428
weightkg,,,,0.57497,0.15229,0.04192,0.01629,0.18166,0.0525,0.04337,0.11256,0.00543,0.00685,0.00482,0.00309,0.01007,0.00379,0.01044,0.00084,0.0033,0.00202,0.00889,0.0153,0.0167,0.00243,0.00172,0.00117,0.00051,0.00062,0.00179,0.00961,0.0114,0.01148,0.00101,0.00642,0.00592,0.29723,0.03339,0.00288,0.07703,0.0044,0.0117,0.02118,0.21678,0.1587,0.09357,0.00495,0.09741,0.01837,0.01602,0.26805,0.01177,0.03111,0.03921,0.01703,0.0542,,0.03907,0.0033,0.00901,,0.0525,0.06341,0.02094,0.04819,0.00388,0.01677,0.01559,0.00131,0.00422,0.00891,0.06318,0.00947,0.03464,,,0.0019,0.03137,,,0.05488,0.00593,0.06173,0.05917,,,,,0.02825,0.03383,0.01008,0.00962,0.02701,0.01763,0.05385,0.10889,0.01956,0.05435,0.0695,0.03351,0.00061,0.00803,0.00459,0.00786,0.00296,0.01545,0.00256,0.06,0.03298,0.02907,0.04504,0.0079,0.02192,0.01751,,0.01487,0.00249,,0.02429,0.0085,,0.02289,,,,,,,0.02237,0.00419
bmi,,,,,0.01705,0.01291,0.02283,0.11373,0.01234,0.00207,0.10996,0.0078,0.0067,0.01719,0.00448,0.00279,0.00904,0.00708,0.00909,0.00429,0.01192,0.01364,0.01119,0.00478,0.00412,0.00061,0.00171,0.00497,0.00959,0.00175,0.00015,0.00919,0.0078,0.00065,0.00265,0.00382,0.03868,0.00571,0.00741,0.03571,0.00037,0.00562,0.00166,0.14885,0.11097,0.06198,0.00245,0.07165,0.01522,0.0167,0.14615,0.00678,0.00681,0.01256,0.01578,0.02633,,0.02824,0.00346,0.00254,,0.04583,0.05476,0.01127,0.0169,0.00291,0.01171,0.01405,0.0208,0.00188,0.01173,0.01924,0.02054,0.00135,,,0.02813,0.01001,,,0.01937,0.0089,0.03152,0.01697,,,,,0.01552,0.01833,0.0088,0.00885,0.01113,0.00322,0.0185,0.05778,0.00165,0.01731,0.02553,0.01265,0.00413,0.00751,0.00208,0.00581,0.00932,0.01644,0.00426,0.01641,0.0118,0.00169,0.01417,0.00933,0.0037,0.00538,,0.01315,0.00657,,0.00899,0.00586,,0.01504,,,,,,,0.0061,0.00214
hct,,,,,,0.22058,0.38003,0.06385,0.27816,0.04424,0.17431,0.01661,0.00342,0.01177,0.00401,0.00091,0.00058,0.0041,0.00417,0.00856,5e-05,0.00091,0.00223,0.00632,0.00784,0.00919,0.01944,0.00861,0.00616,0.01336,0.02669,0.00295,0.03407,0.03052,0.02834,0.05614,0.2828,0.13059,0.14078,0.00086,0.0023,0.003,0.02196,0.15522,0.1637,0.00297,0.20992,0.07359,0.12778,0.14278,0.02261,0.03576,0.09469,0.03257,0.05495,0.10317,,0.03971,0.02227,0.02222,,0.04185,0.05594,0.09869,0.03799,0.03411,0.07416,0.18237,0.14565,0.05719,0.00901,0.06491,0.0194,0.05016,,,0.05648,0.08822,,,0.02257,0.01835,0.02251,0.01555,,,,,0.06742,0.05249,0.10015,0.15831,0.01628,0.05507,0.10978,0.02151,0.00372,0.07131,0.10176,0.04964,0.05073,0.0225,0.03373,0.01488,0.20957,0.01246,0.00361,0.12138,0.09697,0.05373,0.06678,0.02214,0.03541,0.02041,,0.0304,0.00461,,0.04587,0.0041,,0.06016,,,,,,,0.07108,0.016


- finding the names of feature columns that have a correlation greater than `0.95` with at least one other feature

In [43]:
# Flag each column of the upper-triangle matrix that holds at least one value above 0.95
# (NaN cells compare as False), then keep the names of the flagged columns in their original order.
high_corr_features = upper.columns[(upper > 0.95).any(axis=0).values].tolist()

In [44]:
# Show the columns flagged as having a correlation above 0.95 with another feature.
high_corr_features

['cva']

In [45]:
# Rows of the correlation matrix whose entry in the `cva` column exceeds 0.95,
# i.e. the features responsible for `cva` being flagged.
upper.loc[upper['cva'] > 0.95]

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,dischdt_DayOfWeek_Mon,dischdt_DayOfWeek_Tues,dischdt_DayOfWeek_Thurs,dischdt_DayOfWeek_Fri,dischdt_DayOfWeek_Sat,dischdt_DayOfWeek_Sun,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,hitanti,TobaccoUse,cigsmoker,cigsmokercurr,chrlungd,prcvint,prcab,prvalve,chf,priorhf,medinotr,hdefd,vdaort,vdstena,vdstenm,hmo2,ivdrugab,alcohol,cvawhen,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,CardSympTimeOfAdm_ANGINA,CardSympTimeOfAdm_STEMI,CardSympTimeOfSurg_ANGINA,CardSympTimeOfSurg_STEMI,anginalclass_SLIGHT,anginalclass_REST,classnyh_SLIGHT,classnyh_REST,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvd,cva,cvdtia,cvdpcarsurg,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_50%-79%,cvdstenlft_80-99%,cvdstenlft_100%,Arrhythmia,arrhyafib,ArrhythAFlutter_REMOTE,ArrhythAFlutter_RECENT,ArrhythAFib_PAROXYSMAL,ArrhythAFib_CONTINOUS,ArrhythAFibDur_SHORT,ArrhythAFibDur_LONG,arrhythwhen_SHORT,arrhythwhen_LONG
cvawhen,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.01824,0.01486,0.03244,0.03107,,,,,0.02535,0.00709,0.03748,0.03995,0.00629,0.0165,0.02847,0.00265,0.00759,0.02276,0.02193,0.00679,0.03419,0.01546,0.00563,0.00345,0.01971,0.00035,0.00238,0.6314,0.98202,0.12891,0.10898,0.04839,0.05151,0.03439,,0.03672,0.04631,,0.03526,0.0488,,0.04434,,,,,,,0.0543,0.0286


- `cva` is highly correlated to `cvawhen` - not a surprise, but may consider dropping one of those features
- find out exactly what `cvawhen` is

### Removing Irrelevant Features for Classification

- Albon 10.4, pages 174-176
- for `categorical` variables, calculate a `chi-square` statistic between each `feature` and the `target` vector
- for `quantitative` variables, compute the `ANOVA F-value` between each `feature` and the `target` vector

- if we want the top 75% of features, make `k = 0.75 * number of features`, rounded up to the nearest whole number because `k` must be an integer

In [46]:
# 75% of the number of categorical feature columns (may be fractional).
cat_features_df.shape[1] * 0.75

89.25

- creating `target` vector

In [47]:
# Copy only the label column; copying the entire `data` frame just to pull out
# one column is unnecessary work. The result is still independent of `data`.
target = data['strokeBin'].copy()

In [48]:
# Length of the target vector; it should match the row count of the feature frame.
target.shape

(19756,)

In [49]:
# (rows, columns) of the categorical feature frame, for comparison with the target length.
cat_features_df.shape

(19756, 119)

- selecting the top `90` features with the highest `chi-squared` statistics
- the `chi-square` statistic examines the independence of two categorical vectors - that is, the statistic measures the difference between the observed number of observations in each class of a categorical feature and the number we would expect if that feature were independent of (i.e., had no relationship with) the target vector
- A `chi-square` statistic is a single number that tells you how much difference exists between your observed counts and the counts you would expect if there were no relationship at all in the population - by calculating the `chi-square` statistic between a feature and the target, we obtain a measurement of the independence between the two
- If the target is independent of the feature variable, then it is irrelevant for our purposes because it contains no information we can use for classification.
- On the other hand, if the feature and the target are highly dependent, the feature is likely very informative for training models.

In [50]:
# Keep the top 75% of the categorical features ranked by chi-square score.
# `k` must be a whole number, so the fractional count (89.25 for 119 columns) is rounded up;
# deriving it from the frame keeps it correct if the number of columns changes.
chi2_selector = SelectKBest(chi2, k=math.ceil(0.75 * cat_features_df.shape[1]))

In [51]:
# Score every categorical feature against the target ...
chi2_selector.fit(cat_features_df, target)
# ... then keep only the columns the selector picked.
cat_features_kbest = chi2_selector.transform(cat_features_df)

- getting the names of the features selected

In [52]:
# Boolean mask over the input columns: True where the selector kept the feature.
cat_kbest_feature_mask = chi2_selector.get_support(indices=False)

In [53]:
# The mask should hold one entry per column of `cat_features_df`.
cat_kbest_feature_mask.shape

(119,)

In [54]:
# One-column frame listing every categorical feature name, in column order.
cat_kbest_features_df = pd.DataFrame({"Feature": cat_features_df.columns.tolist()})

In [55]:
# Peek at the first rows of the feature-name frame.
cat_kbest_features_df.head()

Unnamed: 0,Feature
0,surgdt_month_Jan
1,surgdt_month_Feb
2,surgdt_month_Mar
3,surgdt_month_Apr
4,surgdt_month_May


In [56]:
# Before masking: one row per categorical feature.
cat_kbest_features_df.shape

(119, 1)

- now use the `cat_kbest_feature_mask`

In [57]:
# Rebuild the name frame from `cat_features_df` before masking so this cell can be re-run:
# filtering `cat_kbest_features_df` with itself as input left it shorter than the mask,
# so a second execution failed on the length mismatch. Row labels are the original
# column positions, exactly as before. The mask is already boolean, so no `== True` is needed.
cat_kbest_features_df = pd.DataFrame(cat_features_df.columns.tolist(),
                                     columns=["Feature"])[cat_kbest_feature_mask]

- can turn this to a list to select features from main dataframe
- `cat_kbest_features` = `list(cat_kbest_features_df['Feature'].values)`
- `type(cat_kbest_features_df['Feature'].values)` yields a `numpy.ndarray`

In [58]:
# After masking: only the rows for the selected features remain.
cat_kbest_features_df.shape

(90, 1)

In [59]:
# First few selected feature names.
cat_kbest_features_df.head()

Unnamed: 0,Feature
0,surgdt_month_Jan
1,surgdt_month_Feb
2,surgdt_month_Mar
3,surgdt_month_Apr
4,surgdt_month_May


#### Let's look closely at the `chi-square` scores using the `scores_` and `pvalues_` attributes for `chi2_selector` object

In [60]:
# Line up each categorical feature name with its chi-square score and p-value
# as reported by the fitted selector.
cat_chi2_summary = pd.DataFrame({
    'Feature': cat_features_df.columns.tolist(),
    'chi-square_statistic': chi2_selector.scores_,
    'pvalue': chi2_selector.pvalues_,
})

In [61]:
# Preview the per-feature chi-square scores and p-values.
cat_chi2_summary.head()

Unnamed: 0,Feature,chi-square_statistic,pvalue
0,surgdt_month_Jan,1.43775,0.2305
1,surgdt_month_Feb,0.24548,0.62028
2,surgdt_month_Mar,0.94788,0.33026
3,surgdt_month_Apr,0.9278,0.33544
4,surgdt_month_May,0.3832,0.5359


- assuming a significance level of 0.05, if `pvalue` `<=` `0.05` we can reject the null hypothesis that the feature and target are independent -- which is what we want

#### Finding features where `pvalue` `<=` `0.05`

In [62]:
# Keep the categorical features whose p-value is at most 0.05, most significant first.
is_significant = cat_chi2_summary['pvalue'] <= 0.05
sig_cat_features = cat_chi2_summary.loc[is_significant].sort_values(by='pvalue')

In [63]:
# Row count = number of categorical features with p-value <= 0.05.
sig_cat_features.shape

(32, 3)

In [64]:
# Significant categorical features, smallest p-value first.
sig_cat_features

Unnamed: 0,Feature,chi-square_statistic,pvalue
96,cvd,32.99659,9.23207e-09
80,classnyh_REST,24.90763,6.01438e-07
108,cvdstenlft_100%,22.1414,2.53288e-06
97,cva,19.92291,8.06284e-06
66,cvawhen,18.09469,2.10187e-05
56,chf,14.80831,0.00011901
83,vdinsufm_MODERATE,13.85015,0.000197981
44,pvd,12.90752,0.000327265
86,vdinsuft_MILD,12.74477,0.000357006
19,dischdt_DayOfWeek_Mon,10.31711,0.00131802


- confirming that `significant` features were in `SelectKBest`

In [65]:
# Flag (1/0) whether each significant feature is among the SelectKBest picks.
# A vectorised `isin` against the `Feature` column replaces the Python loop that
# tested membership in the frame's entire 2-D `.values` array on every iteration.
in_KBest = (sig_cat_features['Feature']
            .isin(cat_kbest_features_df['Feature'])
            .astype(int)
            .tolist())

In [66]:
# Attach the 1/0 membership flags as a new `in_KBest` column.
sig_cat_features = sig_cat_features.assign(in_KBest=in_KBest)

In [67]:
# Same table, now including the `in_KBest` flag column.
sig_cat_features

Unnamed: 0,Feature,chi-square_statistic,pvalue,in_KBest
96,cvd,32.99659,9.23207e-09,1
80,classnyh_REST,24.90763,6.01438e-07,1
108,cvdstenlft_100%,22.1414,2.53288e-06,1
97,cva,19.92291,8.06284e-06,1
66,cvawhen,18.09469,2.10187e-05,1
56,chf,14.80831,0.00011901,1
83,vdinsufm_MODERATE,13.85015,0.000197981,1
44,pvd,12.90752,0.000327265,1
86,vdinsuft_MILD,12.74477,0.000357006,1
19,dischdt_DayOfWeek_Mon,10.31711,0.00131802,1


- confirming

In [68]:
# Zero means every significant feature carries the in_KBest flag.
sig_cat_features['in_KBest'].sum() - len(sig_cat_features)

0

#### Now let's use the same coding pattern with the numerical features, except using `ANOVA F-value` between each numerical feature and target vector

In [69]:
# Preview the numerical feature frame.
numerical_df.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,length_stay
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,5
1,65,175.3,79.4,25.83787,45.0,1.2,3.8,5.9,7.5,55.0,40.0,7
2,83,162.60001,102.1,38.61754,29.0,1.2,3.3,6.2,8.6,60.0,36.0,8
3,59,160.0,127.5,49.80469,35.0,0.9,3.5,7.4,6.4,60.0,35.0,4
4,72,160.0,64.0,25.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,4


In [70]:
# (rows, columns) of the numerical feature frame.
numerical_df.shape

(19756, 12)

In [71]:
# Target length, for comparison with the row count of `numerical_df`.
target.shape

(19756,)

- instantiating a `SelectKBest` object

In [72]:
# Selector that ranks features by ANOVA F-value and keeps the best 6.
fvalue_selector = SelectKBest(score_func=f_classif, k=6)

In [73]:
# Score every numerical feature against the target ...
fvalue_selector.fit(numerical_df, target)
# ... then keep only the columns the selector picked.
num_features_kbest = fvalue_selector.transform(numerical_df)

- getting the names of the features selected

In [74]:
# Boolean mask over the numerical columns: True where the selector kept the feature.
num_kbest_feature_mask = fvalue_selector.get_support(indices=False)

In [75]:
# First five mask entries.
num_kbest_feature_mask[:5]

array([ True,  True,  True, False,  True])

In [76]:
# The mask should hold one entry per column of `numerical_df`.
num_kbest_feature_mask.shape

(12,)

- creating a `dataframe` of the numerical feature names

In [77]:
# One-column frame listing every numerical feature name, in column order.
num_kbest_features_df = pd.DataFrame({"Feature": numerical_df.columns.tolist()})

- now use the `num_kbest_feature_mask`

In [78]:
# Rebuild the name frame from `numerical_df` before masking so this cell can be re-run:
# filtering `num_kbest_features_df` with itself as input left it shorter than the mask,
# so a second execution failed on the length mismatch. Row labels are the original
# column positions, exactly as before. The mask is already boolean, so no `== True` is needed.
num_kbest_features_df = pd.DataFrame(numerical_df.columns.tolist(),
                                     columns=["Feature"])[num_kbest_feature_mask]

- yields the `6` best features

In [79]:
# Names of the numerical features kept by `fvalue_selector`.
num_kbest_features_df

Unnamed: 0,Feature
0,age
1,heightcm
2,weightkg
4,hct
10,pasys
11,length_stay


#### Let's look closely at the `F-values` scores using the `scores_` and `pvalues_` attributes for `fvalue_selector` object

In [80]:
# Line up each numerical feature name with its F-value and p-value
# as reported by the fitted selector.
num_fvalue_summary = pd.DataFrame({
    'Feature': numerical_df.columns.tolist(),
    'F_value_statistic': fvalue_selector.scores_,
    'pvalue': fvalue_selector.pvalues_,
})

In [81]:
# F-value and p-value for every numerical feature.
num_fvalue_summary

Unnamed: 0,Feature,F_value_statistic,pvalue
0,age,44.78792,2.25462e-11
1,heightcm,18.85836,1.41493e-05
2,weightkg,25.48905,4.48857e-07
3,bmi,5.09254,0.0240399
4,hct,19.45003,1.03812e-05
5,creatlst,3.96882,0.046364
6,totalbumin,16.70362,4.38702e-05
7,a1clvl,2.16525,0.141178
8,meldscr,5.32159,0.0210731
9,hdef,10.02233,0.00154889


#### Finding features where `pvalue` `<=` `0.05`

In [82]:
# Keep the numerical features whose p-value is at most 0.05, most significant first.
is_significant = num_fvalue_summary['pvalue'] <= 0.05
sig_num_features = num_fvalue_summary.loc[is_significant].sort_values(by='pvalue')

In [83]:
# Significant numerical features, smallest p-value first.
# NOTE(review): `length_stay` has by far the largest F-value; if it is only known after
# the outcome occurs it would leak the target -- confirm before using it for modelling.
sig_num_features

Unnamed: 0,Feature,F_value_statistic,pvalue
11,length_stay,432.27065,5.4158e-95
0,age,44.78792,2.25462e-11
2,weightkg,25.48905,4.48857e-07
4,hct,19.45003,1.03812e-05
1,heightcm,18.85836,1.41493e-05
10,pasys,17.37037,3.08892e-05
6,totalbumin,16.70362,4.38702e-05
9,hdef,10.02233,0.00154889
8,meldscr,5.32159,0.0210731
3,bmi,5.09254,0.0240399


In [84]:
# Row count = number of numerical features with p-value <= 0.05.
sig_num_features.shape

(11, 3)

- `11` of the `12` `numerical` features are significant at the `0.05` level; only `a1clvl` (`pvalue` of about `0.14`) is not