# Analysis

In [112]:
import os

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns

In [6]:
os.chdir('../data')
DATA_DIR = os.getcwd()

## Convert data to proper dtypes

In [7]:
float16_cols = ['oldpeak_eq_st_depression']
categorical_cols = ['thal']
uint8_cols = ['sex', 'fasting_blood_sugar_gt_120_mg_per_dl', 'exercise_induced_angina', 'slope_of_peak_exercise_st_segment', 'resting_blood_pressure', 'chest_pain_type', 'num_major_vessels', 'resting_ekg_results', 'age', 'max_heart_rate_achieved']
uint16_cols = ['serum_cholesterol_mg_per_dl']
dtype_dict = {}
for c in float16_cols:
    dtype_dict[c] = np.float16
for c in categorical_cols:
    dtype_dict[c] = np.object
for c in uint8_cols:
    dtype_dict[c] = np.uint8
for c in uint16_cols:
    dtype_dict[c] = np.uint16

In [82]:
df_x = pd.read_csv(DATA_DIR + '/raw/train_values.csv', dtype=dtype_dict)
df_y = pd.read_csv(DATA_DIR + '/raw/train_labels.csv', dtype=dtype_dict)
df_x_test = pd.read_csv(DATA_DIR + '/raw/test_values.csv', dtype=dtype_dict)

In [27]:
df_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 14 columns):
patient_id                              180 non-null object
slope_of_peak_exercise_st_segment       180 non-null uint8
thal                                    180 non-null object
resting_blood_pressure                  180 non-null uint8
chest_pain_type                         180 non-null uint8
num_major_vessels                       180 non-null uint8
fasting_blood_sugar_gt_120_mg_per_dl    180 non-null uint8
resting_ekg_results                     180 non-null uint8
serum_cholesterol_mg_per_dl             180 non-null uint16
oldpeak_eq_st_depression                180 non-null float16
sex                                     180 non-null uint8
age                                     180 non-null uint8
max_heart_rate_achieved                 180 non-null uint8
exercise_induced_angina                 180 non-null uint8
dtypes: float16(1), object(2), uint16(1), uint8(10)
memory usage

## Scale numerical data with MinMaxScaler() 

In [55]:
cols_for_scaler = ['age', 'max_heart_rate_achieved', 'oldpeak_eq_st_depression', 'resting_blood_pressure', 'serum_cholesterol_mg_per_dl']

In [83]:
min_max_scaler = MinMaxScaler(copy=False)
df_x[cols_for_scaler] = min_max_scaler.fit_transform(df_x[cols_for_scaler])
df_x_test[cols_for_scaler] = min_max_scaler.transform(df_x_test[cols_for_scaler])



  return self.partial_fit(X, y)


In [84]:
# Even though the data set is small, keep memory footprint low
df_x[cols_for_scaler] = df_x[cols_for_scaler].apply(pd.to_numeric, downcast='float')
df_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 14 columns):
patient_id                              180 non-null object
slope_of_peak_exercise_st_segment       180 non-null uint8
thal                                    180 non-null object
resting_blood_pressure                  180 non-null float32
chest_pain_type                         180 non-null uint8
num_major_vessels                       180 non-null uint8
fasting_blood_sugar_gt_120_mg_per_dl    180 non-null uint8
resting_ekg_results                     180 non-null uint8
serum_cholesterol_mg_per_dl             180 non-null float32
oldpeak_eq_st_depression                180 non-null float32
sex                                     180 non-null uint8
age                                     180 non-null float32
max_heart_rate_achieved                 180 non-null float32
exercise_induced_angina                 180 non-null uint8
dtypes: float32(5), object(2), uint8(7)
memory usage: 7.6

In [34]:
df_x.head()

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,0z64un,1,normal,0.395349,2,0,0,2,0.415525,0.0,1,0.333333,0.698113,0
1,ryoo3j,2,normal,0.186047,3,0,0,0,0.200913,0.258065,0,0.520833,0.584906,0
2,yt1s1x,1,normal,0.360465,4,3,0,2,0.406393,0.0,1,1.0,0.622642,1
3,l2xjde,1,reversible_defect,0.674419,4,0,0,0,0.221461,0.0,1,0.229167,0.801887,0
4,oyt4ek,3,reversible_defect,0.976744,1,0,0,2,0.328767,0.677419,1,0.625,0.462264,0


## One-hot encode categorical data

In [65]:
label_enc_cols = ['thal']
one_hot_cols = ['slope_of_peak_exercise_st_segment', 'num_major_vessels', 'chest_pain_type', 'resting_ekg_results', 'sex', 'thal']

In [85]:
label_encoder = LabelEncoder()
label_encoder.fit(['normal', 'fixed_defect', 'reversible_defect'])

LabelEncoder()

In [86]:
# encode the 'thal' labels as integers
df_x[label_enc_cols] = label_encoder.transform(df_x[label_enc_cols].values.ravel())
df_x_test[label_enc_cols] = label_encoder.transform(df_x_test[label_enc_cols].values.ravel())

In [87]:
df_x.head()

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,0z64un,1,1,0.395349,2,0,0,2,0.415525,0.0,1,0.333333,0.698113,0
1,ryoo3j,2,1,0.186047,3,0,0,0,0.200913,0.258065,0,0.520833,0.584906,0
2,yt1s1x,1,1,0.360465,4,3,0,2,0.406393,0.0,1,1.0,0.622642,1
3,l2xjde,1,2,0.674419,4,0,0,0,0.221461,0.0,1,0.229167,0.801887,0
4,oyt4ek,3,2,0.976744,1,0,0,2,0.328767,0.677419,1,0.625,0.462264,0


In [88]:
# one-hot encode
df_x = pd.get_dummies(df_x, columns=one_hot_cols)
df_x_test = pd.get_dummies(df_x_test, columns=one_hot_cols)

In [89]:
df_x.head()

Unnamed: 0,patient_id,resting_blood_pressure,fasting_blood_sugar_gt_120_mg_per_dl,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,age,max_heart_rate_achieved,exercise_induced_angina,slope_of_peak_exercise_st_segment_1,slope_of_peak_exercise_st_segment_2,...,chest_pain_type_3,chest_pain_type_4,resting_ekg_results_0,resting_ekg_results_1,resting_ekg_results_2,sex_0,sex_1,thal_0,thal_1,thal_2
0,0z64un,0.395349,0,0.415525,0.0,0.333333,0.698113,0,1,0,...,0,0,0,0,1,0,1,0,1,0
1,ryoo3j,0.186047,0,0.200913,0.258065,0.520833,0.584906,0,0,1,...,1,0,1,0,0,1,0,0,1,0
2,yt1s1x,0.360465,0,0.406393,0.0,1.0,0.622642,1,1,0,...,0,1,0,0,1,0,1,0,1,0
3,l2xjde,0.674419,0,0.221461,0.0,0.229167,0.801887,0,1,0,...,0,1,1,0,0,0,1,0,0,1
4,oyt4ek,0.976744,0,0.328767,0.677419,0.625,0.462264,0,0,0,...,0,0,0,0,1,0,1,0,0,1


In [91]:
df_x_test.shape

(90, 27)

# Train models


## Logistic Regression

In [94]:
log_reg = LogisticRegression(random_state=0, solver='liblinear')


In [102]:
X = df_x.drop('patient_id', axis=1)
y = df_y.drop('patient_id', axis=1).values.ravel()

In [103]:
log_reg.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [104]:
log_reg.score(X, y)

0.8666666666666667

## Random Forest

In [105]:
rf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)

In [106]:
rf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [107]:
rf.score(X, y)

0.8388888888888889

# Naive Bayes

In [108]:
gnb = GaussianNB()

In [109]:
gnb.fit(X, y)

GaussianNB(priors=None, var_smoothing=1e-09)

In [110]:
gnb.score(X, y)

0.7833333333333333

In [116]:
rs = 0
names = ["QDA", "AdaBoost", "RF", "Gaussian Process", "Logistic Regression", "NB", "Nearest Neighbors", "MLP",
         "Linear SVM", "RBF SVM", "Decision Tree"]
classifiers = [
    QuadraticDiscriminantAnalysis(),
    AdaBoostClassifier(random_state=rs),
    RandomForestClassifier(max_depth=5, n_estimators=10, random_state=rs),
    GaussianProcessClassifier(random_state=rs, kernel=RBF(1.0)),
    LogisticRegression(random_state=rs, solver='liblinear'),
    GaussianNB(),
    KNeighborsClassifier(3),
    MLPClassifier(alpha=1, random_state=rs),
    SVC(kernel='linear', C=0.025, random_state=rs),
    SVC(gamma=2, C=1, random_state=rs),
    DecisionTreeClassifier(max_depth=5, random_state=rs)
]

In [117]:
for name, clf in zip(names, classifiers):
    clf.fit(X, y)
    score = clf.score(X, y)
    print('='*20)
    print(name + ': {}'.format(score))



QDA: 0.6555555555555556
AdaBoost: 0.95
RF: 0.9444444444444444
Gaussian Process: 0.8888888888888888
Logistic Regression: 0.8666666666666667
NB: 0.7833333333333333
Nearest Neighbors: 0.9111111111111111
MLP: 0.9055555555555556
Linear SVM: 0.8388888888888889
RBF SVM: 0.9722222222222222
Decision Tree: 0.9333333333333333


