In [25]:
import numpy as np
import pandas as pd
from sklearn import ensemble
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neural_network import MLPClassifier

In [2]:
# https://www.kaggle.com/ronitf/heart-disease-uci
df = pd.read_csv('dataset_heart_disease.csv')

In [3]:
# > 1. age
# > 2. sex
# > 3. chest pain type (4 values)
# > 4. resting blood pressure
# > 5. serum cholestoral in mg/dl
# > 6. fasting blood sugar > 120 mg/dl
# > 7. resting electrocardiographic results (values 0,1,2)
# > 8. maximum heart rate achieved
# > 9. exercise induced angina
# > 10. oldpeak = ST depression induced by exercise relative to rest
# > 11. the slope of the peak exercise ST segment
# > 12. number of major vessels (0-3) colored by flourosopy
# > 13. thal: 3 = normal; 6 = fixed defect; 7 = reversable defect

In [4]:
#renaming a few columns to better describe data
df = df.rename(columns={'cp': 'pain_type', 'trestbps': 'resting_bp', 'chol': 'cholesteral', 'fbs': 'fast_blood_sug',
                  'thalach':'max_heart_rate', 'exang': 'exercise_angina', 'ca': 'colored_vessels'})

In [5]:
df.head()

Unnamed: 0,age,sex,pain_type,resting_bp,cholesteral,fast_blood_sug,restecg,max_heart_rate,exercise_angina,oldpeak,slope,colored_vessels,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [11]:
df.dtypes

age                  int64
sex                  int64
pain_type            int64
resting_bp           int64
cholesteral          int64
fast_blood_sug       int64
restecg              int64
max_heart_rate       int64
exercise_angina      int64
oldpeak            float64
slope                int64
colored_vessels      int64
thal                 int64
target               int64
dtype: object

In [6]:
df.describe()

Unnamed: 0,age,sex,pain_type,resting_bp,cholesteral,fast_blood_sug,restecg,max_heart_rate,exercise_angina,oldpeak,slope,colored_vessels,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


# Data exploration

In [7]:
abs(df.corr()['target']).sort_values(ascending=False)

target             1.000000
exercise_angina    0.436757
pain_type          0.433798
oldpeak            0.430696
max_heart_rate     0.421741
colored_vessels    0.391724
slope              0.345877
thal               0.344029
sex                0.280937
age                0.225439
resting_bp         0.144931
restecg            0.137230
cholesteral        0.085239
fast_blood_sug     0.028046
Name: target, dtype: float64

# Random Forest

In [29]:
rfc = ensemble.RandomForestClassifier()
X = df.drop('target', axis=1)
y = df['target']

cv_rf = cross_val_score(rfc, X, y, cv=10)



In [30]:
cv_rf.mean()

0.8044048943270301

We can see that the model tends to overfit as the range of cross validation scores is high. 


# MLP

In [22]:
mlp = MLPClassifier(hidden_layer_sizes=(100,100,100))
mlp.fit(X, y)
cross_val_score(mlp, X, y, cv=10)



array([0.83870968, 0.61290323, 0.80645161, 0.87096774, 0.87096774,
       0.7       , 0.66666667, 0.76666667, 0.55172414, 0.75862069])

In [23]:
mlp = MLPClassifier(hidden_layer_sizes=(1000))
mlp.fit(X, y)
cross_val_score(mlp, X, y, cv=10)

array([0.70967742, 0.70967742, 0.74193548, 0.87096774, 0.64516129,
       0.66666667, 0.83333333, 0.86666667, 0.65517241, 0.72413793])

In [26]:
layers = [1000, (1000,1000), (1000,1000,1000)]
solvers =['lbfgs', 'sgd', 'adam']
learn_rate = ['constant', 'invscaling', 'adaptive']
tuned_parameters = [{'hidden_layer_sizes': layers, 'solver': solvers, 'learning_rate': learn_rate}]
n_folds = 5

clf = GridSearchCV(mlp, tuned_parameters, cv=n_folds, refit=False)
clf.fit(X, y)

clf.best_params_



{'hidden_layer_sizes': 1000, 'learning_rate': 'adaptive', 'solver': 'adam'}

In [27]:
mlp = MLPClassifier(hidden_layer_sizes=(1000), learning_rate='adaptive', solver='adam')
mlp.fit(X, y)
cross_val_score(mlp, X, y, cv=10)

array([0.90322581, 0.77419355, 0.80645161, 0.87096774, 0.87096774,
       0.56666667, 0.83333333, 0.9       , 0.68965517, 0.75862069])

As we can see even with the best hyperparameters found by gridsearch in this case the defaul random forest algorithm performs about the same and provides the results in a much shorter time. 

In [34]:
mlp_cv_scores = np.array([0.90322581, 0.77419355, 0.80645161, 0.87096774, 0.87096774,
       0.56666667, 0.83333333, 0.9       , 0.68965517, 0.75862069])

In [35]:
mlp_cv_scores.mean()

0.797408231