
# Introduction
This notebook makes use of various Machine Learning Models for the problem of predicting heart disease of a person.

This dataset gives a number of variables along with a target condition of having or not having heart disease.

### Importing Libraries

In [2]:
# linear algebra
import numpy as np 

# data processing
import pandas as pd 

# data visualization
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style

# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB

### Importing the Data

In [3]:
heart_data = pd.read_csv("D:\Heart Attack Detection\Data\heart.csv")

Now let's take a look at the data

In [4]:
#Fetching the first 10 rows
heart_data.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1


### Data Description

In [None]:
heart_data.age

In [7]:
The meanings of some of the columns are unclear, here is what the mean

0      63
1      37
2      41
3      56
4      57
       ..
298    57
299    45
300    68
301    57
302    57
Name: age, Length: 303, dtype: int64

The meanings of some of the columns are unclear, here is what the mean

- **age**: The person's age in years
- **sex**: The person's sex (1 = male, 0 = female)
- **cp:** The chest pain experienced (Value 1: typical angina, Value 2: atypical angina, Value 3: non-anginal pain, Value 4: asymptomatic)
- **trestbps:** The person's resting blood pressure (mm Hg on admission to the hospital)
- **chol:** The person's cholesterol measurement in mg/dl
- **fbs:** The person's fasting blood sugar  (> 120 mg/dl, 1 = true; 0 = false) 
- **restecg:** Resting electrocardiographic measurement (0 = normal, 1 = having ST-T wave abnormality, 2 = showing probable or definite left ventricular hypertrophy by Estes' criteria)
- **thalach:** The person's maximum heart rate achieved
- **exang:** Exercise induced angina (1 = yes; 0 = no)
- **oldpeak:** ST depression induced by exercise relative to rest ('ST' relates to positions on the ECG plot. See more [here](https://litfl.com/st-segment-ecg-library/))
- **slope:** the slope of the peak exercise ST segment (Value 1: upsloping, Value 2: flat, Value 3: downsloping)
- **ca:** The number of major vessels (0-3)
- **thal:** A blood disorder called thalassemia (3 = normal; 6 = fixed defect; 7 = reversable defect)
- **target:** Heart disease (0 = no, 1 = yes)

Let's change the column names to be a bit clearer

In [5]:
heart_data.columns = ['age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'cholesterol', 'fasting_blood_sugar', 'rest_ecg', 'max_heart_rate_achieved',
       'exercise_induced_angina', 'st_depression', 'st_slope', 'num_major_vessels', 'thalassemia', 'target']

# The Models

## Now we split the data to feed into the Models

In [44]:
# Labels are the values we want to predict
labels = np.array(heart_data['target'])
# Remove the labels from the features
# axis 1 refers to the columns
features= heart_data.drop('target', axis = 1)
# Saving feature names for later use
feature_list = list(heart_data.columns)
# Convert to numpy array
features = np.array(features)

In [45]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

#### Preview the data

In [50]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (227, 13)
Training Labels Shape: (227,)
Testing Features Shape: (76, 13)
Testing Labels Shape: (76,)


## Random Forest Classifier

In [65]:
rf_model = RandomForestClassifier(max_depth=5,n_estimators=100,random_state=0)
rf_model.fit(train_features, train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [67]:
rf_predict = rf_model.predict(test_features)
rf_pred_quant = rf_model.predict_proba(test_features)[:, 1]
rf_pred_bin = rf_model.predict(test_features)

#### Finding out the Accuracy of the Random Forest Classifier

Diagnostic tests are often sold, marketed, cited and used with **sensitivity** and **specificity** as the headline metrics. Sensitivity and specificity are defined as,

\begin{align}
Sensitivity = \frac{True\:Positives}{True\:Positives + False\:Negatives}
\end{align}

\begin{align}
Specificity = \frac{True\:Negatives}{True\:Negatives + False\:Positives}
\end{align}

In [69]:
from sklearn.metrics import confusion_matrix #for model evaluation
confusion_matrix = confusion_matrix(test_labels, rf_pred_bin)
confusion_matrix

array([[27,  8],
       [ 4, 37]], dtype=int64)

In [70]:
total=sum(sum(confusion_matrix))

sensitivity = confusion_matrix[0,0]/(confusion_matrix[0,0]+confusion_matrix[1,0])
print('Sensitivity : ', sensitivity )

specificity = confusion_matrix[1,1]/(confusion_matrix[1,1]+confusion_matrix[0,1])
print('Specificity : ', specificity)

Sensitivity :  0.8709677419354839
Specificity :  0.8222222222222222


## Stochastic gradient Descent Classifier

In [78]:
sgd_model = linear_model.SGDClassifier(max_iter=5, tol=None)
sgd_model.fit(train_features, train_labels)
sgd_predict = sgd_model.predict(test_features)

In [79]:
sgd_predict = sgd_model.predict(test_features)
#sgd_pred_quant = sgd_model.predict_proba(test_features)[:, 1]
sgd_pred_bin = sgd_model.predict(test_features)

In [80]:
from sklearn.metrics import confusion_matrix #for model evaluation
confusion_matrix = confusion_matrix(test_labels, sgd_pred_bin)
confusion_matrix

array([[15, 20],
       [ 3, 38]], dtype=int64)

In [81]:
total=sum(sum(confusion_matrix))

sensitivity = confusion_matrix[0,0]/(confusion_matrix[0,0]+confusion_matrix[1,0])
print('Sensitivity : ', sensitivity )

specificity = confusion_matrix[1,1]/(confusion_matrix[1,1]+confusion_matrix[0,1])
print('Specificity : ', specificity)

Sensitivity :  0.8333333333333334
Specificity :  0.6551724137931034


## Logistic Regression

In [85]:
logreg_model = LogisticRegression()
logreg_model.fit(train_features, train_labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [87]:
logreg_predict = logreg_model.predict(test_features)
logreg_pred_quant = logreg_model.predict_proba(test_features)[:, 1]
logreg_pred_bin = logreg_model.predict(test_features)

In [94]:
from sklearn.metrics import confusion_matrix #for model evaluation
confusion_matrix = confusion_matrix(test_labels, logreg_pred_bin)
confusion_matrix

array([[29,  6],
       [ 4, 37]], dtype=int64)

In [89]:
total=sum(sum(confusion_matrix))

sensitivity = confusion_matrix[0,0]/(confusion_matrix[0,0]+confusion_matrix[1,0])
print('Sensitivity : ', sensitivity )

specificity = confusion_matrix[1,1]/(confusion_matrix[1,1]+confusion_matrix[0,1])
print('Specificity : ', specificity)

Sensitivity :  0.8787878787878788
Specificity :  0.8604651162790697


## K Nearest Neighbors

In [91]:
#KNN 
knn_model = KNeighborsClassifier(n_neighbors = 3) 
knn_model.fit(train_features, train_labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [96]:
knn_predict = knn_model.predict(test_features)
knn_pred_bin = knn_model.predict(test_features)

In [98]:
from sklearn.metrics import confusion_matrix #for model evaluation
confusion_matrix = confusion_matrix(test_labels, knn_pred_bin)
confusion_matrix

array([[24, 11],
       [14, 27]], dtype=int64)

In [99]:
total=sum(sum(confusion_matrix))

sensitivity = confusion_matrix[0,0]/(confusion_matrix[0,0]+confusion_matrix[1,0])
print('Sensitivity : ', sensitivity )

specificity = confusion_matrix[1,1]/(confusion_matrix[1,1]+confusion_matrix[0,1])
print('Specificity : ', specificity)

Sensitivity :  0.631578947368421
Specificity :  0.7105263157894737


## Gaussian Naive Bayes

In [100]:
gaussian_model = GaussianNB() 
gaussian_model.fit(train_features, train_labels)

GaussianNB(priors=None, var_smoothing=1e-09)

In [101]:
gaussian_predict = gaussian_model.predict(test_features)
gaussian_pred_bin = gaussian_model.predict(test_features)

In [103]:
from sklearn.metrics import confusion_matrix #for model evaluation
confusion_matrix = confusion_matrix(test_labels, gaussian_pred_bin)
confusion_matrix

array([[31,  4],
       [ 5, 36]], dtype=int64)

In [104]:
total=sum(sum(confusion_matrix))

sensitivity = confusion_matrix[0,0]/(confusion_matrix[0,0]+confusion_matrix[1,0])
print('Sensitivity : ', sensitivity )

specificity = confusion_matrix[1,1]/(confusion_matrix[1,1]+confusion_matrix[0,1])
print('Specificity : ', specificity)

Sensitivity :  0.8611111111111112
Specificity :  0.9


## Perceptron

In [105]:
perceptron_model = Perceptron(max_iter=5)
perceptron_model.fit(train_features, train_labels)



Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=5, n_iter_no_change=5, n_jobs=None,
           penalty=None, random_state=0, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)

In [106]:
perceptron_predict = perceptron_model.predict(test_features)
perceptron_pred_bin = perceptron_model.predict(test_features)

In [107]:
from sklearn.metrics import confusion_matrix #for model evaluation
confusion_matrix = confusion_matrix(test_labels, perceptron_pred_bin)
confusion_matrix

array([[14, 21],
       [ 3, 38]], dtype=int64)

In [108]:
total=sum(sum(confusion_matrix))

sensitivity = confusion_matrix[0,0]/(confusion_matrix[0,0]+confusion_matrix[1,0])
print('Sensitivity : ', sensitivity )

specificity = confusion_matrix[1,1]/(confusion_matrix[1,1]+confusion_matrix[0,1])
print('Specificity : ', specificity)

Sensitivity :  0.8235294117647058
Specificity :  0.6440677966101694


# Storing all the models

In [109]:
import joblib

In [110]:
joblib.dump(rf_model, "rf_model.pkl")
joblib.dump(sgd_model, "sgd_model.pkl")
joblib.dump(logreg_model, "logreg_model.pkl")
joblib.dump(knn_model, "knn_model.pkl")
joblib.dump(gaussian_model, "gaussian_model.pkl")
joblib.dump(perceptron_model, "perceptron_model.pkl")

['perceptron_model.pkl']