Heart Disease Prediction by Aaron Xu

**I. Importing Libraries**

In [None]:
# Numerical, tabular and plotting libraries used throughout the notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the cell output.
%matplotlib inline

# Show what is in the working directory; heart.csv is later read from it by relative path.
import os
print(os.listdir())

# NOTE(review): this silences every warning for the rest of the session, including
# deprecation and convergence warnings raised by the plotting and modelling cells below.
import warnings
warnings.filterwarnings('ignore')

**II. Looking at and Understanding the Dataset**

In [None]:
# Load the table from heart.csv in the current working directory.
dataset = pd.read_csv("heart.csv")

Checking that pandas loaded the dataset as a `DataFrame`

In [None]:
# Display the Python type of the object returned by pd.read_csv.
type(dataset)

Shape of dataset

In [None]:
# (number of rows, number of columns) of the loaded table.
dataset.shape

Printing out a sample of the dataset

In [None]:
# First five rows of the table.
dataset.head(5)

In [None]:
# Five randomly drawn rows; no random_state is given, so the rows differ between runs.
dataset.sample(5)

Description

In [None]:
# Summary statistics for the columns of the table.
dataset.describe()

In [None]:
# Column names, dtypes and non-null counts.
dataset.info()

Understanding what our data represents

In [None]:
# Human-readable description of each feature column, in the same order as the
# first len(info) columns of `dataset`.
info = [
    "age",
    "1: male, 0: female",
    "chest pain type, 1: typical angina, 2: atypical angina, 3: non-anginal pain, 4: asymptomatic",
    "resting blood pressure",
    " serum cholestoral in mg/dl",
    "fasting blood sugar > 120 mg/dl",
    "resting electrocardiographic results (values 0,1,2)",
    " maximum heart rate achieved",
    "exercise induced angina",
    "oldpeak = ST depression induced by exercise relative to rest",
    "the slope of the peak exercise ST segment",
    "number of major vessels (0-3) colored by flourosopy",
    "thal: 3 = normal; 6 = fixed defect; 7 = reversable defect",
]

# Print "<column name>:<tabs><description>" for every described column.
for position, description in enumerate(info):
    print(dataset.columns[position] + ":\t\t\t" + description)

Analysing the 'target' variable

In [None]:
# Summary statistics of the 'target' column.
dataset["target"].describe()

In [None]:
# Distinct values present in the 'target' column.
dataset["target"].unique()

Because the target takes only the values 0 and 1, this is a binary classification problem.

Checking correlation between columns

In [None]:
# Absolute correlation of every column with 'target', strongest first.
target_corr = dataset.corr()["target"]
ranked_corr = target_corr.abs().sort_values(ascending = False)
print(ranked_corr)

**Exploratory Data Analysis (EDA)**

Analysing the target variable

In [None]:
# Class balance of the label: a count plot plus the raw counts per class.
y = dataset["target"]

# Pass the series by keyword: seaborn 0.12+ treats the first positional argument
# as `data`, not as the variable to count.
sns.countplot(x = y)

target_temp = dataset.target.value_counts()
print(target_temp)

In [None]:
# Share of each class, computed from the actual number of rows instead of a
# hard-coded 303 so the figures stay correct if the dataset changes.
total_patients = target_temp.sum()
print("Percentage of patients without heart problems: " + str(round(target_temp[0] * 100 / total_patients, 2)))
print("Percentage of patients with heart problems: " + str(round(target_temp[1] * 100 / total_patients, 2)))

**III. Analysing the other features: 'sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', and 'thal'**

In [None]:
# Distinct values of the 'sex' column.
dataset["sex"].unique()

In [None]:
# Mean of the target within each 'sex' category.
# x/y are passed by keyword because seaborn 0.12+ rejects them positionally.
sns.barplot(x = dataset["sex"], y = y)

In [None]:
# Distinct values of the 'cp' column.
dataset["cp"].unique()

In [None]:
# Mean of the target within each 'cp' category.
# x/y are passed by keyword because seaborn 0.12+ rejects them positionally.
sns.barplot(x = dataset["cp"], y = y)

In [None]:
# Summary statistics of the 'fbs' column.
dataset["fbs"].describe()

In [None]:
# Distinct values of the 'fbs' column.
dataset["fbs"].unique()

In [None]:
# Mean of the target within each 'fbs' category.
# x/y are passed by keyword because seaborn 0.12+ rejects them positionally.
sns.barplot(x = dataset["fbs"], y = y)

In [None]:
# Distinct values of the 'restecg' column.
dataset["restecg"].unique()

In [None]:
# Mean of the target within each 'restecg' category.
# x/y are passed by keyword because seaborn 0.12+ rejects them positionally.
sns.barplot(x = dataset["restecg"], y = y)

In [None]:
# Distinct values of the 'exang' column.
dataset["exang"].unique()

In [None]:
# Mean of the target within each 'exang' category.
# The original closed the parenthesis before `y`, so the target was never passed
# to the plot and the cell evaluated to a throw-away (Axes, y) tuple.
sns.barplot(x = dataset["exang"], y = y)

In [None]:
# Distinct values of the 'slope' column.
dataset["slope"].unique()

In [None]:
# Mean of the target within each 'slope' category.
# x/y are passed by keyword because seaborn 0.12+ rejects them positionally.
sns.barplot(x = dataset["slope"], y = y)

In [None]:
# Distinct values of the 'ca' column.
dataset["ca"].unique()

In [None]:
# Number of rows per 'ca' value.
# Passed by keyword: seaborn 0.12+ treats the first positional argument as `data`.
sns.countplot(x = dataset["ca"])

In [None]:
# Mean of the target within each 'ca' category.
# x/y are passed by keyword because seaborn 0.12+ rejects them positionally.
sns.barplot(x = dataset["ca"], y = y)

In [None]:
# Distinct values of the 'thal' column.
dataset["thal"].unique()

In [None]:
# Mean of the target within each 'thal' category.
# x/y are passed by keyword because seaborn 0.12+ rejects them positionally.
sns.barplot(x = dataset["thal"], y = y)

In [None]:
# Distribution of 'thal' as a density-normalised histogram with a KDE overlay.
# sns.distplot is deprecated and scheduled for removal; histplot with these
# arguments is the replacement suggested by seaborn's migration guide.
sns.histplot(x = dataset["thal"], kde = True, stat = "density", kde_kws = {"cut": 3})

**IV. Train Test Split**

In [None]:
from sklearn.model_selection import train_test_split

# Separate the label column from the feature columns.
target = dataset["target"]
predictors = dataset.drop(columns = ["target"])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    predictors,
    target,
    test_size = 0.20,
    random_state = 0,
)

In [None]:
# (rows, feature columns) of the training features.
X_train.shape

In [None]:
# (rows, feature columns) of the held-out test features.
X_test.shape

In [None]:
# Number of training labels.
Y_train.shape

In [None]:
# Number of held-out test labels.
Y_test.shape

**V. Model Fitting**

In [None]:
from sklearn.metrics import accuracy_score

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# The default max_iter of 100 can stop the solver before it converges on these
# unscaled features, and the notebook's global warnings filter would hide the
# resulting ConvergenceWarning. Give the optimiser more iterations.
lr = LogisticRegression(max_iter = 1000)
lr.fit(X_train, Y_train)

# Predicted class labels for the held-out rows.
Y_pred_lr = lr.predict(X_test)

In [None]:
# Shape of the array returned by lr.predict(X_test).
Y_pred_lr.shape

In [None]:
# accuracy_score's signature is (y_true, y_pred), so the ground truth goes first.
# The value is a percentage rounded to two decimals.
score_lr = round(accuracy_score(Y_test, Y_pred_lr) * 100, 2)
print(f"The accuracy score achieved using Logistic Regression is: {score_lr} %")

Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes on the training split (fit returns the estimator),
# then predict labels for the held-out rows.
nb = GaussianNB().fit(X_train, Y_train)
Y_pred_nb = nb.predict(X_test)

In [None]:
# Shape of the array returned by nb.predict(X_test).
Y_pred_nb.shape

In [None]:
# accuracy_score's signature is (y_true, y_pred), so the ground truth goes first.
# The value is a percentage rounded to two decimals.
score_nb = round(accuracy_score(Y_test, Y_pred_nb) * 100, 2)
print(f"The accuracy score achieved using Naive Bayes is {score_nb} %")

SVM

In [None]:
from sklearn import svm

# Linear-kernel support vector classifier: fit on the training split
# (fit returns the estimator), then predict labels for the held-out rows.
sv = svm.SVC(kernel = 'linear').fit(X_train, Y_train)
Y_pred_svm = sv.predict(X_test)

In [None]:
# Shape of the array returned by sv.predict(X_test).
Y_pred_svm.shape

In [None]:
# accuracy_score's signature is (y_true, y_pred), so the ground truth goes first.
# The value is a percentage rounded to two decimals.
score_svm = round(accuracy_score(Y_test, Y_pred_svm) * 100, 2)
print(f"The accuracy score achieved using Linear SVM is: {score_svm} %")

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-Nearest Neighbors with k = 7: fit on the training split
# (fit returns the estimator), then predict labels for the held-out rows.
knn = KNeighborsClassifier(n_neighbors = 7).fit(X_train, Y_train)
Y_pred_knn = knn.predict(X_test)

In [None]:
# Shape of the array returned by knn.predict(X_test).
Y_pred_knn.shape

In [None]:
# accuracy_score's signature is (y_true, y_pred), so the ground truth goes first.
# The value is a percentage rounded to two decimals.
score_knn = round(accuracy_score(Y_test, Y_pred_knn) * 100, 2)
print(f"The accuracy score achieved using KNN is: {score_knn} %")

Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Try 200 random seeds and keep the one whose tree scores best on the test split.
# NOTE(review): the seed is selected using X_test/Y_test, so any accuracy later
# reported for this model on the same test split is optimistically biased.
max_accuracy = 0
best_x = 0  # defined up front so the refit below can never hit an unbound name

for x in range(200):
    dt = DecisionTreeClassifier(random_state = x)
    dt.fit(X_train, Y_train)
    Y_pred_dt = dt.predict(X_test)
    # accuracy_score's signature is (y_true, y_pred).
    current_accuracy = round(accuracy_score(Y_test, Y_pred_dt) * 100, 2)
    if current_accuracy > max_accuracy:
        max_accuracy = current_accuracy
        best_x = x

# Refit with the winning seed so `dt` and `Y_pred_dt` correspond to the best run,
# not to the last seed tried.
dt = DecisionTreeClassifier(random_state = best_x)
dt.fit(X_train, Y_train)
Y_pred_dt = dt.predict(X_test)

In [None]:
# Shape of the array returned by dt.predict(X_test).
print(Y_pred_dt.shape)

In [None]:
# accuracy_score's signature is (y_true, y_pred), so the ground truth goes first.
# The value is a percentage rounded to two decimals.
score_dt = round(accuracy_score(Y_test, Y_pred_dt) * 100, 2)
print(f"The accuracy score achieved using Decision Tree is: {score_dt} %")

Random Forest

In [None]:
# The class is RandomForestClassifier; the original import misspelled it
# ("RandomForstClassifier"), which raises ImportError on a fresh kernel.
from sklearn.ensemble import RandomForestClassifier

# Try 2000 random seeds and keep the one whose forest scores best on the test split.
# NOTE(review): the seed is selected using X_test/Y_test, so any accuracy later
# reported for this model on the same test split is optimistically biased.
# This loop fits 2000 forests and is slow.
max_accuracy = 0
best_x = 0  # reset here so a value left over from the decision-tree cell is never reused

for x in range(2000):
    rf = RandomForestClassifier(random_state = x)
    rf.fit(X_train, Y_train)
    Y_pred_rf = rf.predict(X_test)
    # accuracy_score's signature is (y_true, y_pred).
    current_accuracy = round(accuracy_score(Y_test, Y_pred_rf) * 100, 2)
    if current_accuracy > max_accuracy:
        max_accuracy = current_accuracy
        best_x = x

# Refit with the winning seed so `rf` and `Y_pred_rf` correspond to the best run,
# not to the last seed tried.
rf = RandomForestClassifier(random_state = best_x)
rf.fit(X_train, Y_train)
Y_pred_rf = rf.predict(X_test)

In [None]:
# Shape of the array returned by rf.predict(X_test).
Y_pred_rf.shape

In [None]:
# accuracy_score's signature is (y_true, y_pred), so the ground truth goes first.
# The value is a percentage rounded to two decimals.
score_rf = round(accuracy_score(Y_test, Y_pred_rf) * 100, 2)
print(f"The accuracy score achieved using Random Forest is: {score_rf} %")

XGBoost

In [None]:
import xgboost as xgb

# Gradient-boosted classifier with a logistic objective and a fixed seed:
# fit on the training split (fit returns the estimator), then predict labels
# for the held-out rows.
xgb_model = xgb.XGBClassifier(
    objective = "binary:logistic",
    random_state = 42,
).fit(X_train, Y_train)

Y_pred_xgb = xgb_model.predict(X_test)

In [None]:
# Shape of the array returned by xgb_model.predict(X_test).
Y_pred_xgb.shape

In [None]:
# accuracy_score's signature is (y_true, y_pred), so the ground truth goes first.
# The value is a percentage rounded to two decimals.
score_xgb = round(accuracy_score(Y_test, Y_pred_xgb) * 100, 2)
print(f"The accuracy score achieved using XGBoost is: {score_xgb} %")

Neural Network

In [None]:
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Small feed-forward network: one hidden ReLU layer of 11 units feeding a
# single sigmoid unit, trained with binary cross-entropy.
# The input width is read from X_train instead of being hard-coded to 13, so the
# model still builds if feature columns are added or removed.
# NOTE(review): no random seed is set for Keras, so results vary between runs.
model = Sequential()
model.add(Dense(11, activation = 'relu', input_dim = X_train.shape[1]))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
# Train for 300 epochs on the training split.
# NOTE(review): Keras presumably logs every epoch here; pass verbose=0 if the output is too long.
model.fit(X_train, Y_train, epochs = 300)

In [None]:
# Raw network outputs for the held-out rows (sigmoid activations, not yet class labels).
Y_pred_nn = model.predict(X_test)

In [None]:
# Shape of the raw prediction array returned by model.predict(X_test).
Y_pred_nn.shape

In [None]:
# Turn the sigmoid outputs into hard 0/1 labels.
# The original list comprehension indexed x[0] on every element, so re-running
# the cell after Y_pred_nn had already been replaced by scalars raised an error.
# This version accepts both the raw (n, 1) array and the already-thresholded
# 1-D labels, so the cell can be re-run safely.
nn_outputs = np.asarray(Y_pred_nn)
if nn_outputs.ndim > 1:
    nn_outputs = nn_outputs[:, 0]  # same element the original picked with x[0]

# For values in [0, 1], round(p) == 1 exactly when p > 0.5 (round(0.5) is 0).
rounded = (nn_outputs > 0.5).astype(int).tolist()
Y_pred_nn = rounded

In [None]:
# accuracy_score's signature is (y_true, y_pred), so the ground truth goes first.
# The value is a percentage rounded to two decimals.
score_nn = round(accuracy_score(Y_test, Y_pred_nn) * 100, 2)
print(f"The accuracy score achieved using Neural Network is: {score_nn} %")

**VI. Output of Accuracy Scores**

In [None]:
# Accuracy percentages and the matching model names, kept in the same order.
scores = [
    score_lr,
    score_nb,
    score_svm,
    score_knn,
    score_dt,
    score_rf,
    score_xgb,
    score_nn,
]
algorithms = [
    "Logistic Regression",
    "Naive Bayes",
    "Support Vector Machine",
    "K-Nearest Neighbors",
    "Decision Tree",
    "Random Forest",
    "XGBoost",
    "Neural Network",
]

# One summary line per model.
for algorithm_name, algorithm_score in zip(algorithms, scores):
    print(f"The accuracy score achieved using {algorithm_name} is: {algorithm_score} %")

In [None]:
# Bar chart comparing the accuracy of every model.
sns.set(rc = {'figure.figsize':(15, 8)})

# x/y are passed by keyword because seaborn 0.12+ rejects them positionally.
ax = sns.barplot(x = algorithms, y = scores)

# Label the axes on the Axes that was actually drawn, after plotting, so the
# labels cannot be overwritten or end up on a different figure.
ax.set_xlabel("Algorithms")
ax.set_ylabel("Accuracy score")