In [300]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade



In [301]:
#import dependencies

import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
# Configure seaborn's global plotting defaults: "whitegrid" style, colour
# codes enabled, and a font scale of 1.3.
sns.set(style="whitegrid", color_codes=True, font_scale=1.3)

In [None]:
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine
from site_key import pw

In [None]:
# Open a SQLAlchemy connection to the local Postgres database "heart_failure".
# The password is read from the local site_key module (imported as pw).
# NOTE(review): pw is interpolated into the URL as-is; a password containing
# characters such as "@" or "/" would presumably need URL-encoding - confirm.
engine = create_engine(
    f'postgresql://postgres:{pw}@localhost:5432/heart_failure'
)
connection = engine.connect()

In [302]:
# Load every row of the heart_failure table into a DataFrame and preview it.
# (Offline alternative: heart = pd.read_csv('Resources/heart.csv'))
query = 'select * from heart_failure'
heart = pd.read_sql(query, connection)
heart.head(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [303]:
# Count the missing values in each column
null_counts = heart.isnull().sum()
null_counts

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [304]:
# Give the abbreviated clinical columns more descriptive names.
# Only columns whose name actually changes are listed; every other column
# (ChestPainType, Cholesterol, ExerciseAngina, Oldpeak, ST_Slope, HeartDisease)
# keeps its source name. In particular the source column is spelled "Oldpeak"
# (lower-case p) and later cells reference it under that exact name.
# Reassigning instead of using inplace=True keeps the cell safe to re-run.
heart = heart.rename(columns={
    'Age': 'PatientAge',
    'Sex': 'Gender',
    'RestingBP': 'BloodPressure',
    'FastingBS': 'BloodSugar',
    'RestingECG': 'Electrocardiogram',
    'MaxHR': 'MaxHeartRate',
})

In [305]:
# Transform Objects - VS
def Gender_Num(Gender):
    """Encode a gender label as an integer.

    Returns 1 when ``Gender`` equals "F" and 0 for every other value.
    """
    return 1 if Gender == "F" else 0


def ChestPainType_Num(ChestPainType):
    """Encode a chest-pain-type label as an integer.

    "ASY" -> 0, "ATA" -> 1, "NAP" -> 2; every other value (expected: "TA") -> 3.
    """
    # The position of a label in this tuple is its integer code.
    for code, label in enumerate(("ASY", "ATA", "NAP")):
        if ChestPainType == label:
            return code
    # Fallback bucket, intended for "TA"
    return 3


def Electrocardiogram_Num(Electrocardiogram):
    """Encode a resting-ECG label as an integer.

    "Normal" -> 0, "ST" -> 1; every other value (expected: "LVH") -> 2.
    """
    # The position of a label in this tuple is its integer code.
    for code, label in enumerate(("Normal", "ST")):
        if Electrocardiogram == label:
            return code
    # Fallback bucket, intended for "LVH"
    return 2


def ExerciseAngina_Num(ExerciseAngina):
    """Encode the exercise-angina flag as an integer.

    Returns 1 when ``ExerciseAngina`` equals "Y" and 0 for every other value
    (expected: "N").
    """
    has_angina = ExerciseAngina == "Y"
    return 1 if has_angina else 0


def ST_Slope_Num(ST_Slope):
    """Encode the ST-slope label as a signed integer.

    "Down" -> -1, "Flat" -> 0; every other value (expected: "Up") -> 1.
    """
    for label, code in (("Down", -1), ("Flat", 0)):
        if ST_Slope == label:
            return code
    # Fallback bucket, intended for "Up"
    return 1


# Replace each categorical text column with its integer code.
# Every encoder sends unrecognised values to its fallback code, so feeding it
# the already-encoded integers (e.g. when this cell is run a second time)
# would silently collapse the whole column into that fallback. Columns that
# are already numeric are therefore skipped, which makes the cell re-runnable.
encoders = {
    "Gender": Gender_Num,
    "ChestPainType": ChestPainType_Num,
    "Electrocardiogram": Electrocardiogram_Num,
    "ExerciseAngina": ExerciseAngina_Num,
    "ST_Slope": ST_Slope_Num,
}
for column, encoder in encoders.items():
    if not pd.api.types.is_numeric_dtype(heart[column]):
        heart[column] = heart[column].apply(encoder)

heart.head()

Unnamed: 0,PatientAge,Gender,ChestPainType,BloodPressure,Cholesterol,BloodSugar,Electrocardiogram,MaxHeartRate,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,1,140,289,0,0,172,0,0.0,1,0
1,49,1,2,160,180,0,0,156,0,1.0,0,1
2,37,0,1,130,283,0,1,98,0,0.0,1,0
3,48,1,0,138,214,0,0,108,1,1.5,0,1
4,54,0,2,150,195,0,0,122,0,0.0,1,0


In [306]:
# Show the dtype of each column - VS
heart.dtypes

PatientAge             int64
Gender                 int64
ChestPainType          int64
BloodPressure          int64
Cholesterol            int64
BloodSugar             int64
Electrocardiogram      int64
MaxHeartRate           int64
ExerciseAngina         int64
Oldpeak              float64
ST_Slope               int64
HeartDisease           int64
dtype: object

In [307]:
# Summary statistics per column, to check how the data is distributed
heart.describe()

Unnamed: 0,PatientAge,Gender,ChestPainType,BloodPressure,Cholesterol,BloodSugar,Electrocardiogram,MaxHeartRate,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,0.21024,0.781046,132.396514,198.799564,0.233115,0.603486,136.809368,0.404139,0.887364,0.361656,0.553377
std,9.432617,0.407701,0.956519,18.514154,109.384145,0.423046,0.805968,25.460334,0.490992,1.06657,0.607056,0.497414
min,28.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,-2.6,-1.0,0.0
25%,47.0,0.0,0.0,120.0,173.25,0.0,0.0,120.0,0.0,0.0,0.0,0.0
50%,54.0,0.0,0.0,130.0,223.0,0.0,0.0,138.0,0.0,0.6,0.0,1.0
75%,60.0,0.0,2.0,140.0,267.0,0.0,1.0,156.0,1.0,1.5,1.0,1.0
max,77.0,1.0,3.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,1.0,1.0


In [308]:
# Strip plot of MaxHeartRate against ExerciseAngina, with one facet column and
# one colour per HeartDisease value.
fig = px.strip(
    heart,
    x="ExerciseAngina",
    y="MaxHeartRate",
    facet_col="HeartDisease",
    color="HeartDisease",
)
fig.show()

Patients diagnosed with heart disease tend to have lower maximum heart rates and are more likely to experience exercise-induced angina (chest pain during exercise).

In [309]:
# Strip plot of MaxHeartRate against Oldpeak, with one facet column per
# ST_Slope value and points coloured by HeartDisease.
fig = px.strip(
    heart,
    x="Oldpeak",
    y="MaxHeartRate",
    facet_col="ST_Slope",
    color="HeartDisease",
)
fig.show()

Patients diagnosed with heart disease most often show abnormal Oldpeak readings on an ECG (Oldpeak > 0) and a flat ST segment slope (ST_Slope = 0).

In [310]:
# Share of NON-heart-disease cases, expressed as a fraction between 0 and 1
no_disease_mask = heart["HeartDisease"] == 0
data_0 = no_disease_mask.sum() / heart["HeartDisease"].count()
data_0

0.4466230936819172

In [311]:
# Share of heart-disease cases, expressed as a fraction between 0 and 1
disease_mask = heart["HeartDisease"] == 1
data_1 = disease_mask.sum() / heart["HeartDisease"].count()
data_1

0.5533769063180828

In [312]:
# Separate the dependent feature (the target) from the independent features
target = 'HeartDisease'
X = heart.drop(columns=[target])
y = heart[target]

# Hold out 20% of the rows for testing (80:20 split); the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=4
)
X_train.head()


Unnamed: 0,PatientAge,Gender,ChestPainType,BloodPressure,Cholesterol,BloodSugar,Electrocardiogram,MaxHeartRate,ExerciseAngina,Oldpeak,ST_Slope
425,60,0,1,160,267,1,1,157,0,0.5,0
89,55,0,0,140,229,0,0,110,1,0.5,0
777,55,0,1,130,262,0,0,155,0,0.0,1
239,48,0,0,160,193,0,0,102,1,3.0,0
889,59,0,3,134,204,0,0,162,0,0.8,1


In [313]:
# Create the StandardScaler that will be used to standardize the features - VS
scaler = StandardScaler()

In [314]:
# Fit the scaler on the training split only (X_test is not seen here) - VS
scaler.fit(X_train)

StandardScaler()

## Implementing the KNN Model

In [315]:
# Scale both splits with the same scaler. - VS
# The scaler was fitted on X_train only, so X_test is transformed with the
# training-set statistics. - VS
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [316]:
# Fit a K-nearest-neighbours classifier on the scaled training data and
# predict the labels of the scaled test data. - VS
# n_neighbors = 3, 6, 7, 8 returned the highest accuracy
# NOTE(review): if those values were compared on the test split, the reported
# accuracy is optimistic - confirm how they were chosen.
KNN = KNeighborsClassifier(n_neighbors=3).fit(X_train_scaled, y_train)
y_pred = KNN.predict(X_test_scaled)

In [317]:
# Assess the accuracy score - VS
from sklearn.metrics import accuracy_score

# accuracy_score takes (y_true, y_pred); the score is computed once and kept
# in KNN_Score so it can be reused by later cells.
KNN_Score = accuracy_score(y_test, y_pred)
print(f'K-Nearest Neighbour Accuracy: {KNN_Score * 100:.2f}%')

K-Nearest Neighbour Accuracy: 88.04%


In [318]:
# Detailed evaluation of the KNN predictions on the test split: confusion
# matrix, per-class precision/recall/F1 report, and overall accuracy.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Compute all three metrics first ...
result = confusion_matrix(y_test, y_pred)
result1 = classification_report(y_test, y_pred)
result2 = accuracy_score(y_test, y_pred)

# ... then print them in a fixed order.
print("Confusion Matrix:")
print(result)
print("Classification Report:")
print(result1)
print("Accuracy:", result2)

Confusion Matrix:
[[69 19]
 [ 3 93]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.78      0.86        88
           1       0.83      0.97      0.89        96

    accuracy                           0.88       184
   macro avg       0.89      0.88      0.88       184
weighted avg       0.89      0.88      0.88       184

Accuracy: 0.8804347826086957
