# Heart Failure Prediction Project

# Project Title : Machine Learning based Heart Failure Prediction using Principle Component Analysis and Gaussian Naive Bayes

# Group Members: Arif Ali and Mujahid Khan

To know more about the problem, data and the solutions, readers can refer to the following research papers that recenlty got published.

Paper No 1: L. Ali and S. Bukhari, “An approach based on mutually informed neural networks to optimize the generalization capabilities of decision support systems developed for heart failure prediction,” IRBM, 2020. [Online]. Available: http://www.sciencedirect.com/science/article/pii/S1959031820300828

Paper No 2: L. Ali, A. Niamat, J. A. Khan, N. A. Golilarz, X. Xingzhong, A. Noor, R. Nour, and S. A. C. Bukhari, “An optimized stacked support vector machines based expert system for the effective prediction of heart failure,” IEEE Access, vol. 7, pp. 54 007–54 014, 2019.

Paper No 3: L. Ali, A. Rahman, A. Khan, M. Zhou, A. Javeed, and J. A. Khan, “An automated diagnostic system for heart disease prediction based on Chi2 statistical model and optimally configured deep neural network,” IEEE Access, vol. 7, pp. 34 938–34 945, 2019.


Problem Statement:  Heart is a vital organ of human body and is responsible for pumping blood to other organs of the body. Heart failure (HF) is a serious disorder with high prevalence. HF is prevalent in developed countries at a rate of approximately 2% in the adult population and about 8% in older subjects. Moreover, literature shows that about 3-5% of hospitals admissions have connection with HF incidents. Moreover, HF diagnosis is very costly owing to the fact that in developed countries HF accounts for 2\% of the total health costs. Hence, development of non-invasive methods for HF detection based on machine learning and data mining will help improve quality of life and reduce the associated medical costs.

In [15]:

import numpy
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import matthews_corrcoef
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA


global Best_Acc
Best_Acc=0
import time
numpy.random.seed(7)

In [16]:

import pandas as pd
from io import StringIO
df = pd.read_csv("D:\/Processed_Clevland_Dataset_FromNet.txt")

#I Checked it. It is correct dataset
X = numpy.array(df)
print(X.shape)

(303, 14)


In [17]:
df_new = df.drop([87, 166, 192, 266, 287, 302], 0)
print(df_new.shape)

df_X= df_new.drop('label',1)
Y_df = df_new.iloc[:,13]
df_numpy = numpy.array(df_new)

X = numpy.delete(df_numpy, 13, 1)
print("Feature Vector Size=", X.shape)
Y = numpy.array(Y_df)
print(Y)
print(df.columns)
print(Y.sum())

(297, 14)
Feature Vector Size= (297, 13)
[0 2 1 0 0 0 3 0 2 1 0 0 2 0 0 0 1 0 0 0 0 0 1 3 4 0 0 0 0 3 0 2 1 0 0 0 3
 1 3 0 4 0 0 0 1 4 0 4 0 0 0 0 2 0 1 1 1 1 0 0 2 0 1 0 2 2 1 0 2 1 0 3 1 1
 1 0 1 0 0 3 0 0 0 3 0 0 0 0 0 0 3 0 0 0 1 2 3 0 0 0 0 0 0 3 0 2 1 2 3 1 1
 0 2 2 0 0 0 3 2 3 4 0 3 1 0 3 3 0 0 0 0 0 0 0 0 4 3 1 0 0 1 0 1 0 1 4 0 0
 0 0 0 0 4 3 1 1 1 2 0 0 4 0 0 0 0 0 1 0 3 0 1 0 4 1 0 1 0 0 3 2 0 0 1 0 0
 2 1 2 0 3 2 0 3 0 0 0 1 0 0 0 0 0 3 3 3 0 1 0 4 0 3 1 0 0 0 0 0 0 0 0 3 1
 0 0 0 3 2 0 2 1 0 0 3 2 1 0 0 0 0 0 2 0 2 2 1 3 0 0 1 0 0 0 0 0 0 0 1 0 3
 0 0 4 2 2 1 0 1 0 2 0 1 0 0 0 1 0 2 0 3 0 2 4 2 0 0 1 0 2 2 1 0 3 1 1 2 3
 1]
Index(['age', 'sex', 'chestpain', 'BP', 'SerumCholestrol', 'FBS',
       'RestingElectrocardiography', 'HeartRate', 'EIAngina', 'OldPeak',
       'SlopofST', 'flouroscorpy', 'thal', 'label'],
      dtype='object')
281


In [18]:
i = 0
while i<=296:
    if Y[i]!=0:
        Y[i]=1
        #print(i)
    i=i+1

In [19]:
def evaluation_GNB(X_FS, Y):
    X_train, X_test, Y_train, Y_test = train_test_split(X_FS, Y, test_size=0.3, random_state=42)
    global Best_Acc
    Best_Sen = 0
    Best_Spec = 0
    
    TP = 0 
    TP_train=0
    FN=0   
    FN_train=0
    FP=0   
    FP_train=0
    TN = 0 
    TN_train=0
    model =  GaussianNB()
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    Acc = accuracy_score(Y_test, Y_pred)*100

    ########For Sensitvity and specificity#########
    k=0
    while k<len(Y_pred):
        if Y_test[k]==Y_pred[k]==1:
            TP = TP+1
        if Y_test[k]==Y_pred[k]==0:
            TN = TN+1
        k=k+1
    FN = 41-TP
    FP = 49-TN
    Sensitivity = (TP/(TP+FN))*100
    Specificity = (TN/(TN+FP))*100
    if Acc>=Best_Acc:
        Best_Acc = Acc
        print("Best Test Acc ===========================================================", Acc)
        print("X_FS Size=", X_train.shape)
        print("Sensitivity =", Sensitivity)
        print("Specificity =, ", Specificity)
        print("MCC =====", matthews_corrcoef(Y_test, Y_pred))
        Y_pred = model.predict(X_train)
        Acc_train = accuracy_score(Y_train, Y_pred)*100
        print("Train Accuracy ===============================================", Acc_train)

        print("TP = ", TP)
        print("TN = ", TN)

In [20]:
from sklearn.feature_selection import SelectFromModel
Best_Acc=0
N = range(1, 14)
for n in N:
    pca = PCA(n_components=n, svd_solver='full')
    X_new = pca.fit_transform(X, Y)
    print("New Feature Vector Size  =", X_new.shape)
    #evaluation_RBFSVM(X_FS, Y)
    evaluation_GNB(X_new, Y)

New Feature Vector Size  = (297, 1)
X_FS Size= (207, 1)
Sensitivity = 26.82926829268293
Specificity =,  75.51020408163265
MCC ===== 0.026712010096692274
TP =  11
TN =  37
New Feature Vector Size  = (297, 2)
X_FS Size= (207, 2)
Sensitivity = 56.09756097560976
Specificity =,  87.75510204081633
MCC ===== 0.46732823779423044
TP =  23
TN =  43
New Feature Vector Size  = (297, 3)
X_FS Size= (207, 3)
Sensitivity = 60.97560975609756
Specificity =,  87.75510204081633
MCC ===== 0.5107235342397549
TP =  25
TN =  43
New Feature Vector Size  = (297, 4)
New Feature Vector Size  = (297, 5)
X_FS Size= (207, 5)
Sensitivity = 80.48780487804879
Specificity =,  85.71428571428571
MCC ===== 0.6635087809810993
TP =  33
TN =  42
New Feature Vector Size  = (297, 6)
X_FS Size= (207, 6)
Sensitivity = 85.36585365853658
Specificity =,  85.71428571428571
MCC ===== 0.7095662904602299
TP =  35
TN =  42
New Feature Vector Size  = (297, 7)
X_FS Size= (207, 7)
Sensitivity = 85.36585365853658
Specificity =,  85.714285714

# Building the Model with optimal components 

In [21]:
pca = PCA(n_components=12, svd_solver='full')
X_FS = pca.fit_transform(X, Y)
print("New Feature Vector Size  =", X_new.shape)

X_train, X_test, Y_train, Y_test = train_test_split(X_FS, Y, test_size=0.3, random_state=42)
global Best_Acc
Best_Sen = 0
Best_Spec = 0

TP = 0 
TP_train=0
FN=0   
FN_train=0
FP=0   
FP_train=0
TN = 0 
TN_train=0
model =  GaussianNB()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
Acc = accuracy_score(Y_test, Y_pred)*100

########For Sensitvity and specificity#########
k=0
while k<len(Y_pred):
    if Y_test[k]==Y_pred[k]==1:
        TP = TP+1
    if Y_test[k]==Y_pred[k]==0:
        TN = TN+1
    k=k+1
FN = 41-TP
FP = 49-TN
Sensitivity = (TP/(TP+FN))*100
Specificity = (TN/(TN+FP))*100
if Acc>=Best_Acc:
    Best_Acc = Acc
    print("Best Testing Acc ===========================================================", Acc)
    print("X_FS Size=", X_train.shape)
    print("Sensitivity =", Sensitivity)
    print("Specificity =, ", Specificity)
    print("MCC =====", matthews_corrcoef(Y_test, Y_pred))
    Y_pred = model.predict(X_train)
    Acc_train = accuracy_score(Y_train, Y_pred)*100
    print("Train Accuracy ===============================================", Acc_train)

    print("TP = ", TP)
    print("TN = ", TN)

New Feature Vector Size  = (297, 13)
X_FS Size= (207, 12)
Sensitivity = 82.92682926829268
Specificity =,  93.87755102040816
MCC ===== 0.7773867978556046
TP =  34
TN =  46
