# **DATA PREPROCESSING**

In [164]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [165]:
data=pd.read_csv('heart_failure_clinical_records_dataset.csv')
data

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,270,0
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,271,0
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,278,0
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,280,0


In [166]:
data.columns

Index(['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time',
       'DEATH_EVENT'],
      dtype='object')

In [167]:
data.drop(columns='time',axis=1,inplace=True)
data

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,0
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,0
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,0
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,0


In [130]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(9)
memory usage: 28.2 KB


In [131]:
data.dtypes

age                         float64
anaemia                       int64
creatinine_phosphokinase      int64
diabetes                      int64
ejection_fraction             int64
high_blood_pressure           int64
platelets                   float64
serum_creatinine            float64
serum_sodium                  int64
sex                           int64
smoking                       int64
DEATH_EVENT                   int64
dtype: object

## Prepare train and test dataset

In [132]:
#creating our X feature variables and y outcome variable
X = data.drop('DEATH_EVENT',axis=1)
y = data['DEATH_EVENT']

In [133]:
#splitting our data into training and testing data set
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2)

# **Random Forest Classification**

In [134]:
#Random Forest Classification
from sklearn.ensemble import RandomForestClassifier

clf=RandomForestClassifier()
clf.fit(X_train,y_train)


RandomForestClassifier()

In [135]:
#To predict occupancy using the test's set features
clf_pred=clf.predict(X_test)
print(clf_pred)

[0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0
 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 1 0 0]


In [136]:
# Transfer the array of predicted popularity of test data into dataframe
clf_pred_df = pd.DataFrame(data=clf_pred, columns=['DEATH_EVENT'])
clf_pred_df.head(10)

Unnamed: 0,DEATH_EVENT
0,0
1,0
2,1
3,0
4,1
5,1
6,1
7,0
8,0
9,0


In [137]:
RFprob_each = clf.predict_proba(X_test)
RFprob_each

array([[0.98, 0.02],
       [0.69, 0.31],
       [0.38, 0.62],
       [0.77, 0.23],
       [0.32, 0.68],
       [0.19, 0.81],
       [0.38, 0.62],
       [0.56, 0.44],
       [0.94, 0.06],
       [0.9 , 0.1 ],
       [0.95, 0.05],
       [0.87, 0.13],
       [0.55, 0.45],
       [0.47, 0.53],
       [0.97, 0.03],
       [0.71, 0.29],
       [0.94, 0.06],
       [0.8 , 0.2 ],
       [0.93, 0.07],
       [0.86, 0.14],
       [0.28, 0.72],
       [0.49, 0.51],
       [0.39, 0.61],
       [0.84, 0.16],
       [0.51, 0.49],
       [0.71, 0.29],
       [0.77, 0.23],
       [0.8 , 0.2 ],
       [0.75, 0.25],
       [0.88, 0.12],
       [0.78, 0.22],
       [0.46, 0.54],
       [0.78, 0.22],
       [0.69, 0.31],
       [0.46, 0.54],
       [0.9 , 0.1 ],
       [0.58, 0.42],
       [0.68, 0.32],
       [0.16, 0.84],
       [0.88, 0.12],
       [0.5 , 0.5 ],
       [0.49, 0.51],
       [0.76, 0.24],
       [0.97, 0.03],
       [0.82, 0.18],
       [0.33, 0.67],
       [0.88, 0.12],
       [0.97,

In [138]:
pos_prob = []
for inner in RFprob_each:
    pos_prob.append(inner[1])
pos_prob[0:10]


[0.02, 0.31, 0.62, 0.23, 0.68, 0.81, 0.62, 0.44, 0.06, 0.1]

In [139]:
X_testdf = X_test.copy()
X_testdf.insert(len(X_testdf.columns),'y_test DEATH_EVENT',y_test)
X_testdf=X_testdf.reset_index(drop=True)
X_testdf.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,y_test DEATH_EVENT
0,55.0,0,835,0,40,0,279000.0,0.7,140,1,1,0
1,65.0,0,118,0,50,0,194000.0,1.1,145,1,1,0
2,58.0,1,57,0,25,0,189000.0,1.3,132,1,1,0
3,60.0,1,315,1,60,0,454000.0,1.1,131,1,1,1
4,50.0,1,115,0,20,0,189000.0,0.8,139,1,0,0


In [140]:
# Insert result, check the prediction correctness
X_testdf.insert(len(X_testdf.columns), 'RForest', clf_pred_df['DEATH_EVENT'])
X_testdf

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,y_test DEATH_EVENT,RForest
0,55.0,0,835,0,40,0,279000.0,0.7,140,1,1,0,0
1,65.0,0,118,0,50,0,194000.0,1.1,145,1,1,0,0
2,58.0,1,57,0,25,0,189000.0,1.3,132,1,1,0,1
3,60.0,1,315,1,60,0,454000.0,1.1,131,1,1,1,0
4,50.0,1,115,0,20,0,189000.0,0.8,139,1,0,0,1
5,94.0,0,582,1,38,1,263358.03,1.83,134,1,0,1,1
6,75.0,0,99,0,38,1,224000.0,2.5,134,1,0,1,1
7,59.0,1,176,1,25,0,221000.0,1.0,136,1,1,1,0
8,51.0,0,582,1,40,0,221000.0,0.9,134,0,0,0,0
9,46.0,1,291,0,35,0,348000.0,0.9,140,0,0,0,0


# **Naive Bayes**

In [141]:
from sklearn.naive_bayes import GaussianNB

gaus = GaussianNB()
gaus.fit(X_train, y_train)

GaussianNB()

In [142]:
# To predict popularity using test set's features
NB_pred= gaus.predict(X_test)
print(NB_pred)

[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0]


In [143]:
# Transfer the array of predicted popularity of test data into dataframe
NB_pred_df = pd.DataFrame(data=NB_pred, columns=['DEATH_EVENT'])
NB_pred_df.head(10)

Unnamed: 0,DEATH_EVENT
0,0
1,0
2,0
3,0
4,0
5,1
6,0
7,0
8,0
9,0


In [144]:
NBprob_each = gaus.predict_proba(X_test)
NBprob_each

array([[9.27141636e-01, 7.28583640e-02],
       [9.01830461e-01, 9.81695389e-02],
       [7.31369057e-01, 2.68630943e-01],
       [9.01784120e-01, 9.82158804e-02],
       [7.97905132e-01, 2.02094868e-01],
       [3.65536070e-01, 6.34463930e-01],
       [7.28477473e-01, 2.71522527e-01],
       [8.10483709e-01, 1.89516291e-01],
       [9.15827645e-01, 8.41723545e-02],
       [9.26717212e-01, 7.32827884e-02],
       [9.22203841e-01, 7.77961594e-02],
       [8.24910308e-01, 1.75089692e-01],
       [7.83675963e-01, 2.16324037e-01],
       [8.30045943e-01, 1.69954057e-01],
       [9.32207445e-01, 6.77925552e-02],
       [8.87674442e-01, 1.12325558e-01],
       [9.31028601e-01, 6.89713988e-02],
       [9.14738225e-01, 8.52617746e-02],
       [8.96560263e-01, 1.03439737e-01],
       [8.40444839e-01, 1.59555161e-01],
       [7.83038758e-01, 2.16961242e-01],
       [9.24713299e-01, 7.52867008e-02],
       [4.72623848e-01, 5.27376152e-01],
       [9.10942244e-01, 8.90577562e-02],
       [8.879178

In [145]:
pos_prob = []
for inner in NBprob_each:
    pos_prob.append(inner[1])
pos_prob[0:10]


[0.07285836397583285,
 0.09816953892342328,
 0.26863094303821294,
 0.09821588044980206,
 0.20209486775614802,
 0.6344639300646935,
 0.2715225265748137,
 0.1895162911543821,
 0.08417235452856925,
 0.07328278840983249]

In [146]:
# Insert result, check the prediction correctness
X_testdf.insert(len(X_testdf.columns), 'NBGaus', NB_pred_df['DEATH_EVENT'])
X_testdf

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,y_test DEATH_EVENT,RForest,NBGaus
0,55.0,0,835,0,40,0,279000.0,0.7,140,1,1,0,0,0
1,65.0,0,118,0,50,0,194000.0,1.1,145,1,1,0,0,0
2,58.0,1,57,0,25,0,189000.0,1.3,132,1,1,0,1,0
3,60.0,1,315,1,60,0,454000.0,1.1,131,1,1,1,0,0
4,50.0,1,115,0,20,0,189000.0,0.8,139,1,0,0,1,0
5,94.0,0,582,1,38,1,263358.03,1.83,134,1,0,1,1,1
6,75.0,0,99,0,38,1,224000.0,2.5,134,1,0,1,1,0
7,59.0,1,176,1,25,0,221000.0,1.0,136,1,1,1,0,0
8,51.0,0,582,1,40,0,221000.0,0.9,134,0,0,0,0,0
9,46.0,1,291,0,35,0,348000.0,0.9,140,0,0,0,0,0


# **KNN Classification**

In [147]:
from sklearn.neighbors import KNeighborsClassifier 

knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=7)

In [148]:
# To predict popularity using test set's features
KNN_pred= knn.predict(X_test)
print(KNN_pred)

[0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0]


In [149]:
# Transfer the array of predicted popularity of test data into dataframe
KNpredct_df = pd.DataFrame(data=KNN_pred, columns=['DEATH_EVENT'])
KNpredct_df.head(10)

Unnamed: 0,DEATH_EVENT
0,0
1,1
2,1
3,0
4,1
5,1
6,0
7,0
8,0
9,0


In [150]:
KNprob_each = knn.predict_proba(X_test)
KNprob_each

array([[1.        , 0.        ],
       [0.14285714, 0.85714286],
       [0.42857143, 0.57142857],
       [0.71428571, 0.28571429],
       [0.42857143, 0.57142857],
       [0.42857143, 0.57142857],
       [0.57142857, 0.42857143],
       [0.85714286, 0.14285714],
       [0.85714286, 0.14285714],
       [0.57142857, 0.42857143],
       [0.85714286, 0.14285714],
       [0.57142857, 0.42857143],
       [0.85714286, 0.14285714],
       [0.57142857, 0.42857143],
       [0.71428571, 0.28571429],
       [0.57142857, 0.42857143],
       [0.71428571, 0.28571429],
       [0.71428571, 0.28571429],
       [0.85714286, 0.14285714],
       [0.57142857, 0.42857143],
       [0.57142857, 0.42857143],
       [0.71428571, 0.28571429],
       [0.28571429, 0.71428571],
       [0.71428571, 0.28571429],
       [1.        , 0.        ],
       [0.42857143, 0.57142857],
       [1.        , 0.        ],
       [0.57142857, 0.42857143],
       [0.71428571, 0.28571429],
       [0.71428571, 0.28571429],
       [0.

In [151]:
pos_prob = []
for inner in KNprob_each:
    pos_prob.append(inner[1])
pos_prob[0:10]

[0.0,
 0.8571428571428571,
 0.5714285714285714,
 0.2857142857142857,
 0.5714285714285714,
 0.5714285714285714,
 0.42857142857142855,
 0.14285714285714285,
 0.14285714285714285,
 0.42857142857142855]

In [152]:
# Insert result, check the prediction correctness
X_testdf.insert(len(X_testdf.columns), 'KNeigh', KNpredct_df['DEATH_EVENT'])
X_testdf


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,y_test DEATH_EVENT,RForest,NBGaus,KNeigh
0,55.0,0,835,0,40,0,279000.0,0.7,140,1,1,0,0,0,0
1,65.0,0,118,0,50,0,194000.0,1.1,145,1,1,0,0,0,1
2,58.0,1,57,0,25,0,189000.0,1.3,132,1,1,0,1,0,1
3,60.0,1,315,1,60,0,454000.0,1.1,131,1,1,1,0,0,0
4,50.0,1,115,0,20,0,189000.0,0.8,139,1,0,0,1,0,1
5,94.0,0,582,1,38,1,263358.03,1.83,134,1,0,1,1,1,1
6,75.0,0,99,0,38,1,224000.0,2.5,134,1,0,1,1,0,0
7,59.0,1,176,1,25,0,221000.0,1.0,136,1,1,1,0,0,0
8,51.0,0,582,1,40,0,221000.0,0.9,134,0,0,0,0,0,0
9,46.0,1,291,0,35,0,348000.0,0.9,140,0,0,0,0,0,0


# **Decision Tree**

In [153]:
from sklearn import tree
model = tree.DecisionTreeClassifier()
model.fit(X_train,y_train)

DecisionTreeClassifier()

In [154]:
#To predict occupancy using the test's set features
model_pred=model.predict(X_test)
print(model_pred)

[0 1 1 0 1 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 1
 0 1 0 1 0 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 1 0 1]


In [155]:
# Transfer the array of predicted popularity of test data into dataframe
model_pred_df = pd.DataFrame(data=model_pred, columns=['DEATH_EVENT'])
model_pred_df.head(10)

Unnamed: 0,DEATH_EVENT
0,0
1,1
2,1
3,0
4,1
5,0
6,1
7,1
8,0
9,1


In [156]:
DTprob_each = model.predict_proba(X_test)
DTprob_each

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.

In [157]:
pos_prob = []
for inner in DTprob_each:
    pos_prob.append(inner[1])
pos_prob[0:10]


[0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0]

In [158]:
# Insert result, check the prediction correctness
X_testdf.insert(len(X_testdf.columns), 'DecisionT', model_pred_df['DEATH_EVENT'])
X_testdf

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,y_test DEATH_EVENT,RForest,NBGaus,KNeigh,DecisionT
0,55.0,0,835,0,40,0,279000.0,0.7,140,1,1,0,0,0,0,0
1,65.0,0,118,0,50,0,194000.0,1.1,145,1,1,0,0,0,1,1
2,58.0,1,57,0,25,0,189000.0,1.3,132,1,1,0,1,0,1,1
3,60.0,1,315,1,60,0,454000.0,1.1,131,1,1,1,0,0,0,0
4,50.0,1,115,0,20,0,189000.0,0.8,139,1,0,0,1,0,1,1
5,94.0,0,582,1,38,1,263358.03,1.83,134,1,0,1,1,1,1,0
6,75.0,0,99,0,38,1,224000.0,2.5,134,1,0,1,1,0,0,1
7,59.0,1,176,1,25,0,221000.0,1.0,136,1,1,1,0,0,0,1
8,51.0,0,582,1,40,0,221000.0,0.9,134,0,0,0,0,0,0,0
9,46.0,1,291,0,35,0,348000.0,0.9,140,0,0,0,0,0,0,1


In [159]:
data

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,0
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,0
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,0
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,0


# **Evaluation of Classifiers**

In [160]:
#Import relevant metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [161]:
#RANDOM FOREST
acRF=accuracy_score(y_test,clf_pred)
fscoreRF=f1_score(y_test,clf_pred)
conmatrixRF = confusion_matrix(y_test,clf_pred)

#NAIVE BAYES
acNB=accuracy_score(y_test,NB_pred)
fscoreNB = f1_score(y_test,NB_pred)
conmatrixNB = confusion_matrix(y_test,NB_pred)

#K NEIGHBOURS
acKNN=accuracy_score(y_test,KNN_pred)
fscoreKNN=f1_score(y_test,KNN_pred)
conmatrixKNN = confusion_matrix(y_test,KNN_pred)

#DECISION TREE
acDTree = accuracy_score(y_test,model_pred)
fscoreDTree = f1_score(y_test,model_pred)
conmatrixDTree = confusion_matrix(y_test,model_pred)


In [162]:
df = pd.DataFrame(
    {
        "KNeigh": [acKNN, fscoreKNN, conmatrixKNN],
        "Naive Bayes": [acNB, fscoreNB, conmatrixNB],
        "Random Forest": [acRF, fscoreRF, conmatrixRF],
        "Decision Tree": [acDTree, fscoreDTree, conmatrixDTree]
    },
    columns=["KNeigh", "Naive Bayes", "Random Forest", "Decision Tree"],
    index = ["Accuracy", "Fscore", "Confusion Matrix"])
df



Unnamed: 0,KNeigh,Naive Bayes,Random Forest,Decision Tree
Accuracy,0.65,0.75,0.733333,0.65
Fscore,0.222222,0.4,0.529412,0.461538
Confusion Matrix,"[[36, 7], [14, 3]]","[[40, 3], [12, 5]]","[[35, 8], [8, 9]]","[[30, 13], [8, 9]]"
