In [65]:
import numpy as np
import pandas as pd
import seaborn as sns
import math
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

In [66]:
df = pd.read_csv('heart_disease.csv')

In [67]:
df.isnull().sum()

age          0
sex          0
cp          10
trestbps     0
chol         0
fbs          0
restecg      0
thalach      0
exang        0
oldpeak      0
slope        0
ca           0
thal         0
target       0
dtype: int64

##### As the coulmn cp (chest pain) has missing values, we need to impute the data. 

###### The data is numeric and hence mean stratergy will be a suitable choice.

In [68]:

from sklearn.impute import SimpleImputer
import numpy as np

# Create the imputer object with the desired strategy
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Fit and transform the data
df_imputed = imputer.fit_transform(df)

In [69]:
df = pd.DataFrame(df_imputed)

In [70]:
df=df.rename(columns={0: 'age', 1:'sex', 2:'cp', 3:'trestbps',4: 'chol',5: 'fbs',6: 'restecg',7: 'thalach',8: 'exang',9: 'oldpeak',10: 'slope',11: 'ca',12: 'thal',13:'target'})

In [71]:
x = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [72]:
# We need to find the most important features in the dataset

In [73]:
model= RandomForestClassifier(n_estimators=100,random_state=42)
model.fit(x,y)
pd.Series(model.feature_importances_,index=x.columns).sort_values(ascending=False)

cp          0.138583
thalach     0.122994
ca          0.107847
thal        0.106682
oldpeak     0.106565
age         0.092637
chol        0.078386
trestbps    0.072970
exang       0.057930
slope       0.055453
sex         0.031262
restecg     0.020414
fbs         0.008276
dtype: float64

In [74]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

In [75]:
# For 'cp' column, it records chest pain type. Number 3 mean no chest pain, number 0-2 means different tyoe of angina. 

# To simplify it, I will group the number0-2 together as disease positive, number 3 as disease negative

In [76]:
number=[0,1,2]
for col in df.itertuples():
    if col.cp in number:
        df['cp'].replace(to_replace=col.cp, value=1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['cp'].replace(to_replace=col.cp, value=1, inplace=True)


## Testing the accuracy when the top 8 features are used for fitting

In [77]:
df_top8 = df.loc[:,['cp','oldpeak','thal','ca','thalach','age','chol','trestbps','exang']]
x_train,x_test,y_train,y_test = train_test_split(df_top8,y,test_size=0.25,random_state=42)

In [78]:

clf = RandomForestClassifier(random_state=42)
clf.fit(x_train,y_train)
prediction = clf.predict(x_test)
accuracy = accuracy_score(prediction,y_test)
cm = confusion_matrix(prediction,y_test)
prfs = precision_recall_fscore_support(prediction,y_test)
print('Accuracy: ',accuracy)
print('\n')
print('Confusion Matrix: \n',cm)
print('\n')
print('Precision: ', prfs[0])
print('Recall:    ', prfs[1])
print('Fscore:    ', prfs[2])
print('Support:   ', prfs[3])

Accuracy:  0.7631578947368421


Confusion Matrix: 
 [[27 10]
 [ 8 31]]


Precision:  [0.77142857 0.75609756]
Recall:     [0.72972973 0.79487179]
Fscore:     [0.75  0.775]
Support:    [37 39]


In [79]:
# Took 40 secs
maxim = 0
n_estimators=0
max_depth=0
max_cm=0
max_prfs=0
max_features=0
for i in range(5,15):
    for j in range(5,15):
        for k in range(5,13):
            x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=42)
            clf = RandomForestClassifier(n_estimators=i,max_depth=j,max_features=k)
            clf.fit(x_train,y_train)
            prediction = clf.predict(x_test)
            accuracy = accuracy_score(prediction,y_test)
            cm = confusion_matrix(prediction,y_test)
            prfs = precision_recall_fscore_support(prediction,y_test)
            if accuracy > maxim:
                maxim = accuracy
                n_estimators = i
                max_depth = j
                max_features = k
                max_cm = cm
                max_prfs=prfs
                
print(str(i)+" "+str(j)+" "+str(k)+" "+str(maxim))
print('\n')
print('Confusion Matrix: ',cm)
print('\n')
print('Precision: ', prfs[0])
print('Recall:    ', prfs[1])
print('Fscore:    ', prfs[2])
print('Support:   ', prfs[3])

14 14 12 0.868421052631579


Confusion Matrix:  [[27 10]
 [ 8 31]]


Precision:  [0.77142857 0.75609756]
Recall:     [0.72972973 0.79487179]
Fscore:     [0.75  0.775]
Support:    [37 39]


### Let's test if standardization can improve the accuracy

In [61]:
from sklearn.preprocessing import StandardScaler

In [62]:
x = df.iloc[:,:-1]
x_std = StandardScaler().fit_transform(x)

In [64]:
maxim = 0
n_estimators=0
max_depth=0
max_features=0
for i in range(5,15):
    for j in range(5,15):
        for k in range(5,13):
            x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=42)
            clf = RandomForestClassifier(n_estimators=i,max_depth=j,max_features=k)
            clf.fit(x_train,y_train)
            prediction = clf.predict(x_test)
            accuracy = accuracy_score(prediction,y_test)
            if accuracy > maxim:
                maxim = accuracy
                n_estimators = i
                max_depth = j
                max_features = k
print(str(i)+" "+str(j)+" "+str(k)+" "+str(maxim))

14 14 12 0.8552631578947368


In [22]:
# Not that much difference with standardiaztion


### Random forest classifier helped in achieving accuracy upto 85.5 % which is very good cosidering the size and quality of data