In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm

In [2]:
dt = pd.read_csv("heart.csv")
dt.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [4]:
dt.shape

(918, 12)

In [5]:
dt.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


Removing Outliers using Z Score

RestingBP
Cholesterol
MaxHR
Oldpeak

z score of value x = (x - mean)/std

In [6]:
dt["RestingBP_ZScore"] = (dt["RestingBP"]-dt["RestingBP"].mean())/dt["RestingBP"].std()
dt = dt[(dt["RestingBP_ZScore"] < 3) & (dt["RestingBP_ZScore"] > -3)]

dt["Cholesterol_ZScore"] = (dt["Cholesterol"]-dt["Cholesterol"].mean())/dt["Cholesterol"].std()
dt = dt[(dt["Cholesterol_ZScore"] < 3) & (dt["Cholesterol_ZScore"] > -3)]

dt["MaxHR_ZScore"] = (dt["MaxHR"]-dt["MaxHR"].mean())/dt["MaxHR"].std()
dt = dt[(dt["MaxHR_ZScore"] < 3) & (dt["MaxHR_ZScore"] > -3)]

dt["Oldpeak_ZScore"] = (dt["Oldpeak"]-dt["Oldpeak"].mean())/dt["Oldpeak"].std()
dt = dt[(dt["Oldpeak_ZScore"] < 3) & (dt["Oldpeak_ZScore"] > -3)]

In [7]:
dt.shape

(899, 16)

In [8]:
columns = ["RestingBP_ZScore", 'Cholesterol_ZScore', 'MaxHR_ZScore', 'Oldpeak_ZScore']
dt.drop(columns, inplace=True, axis=1)
dtc = dt.copy()
dtc.head(7)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
5,39,M,NAP,120,339,0,Normal,170,N,0.0,Up,0
6,45,F,ATA,130,237,0,Normal,170,N,0.0,Up,0


Converting Text Columns

ChestPainType - OneHot

RestingECG - Label

ExerciseAngina - Label

ST_Slope - Label

In [9]:
print(dtc["ChestPainType"].unique())
print(dtc["RestingECG"].unique())
print(dtc["ExerciseAngina"].unique())
print(dtc["ST_Slope"].unique())

['ATA' 'NAP' 'ASY' 'TA']
['Normal' 'ST' 'LVH']
['N' 'Y']
['Up' 'Flat' 'Down']


In [10]:
dtco = pd.get_dummies(dtc, columns = ['ChestPainType', 'Sex'], drop_first = True) 
dtco

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,Sex_M
0,40,140,289,0,Normal,172,N,0.0,Up,0,True,False,False,True
1,49,160,180,0,Normal,156,N,1.0,Flat,1,False,True,False,False
2,37,130,283,0,ST,98,N,0.0,Up,0,True,False,False,True
3,48,138,214,0,Normal,108,Y,1.5,Flat,1,False,False,False,False
4,54,150,195,0,Normal,122,N,0.0,Up,0,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,Normal,132,N,1.2,Flat,1,False,False,True,True
914,68,144,193,1,Normal,141,N,3.4,Flat,1,False,False,False,True
915,57,130,131,0,Normal,115,Y,1.2,Flat,1,False,False,False,True
916,57,130,236,0,LVH,174,N,0.0,Flat,1,True,False,False,False


In [11]:
dtco["RestingECG"].replace({'Normal': 1, 'ST': 2, 'LVH': 3}, inplace=True)
dtco["RestingECG"].unique()

array([1, 2, 3], dtype=int64)

In [12]:
dtco["ExerciseAngina"].replace({'Y': 1, 'N': 0}, inplace=True)
dtco["ExerciseAngina"].unique()

array([0, 1], dtype=int64)

In [13]:
dtco["ST_Slope"].replace({'Down': 1, 'Flat': 2, 'Up': 3}, inplace=True)
dtco["ST_Slope"].unique()

array([3, 2, 1], dtype=int64)

In [14]:
x = dtco.drop("HeartDisease", axis='columns')

y = dtco.HeartDisease

In [15]:
scaler = StandardScaler()
model = scaler.fit(x)
sx = model.transform(x)

sx

array([[-1.42815446,  0.46590022,  0.84963584, ..., -0.5349047 ,
        -0.22955001,  0.515943  ],
       [-0.47585532,  1.63471366, -0.16812204, ...,  1.86949191,
        -0.22955001, -1.93819859],
       [-1.7455875 , -0.1185065 ,  0.79361247, ..., -0.5349047 ,
        -0.22955001,  0.515943  ],
       ...,
       [ 0.3706328 , -0.1185065 , -0.62564622, ..., -0.5349047 ,
        -0.22955001,  0.515943  ],
       [ 0.3706328 , -0.1185065 ,  0.35476274, ..., -0.5349047 ,
        -0.22955001, -1.93819859],
       [-1.63977649,  0.34901888, -0.21480818, ...,  1.86949191,
        -0.22955001,  0.515943  ]])

In [16]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(sx, y, test_size=0.3,random_state=109)

In [17]:
#Create a svm Classifier
clf = svm.SVC(kernel='linear')
#Train the model using the training sets
svmodel = clf.fit(X_train, y_train)
#Predict the response for test dataset
y_pred = clf.predict(X_test)
#Accuracy
print("Accuracy of This Model:", svmodel.score(X_test, y_test))

Accuracy of This Model: 0.8777777777777778


In [18]:
# new = pd.DataFrame(y_pred, y_test)
# new

In [19]:
# creating a RF classifier
clf = RandomForestClassifier(n_estimators = 100)
# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
clf.fit(X_train, y_train)
# performing predictions on the test dataset
y_pred = clf.predict(X_test)
# metrics are used to find accuracy or error
from sklearn import metrics 
# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL:", metrics.accuracy_score(y_test, y_pred))

ACCURACY OF THE MODEL: 0.8740740740740741


In [20]:
# Train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)
# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 86.30%


Principal Component Analysis

In [21]:
#pca = PCA(n_components = 10)
pca  = PCA(0.85)

# X_train = pca.fit_transform(X_train)
# X_test = pca.transform(X_test)

# explained_variance =  pca.explained_variance_ratio_
# print(explained_variance)

In [22]:
x_pca = pca.fit_transform(sx)

In [23]:
# Split dataset into training set and test set
X_train_pca, X_test_pca, y_train, y_test = train_test_split(x_pca, y, test_size=0.3,random_state=109)

In [24]:
#Create a svm Classifier
clf = svm.SVC(kernel='linear')
#Train the model using the training sets
svmodel = clf.fit(X_train_pca, y_train)
#Predict the response for test dataset
y_pred = clf.predict(X_test_pca)
#Accuracy
print("Accuracy of This Model:", svmodel.score(X_test_pca, y_test))

Accuracy of This Model: 0.8518518518518519


In [25]:
# creating a RF classifier
clf = RandomForestClassifier(n_estimators = 100)
# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
clf.fit(X_train_pca, y_train)
# performing predictions on the test dataset
y_pred = clf.predict(X_test_pca)
# metrics are used to find accuracy or error
from sklearn import metrics 
# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL:", metrics.accuracy_score(y_test, y_pred))

ACCURACY OF THE MODEL: 0.8333333333333334


In [26]:
# Train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train_pca, y_train)
# Evaluate the model
y_pred = model.predict(X_test_pca)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 85.19%
