In [100]:
# import required libraries
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [22]:
# load and read data
dataset = pd.read_csv('heart.csv')
dataset.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [23]:
# shape of dataset
dataset.shape

(918, 12)

In [3]:
# # find outliers in Age column
# def outliers(x):
#     age_outliers = []
#     # first find min and max values in the column
#     min = np.min(x)
#     max = np.max(x)
    
#     # find q1, q2, q3
#     q1 = np.percentile(x, 25)
#     q2 = np.percentile(x, 50)
#     q3 = np.percentile(x, 75)
    
#     # calculate interquartile range
#     iqr = q3 - q1
    
#     lower_bound = q1 - 1.5*iqr
#     upper_bound = q3 + 1.5*iqr
    
#     for i in x:
#         if ((i < lower_bound) or (i > upper_bound)):
#             age_outliers.append(i)
    
#     if (len(age_outliers) != 0):
#         print("Outlier are: ", age_outliers)
#     else:
#         print("There are no outliers!")

In [30]:
def outliers(x):
    # Calculate quartiles and IQR
    q1 = np.percentile(x, 25)
    q3 = np.percentile(x, 75)
    iqr = q3 - q1

    # Calculate lower and upper bounds for outliers
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Create a mask for outliers
    mask = (x < lower_bound) | (x > upper_bound)

    return mask

In [31]:
# now call outliers function to detect any outliers in Age column
age_outliers = outliers(dataset['Age'])
age_outliers

0      False
1      False
2      False
3      False
4      False
       ...  
913    False
914    False
915    False
916    False
917    False
Name: Age, Length: 918, dtype: bool

In [35]:
# check any outliers in RestingBP
rest_outliers = outliers(dataset['RestingBP'])

resting = dataset[~rest_outliers]
resting

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [36]:
# check outliers cholestrol column
cholestrol = outliers(resting['Cholesterol'])

choles = resting[~cholestrol]
choles

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [37]:
# find outliers in FastingBS
fasting = outliers(choles['FastingBS'])

fast = choles[~fasting]
fast

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
912,57,F,ASY,140,241,0,Normal,123,Y,0.2,Flat,1
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [38]:
# find outliers in MaxHr
maxhr = outliers(fast['MaxHR'])

max = fast[~maxhr]

max

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
912,57,F,ASY,140,241,0,Normal,123,Y,0.2,Flat,1
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [41]:
# outliers in Oldpeak
oldpeak = outliers(max['Oldpeak'])

old = max[~oldpeak]

old

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
912,57,F,ASY,140,241,0,Normal,123,Y,0.2,Flat,1
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [78]:
final_dataset = old.copy()
final_dataset.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [79]:
# find null values in final_dataset
final_dataset.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [80]:
# find dtypes in dataset
dataset.dtypes

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

In [81]:
# find unique values in Sex
final_dataset.Sex.value_counts()

Sex
M    435
F    152
Name: count, dtype: int64

In [82]:
# encode sex column
encoder = LabelEncoder()

final_dataset['Sex'] = encoder.fit_transform(final_dataset['Sex'])
final_dataset.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,0,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,1,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,0,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,1,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [83]:
# find unique values in ChestPainType
final_dataset.ChestPainType.value_counts()

ChestPainType
ASY    277
ATA    144
NAP    137
TA      29
Name: count, dtype: int64

In [84]:
# Apply one hot encoding for ChestPainType
final_dataset = pd.get_dummies(final_dataset, columns = ['ChestPainType'], dtype = int)
final_dataset.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,40,1,140,289,0,Normal,172,N,0.0,Up,0,0,1,0,0
1,49,0,160,180,0,Normal,156,N,1.0,Flat,1,0,0,1,0
2,37,1,130,283,0,ST,98,N,0.0,Up,0,0,1,0,0
3,48,0,138,214,0,Normal,108,Y,1.5,Flat,1,1,0,0,0
4,54,1,150,195,0,Normal,122,N,0.0,Up,0,0,0,1,0


In [85]:
# unique values in resting ecg
final_dataset.RestingECG.value_counts()

RestingECG
Normal    374
LVH       131
ST         82
Name: count, dtype: int64

In [86]:
# get dummies for RestingECG
final_dataset = pd.get_dummies(final_dataset, columns = ['RestingECG'], dtype = int)
final_dataset.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST
0,40,1,140,289,0,172,N,0.0,Up,0,0,1,0,0,0,1,0
1,49,0,160,180,0,156,N,1.0,Flat,1,0,0,1,0,0,1,0
2,37,1,130,283,0,98,N,0.0,Up,0,0,1,0,0,0,0,1
3,48,0,138,214,0,108,Y,1.5,Flat,1,1,0,0,0,0,1,0
4,54,1,150,195,0,122,N,0.0,Up,0,0,0,1,0,0,1,0


In [87]:
# unique values in ExerciseAngina
final_dataset.ExerciseAngina.value_counts()

ExerciseAngina
N    381
Y    206
Name: count, dtype: int64

In [88]:
# Encode ExerciseAngina
final_dataset['ExerciseAngina'] = encoder.fit_transform(final_dataset['ExerciseAngina'])
final_dataset.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST
0,40,1,140,289,0,172,0,0.0,Up,0,0,1,0,0,0,1,0
1,49,0,160,180,0,156,0,1.0,Flat,1,0,0,1,0,0,1,0
2,37,1,130,283,0,98,0,0.0,Up,0,0,1,0,0,0,0,1
3,48,0,138,214,0,108,1,1.5,Flat,1,1,0,0,0,0,1,0
4,54,1,150,195,0,122,0,0.0,Up,0,0,0,1,0,0,1,0


In [89]:
# unique values in St Slope
final_dataset.ST_Slope.value_counts()

ST_Slope
Up      302
Flat    262
Down     23
Name: count, dtype: int64

In [90]:
# get dummies for ST_Slope
final_dataset = pd.get_dummies(final_dataset, columns = ['ST_Slope'], dtype = int)
final_dataset.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,1,140,289,0,172,0,0.0,0,0,1,0,0,0,1,0,0,0,1
1,49,0,160,180,0,156,0,1.0,1,0,0,1,0,0,1,0,0,1,0
2,37,1,130,283,0,98,0,0.0,0,0,1,0,0,0,0,1,0,0,1
3,48,0,138,214,0,108,1,1.5,1,1,0,0,0,0,1,0,0,1,0
4,54,1,150,195,0,122,0,0.0,0,0,0,1,0,0,1,0,0,0,1


In [92]:
# split data into dependent and independent sets
x = final_dataset.drop(columns = ['HeartDisease'], axis = 1)
y = final_dataset['HeartDisease']
y.head()

0    0
1    1
2    0
3    1
4    0
Name: HeartDisease, dtype: int64

In [93]:
# split dataset into training and testing
train_size = int(len(final_dataset)*0.85)

# data for x
x_train = x.iloc[ : train_size, : ]
x_test = x.iloc[train_size: , : ]

# data for y
y_train = y[ : train_size]
y_test = y[train_size : ]


print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(498, 18)
(498,)
(89, 18)
(89,)


In [98]:
# apply logistic Regression
lr = LogisticRegression(max_iter = 5000)

lr.fit(x_train, y_train)

In [99]:
y_pred = lr.predict(x_test)
y_pred

array([0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       0], dtype=int64)

In [101]:
# find accuracy
accuracy_score(y_test, y_pred)

0.8089887640449438

In [None]:
# now lets apply PCA