In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing Libraries and Loading the Dataset

In [2]:
# Importing all necessary Libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [3]:
#Loading the dataset
heart_df=pd.read_csv("/content/drive/My Drive/Akshata_Singh_project/heart.csv")
heart_df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [4]:
#Displaying 5 random samples
heart_df.sample(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
905,67,M,NAP,152,212,0,LVH,150,N,0.8,Flat,1
675,57,M,NAP,150,126,1,Normal,173,N,0.2,Up,0
692,39,F,NAP,94,199,0,Normal,179,N,0.0,Up,0
891,66,F,NAP,146,278,0,LVH,152,N,0.0,Flat,0
360,62,M,NAP,160,0,0,Normal,72,Y,0.0,Flat,1


In [5]:
# Getting dataset summary
heart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [6]:
# Getting dataset statistics
heart_df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [7]:
heart_df.describe(include="all")

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,918.0,918,918,918.0,918.0,918.0,918,918.0,918,918.0,918,918.0
unique,,2,4,,,,3,,2,,3,
top,,M,ASY,,,,Normal,,N,,Flat,
freq,,725,496,,,,552,,547,,460,
mean,53.510893,,,132.396514,198.799564,0.233115,,136.809368,,0.887364,,0.553377
std,9.432617,,,18.514154,109.384145,0.423046,,25.460334,,1.06657,,0.497414
min,28.0,,,0.0,0.0,0.0,,60.0,,-2.6,,0.0
25%,47.0,,,120.0,173.25,0.0,,120.0,,0.0,,0.0
50%,54.0,,,130.0,223.0,0.0,,138.0,,0.6,,1.0
75%,60.0,,,140.0,267.0,0.0,,156.0,,1.5,,1.0


# Data Preprocessing

In [8]:
# Checking the null values
heart_df.isnull().sum()

Unnamed: 0,0
Age,0
Sex,0
ChestPainType,0
RestingBP,0
Cholesterol,0
FastingBS,0
RestingECG,0
MaxHR,0
ExerciseAngina,0
Oldpeak,0


In [9]:
# Checking for duplicates
heart_df.duplicated().sum()

np.int64(0)

In [10]:
# Checking number of unique values in each feature
heart_df.nunique()

Unnamed: 0,0
Age,50
Sex,2
ChestPainType,4
RestingBP,67
Cholesterol,222
FastingBS,2
RestingECG,3
MaxHR,119
ExerciseAngina,2
Oldpeak,53


In [11]:
cat_col=heart_df.select_dtypes(include='object').columns

In [12]:
heart_df['ChestPainType'].unique()

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

# Converting Categorical Variables to Numeric



*   Sex : M=0, F=1
*   ChestPainType : ATA = 0, NAP = 1, ASY = 2, TA = 3

*   RestingECG : Normal=0, ST=1, LVH=2
*   ExerciseAngina : N=0, Y=1

*   ST_Slope : Up = 0, Flat = 1, Down = 2











In [13]:
for col in cat_col:
  print(col)
  print((heart_df[col].unique()), list(range(heart_df[col].nunique())))
  heart_df[col].replace((heart_df[col].unique()),range(heart_df[col].nunique()),inplace=True)
  print('*'*90)
  print()

Sex
['M' 'F'] [0, 1]
******************************************************************************************

ChestPainType
['ATA' 'NAP' 'ASY' 'TA'] [0, 1, 2, 3]
******************************************************************************************

RestingECG
['Normal' 'ST' 'LVH'] [0, 1, 2]
******************************************************************************************

ExerciseAngina
['N' 'Y'] [0, 1]
******************************************************************************************

ST_Slope
['Up' 'Flat' 'Down'] [0, 1, 2]
******************************************************************************************



In [14]:
heart_df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,0,140,289,0,0,172,0,0.0,0,0
1,49,1,1,160,180,0,0,156,0,1.0,1,1
2,37,0,0,130,283,0,1,98,0,0.0,0,0
3,48,1,2,138,214,0,0,108,1,1.5,1,1
4,54,0,1,150,195,0,0,122,0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,0,3,110,264,0,0,132,0,1.2,1,1
914,68,0,2,144,193,1,0,141,0,3.4,1,1
915,57,0,2,130,131,0,0,115,1,1.2,1,1
916,57,1,0,130,236,0,2,174,0,0.0,1,1


In [15]:
heart_df['Cholesterol'].value_counts()

Unnamed: 0_level_0,count
Cholesterol,Unnamed: 1_level_1
0,172
254,11
220,10
223,10
204,9
...,...
353,1
278,1
157,1
176,1


Cholestrol value cannot be zero. It states that reading value was not properly noted.

# Imputing the 0 values in cholestrol column with KNN Imputer

In [16]:
np.nan

nan

In [17]:
heart_df['Cholesterol'].replace(0,np.nan,inplace=True)

In [18]:
from sklearn.impute import KNNImputer
imputer=KNNImputer(n_neighbors=3)
after_impute=imputer.fit_transform(heart_df)
heart_df=pd.DataFrame(after_impute,columns=heart_df.columns)

In [19]:
heart_df['Cholesterol'].isna().sum()

np.int64(0)

# Doing the same for Resting Blood Pressure

In [20]:
heart_df['RestingBP'][heart_df['RestingBP']==0]

Unnamed: 0,RestingBP
449,0.0


In [21]:
from sklearn.impute import KNNImputer
heart_df['RestingBP'].replace(0,np.nan,inplace=True)
imputer=KNNImputer(n_neighbors=3)
after_impute=imputer.fit_transform(heart_df)
heart_df=pd.DataFrame(after_impute,columns=heart_df.columns)

In [22]:
heart_df['RestingBP'].unique()

array([140., 160., 130., 138., 150., 120., 110., 136., 115., 100., 124.,
       113., 125., 145., 112., 132., 118., 170., 142., 190., 135., 180.,
       108., 155., 128., 106.,  92., 200., 122.,  98., 105., 133.,  95.,
        80., 137., 185., 165., 126., 152., 116., 144., 154., 134., 104.,
       139., 131., 141., 178., 146., 158., 123., 102.,  96., 143., 172.,
       156., 114., 127., 101., 174.,  94., 148., 117., 192., 129., 164.])

In [23]:
heart_df['RestingBP'].isnull().sum()

np.int64(0)

# Change columns type to int

In [24]:
withoutOldPeak=heart_df.columns
withoutOldPeak=withoutOldPeak.drop('Oldpeak')
heart_df[withoutOldPeak]=heart_df[withoutOldPeak].astype('int32')

In [25]:
heart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int32  
 1   Sex             918 non-null    int32  
 2   ChestPainType   918 non-null    int32  
 3   RestingBP       918 non-null    int32  
 4   Cholesterol     918 non-null    int32  
 5   FastingBS       918 non-null    int32  
 6   RestingECG      918 non-null    int32  
 7   MaxHR           918 non-null    int32  
 8   ExerciseAngina  918 non-null    int32  
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    int32  
 11  HeartDisease    918 non-null    int32  
dtypes: float64(1), int32(11)
memory usage: 46.7 KB


In [26]:
heart_df.sample()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
215,30,1,3,170,237,0,1,170,0,0.0,0,0


In [27]:
heart_df.corr()['HeartDisease'][:-1].sort_values()

Unnamed: 0,HeartDisease
MaxHR,-0.400421
Sex,-0.305445
RestingECG,0.061011
Cholesterol,0.101263
RestingBP,0.118333
FastingBS,0.267291
Age,0.282039
Oldpeak,0.403951
ChestPainType,0.459017
ExerciseAngina,0.494282


In [28]:
!pip install plotly



# Data Visualisation

In [29]:
import plotly.express as px

In [30]:
px.line(heart_df.corr()['HeartDisease'][:-1].sort_values())

# Age and HeartDisease Distribution

In [31]:
px.sunburst(heart_df,path=['HeartDisease','Age'])

In [32]:
px.histogram(heart_df,x='Age',color='HeartDisease')

# Percentage of HeartDisease Data Distribution

In [33]:
px.pie(heart_df,names='HeartDisease',title='Percentage of HeartDisease classes distribution')

In [34]:
px.histogram(heart_df,x='Sex',color='HeartDisease')

# ChestPainType vs HeartDisease

In [35]:
px.histogram(heart_df,x='ChestPainType', color='HeartDisease')
#ChestPainType : ATA=0, NAP=1, ASV=2, TA=3

# RestingBP vs HeartDisease

In [36]:
heart_df['RestingBP'].unique()

array([140, 160, 130, 138, 150, 120, 110, 136, 115, 100, 124, 113, 125,
       145, 112, 132, 118, 170, 142, 190, 135, 180, 108, 155, 128, 106,
        92, 200, 122,  98, 105, 133,  95,  80, 137, 185, 165, 126, 152,
       116, 144, 154, 134, 104, 139, 131, 141, 178, 146, 158, 123, 102,
        96, 143, 172, 156, 114, 127, 101, 174,  94, 148, 117, 192, 129,
       164], dtype=int32)

In [37]:
px.sunburst(heart_df,path=['HeartDisease','RestingBP'])

# FastingBS vs HeartDisease

In [38]:
px.histogram(heart_df,x='FastingBS',color='HeartDisease')

# MaxHR vs HeartDisease

In [39]:
px.sunburst(heart_df,path=['HeartDisease','MaxHR'])

In [40]:
px.violin(heart_df,x='HeartDisease',y='MaxHR', color='HeartDisease')

# Oldpeak vs HeartDisease

In [41]:
px.violin(heart_df,x='HeartDisease',y='Oldpeak',color='HeartDisease')

# ST_Slope vs HeartDisease

In [42]:
px.histogram(heart_df,x='ST_Slope',color='HeartDisease')

# ExerciseAngina vs Heart Disease

In [43]:
px.histogram(heart_df,x='ExerciseAngina',color='HeartDisease')

# Train Test Split

In [44]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(
    heart_df.drop('HeartDisease',axis=1),
    heart_df['HeartDisease'],
    test_size=0.2,
    random_state=42,
    stratify=heart_df['HeartDisease']
)

# Logistic Regression

We are using Logistic Regression as it is used for binary classification (yes/no)

In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

solver = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
best_solver = ''
test_score = np.zeros(6)

for i, n in enumerate(solver):
    lr = LogisticRegression(solver=n).fit(X_train, y_train)
    test_score[i] = lr.score(X_test, y_test)
    if lr.score(X_test, y_test) == test_score.max():
        best_solver = n
print(best_solver)
lr = LogisticRegression(solver=best_solver)
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

print(f'LogisticRegression Score: {accuracy_score(y_test, lr_pred)}')


newton-cholesky
LogisticRegression Score: 0.8586956521739131


In [46]:
import pickle

# Save the trained model to a file
with open("logistic_model.pkl", "wb") as file:
    pickle.dump(lr, file)

# Support Vector Machine

In [47]:
from sklearn.svm import SVC
from sklearn.metrics import f1_score

kernels = {'linear': 0, 'poly': 0, 'rbf': 0, 'sigmoid': 0}
best = ''

for i in kernels:
    svm = SVC(kernel=i)
    svm.fit(X_train, y_train)
    yhat = svm.predict(X_test)
    kernels[i] = f1_score(y_test, yhat, average="weighted")

    if kernels[i] == max(kernels.values()):
        best = i
print(best)
svm = SVC(kernel=best)
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)

print(f'SVM f1_score kernel({best}): {f1_score(y_test, svm_pred, average="weighted")}')


linear
SVM f1_score kernel(linear): 0.8422922535440344


In [48]:
import pickle

# Save the trained SVM model to a pickle file
with open("svm_model.pkl", "wb") as file1:
    pickle.dump(svm, file1)


# Decision Tree Classifier

In [49]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

dtree = DecisionTreeClassifier(class_weight='balanced')
param_grid = {
    'max_depth': [3, 4, 5, 6, 7, 8],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3, 4],
    'random_state': [0, 42]
}

grid_search = GridSearchCV(dtree, param_grid, cv=5)
grid_search.fit(X_train, y_train)
Ctree = DecisionTreeClassifier(**grid_search.best_params_, class_weight='balanced')
Ctree.fit(X_train, y_train)
dtc_pred = Ctree.predict(X_test)
print("DecisionTree's Accuracy: ", accuracy_score(y_test, dtc_pred))


DecisionTree's Accuracy:  0.8097826086956522


In [50]:
import pickle

# Save the trained Decision Tree model to a pickle file
with open("decision_tree.pkl", "wb") as file2:
    pickle.dump(Ctree, file2)

# Random Forest Classifier

In [51]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Initialize the Random Forest Classifier
rfc = RandomForestClassifier(random_state=42)

# Define a smaller parameter grid to speed up execution
param_grid = {
    'n_estimators': [50, 100],
    'max_features': ['sqrt'],
    'max_depth': [6, 9],
    'max_leaf_nodes': [6]
}

# Use GridSearchCV with parallel processing
grid_search = GridSearchCV(rfc, param_grid, n_jobs=-1, cv=3, verbose=0)

# Fit the model
grid_search.fit(X_train, y_train)

# Train the best model found
best_rfc = RandomForestClassifier(**grid_search.best_params_, random_state=42)
best_rfc.fit(X_train, y_train)

# Make predictions
rfc_pred = best_rfc.predict(X_test)

# Print Accuracy
print("RandomForestClassifier's Accuracy:", accuracy_score(y_test, rfc_pred))


RandomForestClassifier's Accuracy: 0.842391304347826


In [52]:
import pickle

# Save the trained Random Forest model to a pickle file
with open("random_forest.pkl", "wb") as file3:
    pickle.dump(best_rfc, file3)
