In [1]:
from google.colab import drive


# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [4]:
data=pd.read_csv("/content/drive/MyDrive/Heart_disease_prediction/heart.csv")

In [5]:
features=data.iloc[:,:-1]

In [6]:
features.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up


In [7]:
labels=data.iloc[:,-1]

In [8]:
labels.head()

0    0
1    1
2    0
3    1
4    0
Name: HeartDisease, dtype: int64

In [9]:
#Check for missing value
print(features.isnull())

       Age    Sex  ChestPainType  RestingBP  Cholesterol  FastingBS  \
0    False  False          False      False        False      False   
1    False  False          False      False        False      False   
2    False  False          False      False        False      False   
3    False  False          False      False        False      False   
4    False  False          False      False        False      False   
..     ...    ...            ...        ...          ...        ...   
913  False  False          False      False        False      False   
914  False  False          False      False        False      False   
915  False  False          False      False        False      False   
916  False  False          False      False        False      False   
917  False  False          False      False        False      False   

     RestingECG  MaxHR  ExerciseAngina  Oldpeak  ST_Slope  
0         False  False           False    False     False  
1         False  False     

In [10]:
features.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak
count,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657
min,28.0,0.0,0.0,0.0,60.0,-2.6
25%,47.0,120.0,173.25,0.0,120.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6
75%,60.0,140.0,267.0,0.0,156.0,1.5
max,77.0,200.0,603.0,1.0,202.0,6.2


In [11]:
#Information about the data
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
dtypes: float64(1), int64(5), object(5)
memory usage: 79.0+ KB


Label Encoding

In [12]:
dum_data = pd.get_dummies(features, drop_first=True)

In [13]:
dum_data.columns

Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak',
       'Sex_M', 'ChestPainType_ATA', 'ChestPainType_NAP', 'ChestPainType_TA',
       'RestingECG_Normal', 'RestingECG_ST', 'ExerciseAngina_Y',
       'ST_Slope_Flat', 'ST_Slope_Up'],
      dtype='object')

In [14]:
dum_data.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,1,1,0,0,1,0,0,0,1
1,49,160,180,0,156,1.0,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,1,1,0,0,0,1,0,0,1
3,48,138,214,0,108,1.5,0,0,0,0,1,0,1,1,0
4,54,150,195,0,122,0.0,1,0,1,0,1,0,0,0,1


In [15]:
#Normalizing features...

deviating_features=["Age",	"RestingBP",	"Cholesterol"	,"FastingBS",	"MaxHR" ,"Oldpeak"]

In [16]:
import numpy as np

def normalize_array(arr):
    """
    Normalize the values in the input array to the range [0, 1].

    Parameters:
    arr (numpy.ndarray): The input array to be normalized.

    Returns:
    numpy.ndarray: The normalized array.
    """
    min_val = np.min(arr)
    max_val = np.max(arr)

    if min_val == max_val:
        # Avoid division by zero if all values are the same.
        return arr
    else:
        normalized_arr = (arr - min_val) / (max_val - min_val)
        return normalized_arr




In [None]:
# for i in deviating_features:
#   X=dum_data[f"{i}"]

#   scaled_X= normalize_array(X)

#   dum_data[f"{i}"]=scaled_X



In [17]:
dum_data.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,1,1,0,0,1,0,0,0,1
1,49,160,180,0,156,1.0,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,1,1,0,0,0,1,0,0,1
3,48,138,214,0,108,1.5,0,0,0,0,1,0,1,1,0
4,54,150,195,0,122,0.0,1,0,1,0,1,0,0,0,1


Splitting Dataset..(80%-20%)

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dum_data, labels,
                                                    train_size=0.8,
                                                    random_state=2021)

In [19]:
X_train.shape

(734, 15)

Using Classical Machine Learning Models..

Model-1(SVC)

In [20]:
from sklearn.svm import SVC


In [21]:
svc_model=SVC(C=10)

In [22]:
svc_model.fit(X_train,y_train)

In [23]:
pred_svc= svc_model.predict(X_test)

In [24]:
y_test

526    1
79     1
62     0
833    0
32     1
      ..
375    1
454    1
507    0
662    1
876    1
Name: HeartDisease, Length: 184, dtype: int64

In [25]:
pred_svc

array([1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1])

In [26]:
# Report...
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [27]:

accuracy = accuracy_score(y_test, pred_svc)
print("Accuracy:", accuracy)


Accuracy: 0.7391304347826086


In [28]:
f1 = f1_score(y_test, pred_svc)
print("F1 Score:", f1)


F1 Score: 0.7669902912621359


In [29]:
classification_rep = classification_report(y_test, pred_svc)
print("Classification Report:\n", classification_rep)


Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.66      0.70        86
           1       0.73      0.81      0.77        98

    accuracy                           0.74       184
   macro avg       0.74      0.73      0.74       184
weighted avg       0.74      0.74      0.74       184



Model-2(Decision Tree)-

In [30]:
from sklearn.tree import DecisionTreeClassifier

In [31]:
model_decisiontree=DecisionTreeClassifier()

In [32]:
model_decisiontree.fit(X_train,y_train)

In [33]:
pred_tree= model_decisiontree.predict(X_test)

In [34]:
# Report...

accuracy = accuracy_score(y_test, pred_tree)
print("Accuracy:", accuracy)

Accuracy: 0.7880434782608695


In [35]:
f1 = f1_score(y_test, pred_tree)
print("F1 Score:", f1)


F1 Score: 0.8059701492537313


In [36]:
classification_rep = classification_report(y_test, pred_tree)
print("Classification Report:\n", classification_rep)

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.74      0.77        86
           1       0.79      0.83      0.81        98

    accuracy                           0.79       184
   macro avg       0.79      0.79      0.79       184
weighted avg       0.79      0.79      0.79       184



Model-3(Random forest)

In [37]:
from sklearn.ensemble import RandomForestClassifier

In [38]:
model_randomforest=RandomForestClassifier()

In [39]:
model_randomforest.fit(X_train,y_train)

In [40]:
pred_forest=model_randomforest.predict(X_test)

In [41]:
# Report...

accuracy = accuracy_score(y_test, pred_forest)
print("Accuracy:", accuracy)

Accuracy: 0.8641304347826086


In [42]:
f1 = f1_score(y_test, pred_forest)
print("F1 Score:", f1)

F1 Score: 0.8756218905472637


In [43]:
classification_rep = classification_report(y_test, pred_forest)
print("Classification Report:\n", classification_rep)

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.83      0.85        86
           1       0.85      0.90      0.88        98

    accuracy                           0.86       184
   macro avg       0.87      0.86      0.86       184
weighted avg       0.86      0.86      0.86       184



Model-4(KNN)

In [44]:
from sklearn.neighbors import KNeighborsClassifier

In [45]:
model_knn=KNeighborsClassifier(n_neighbors=9)

In [46]:
model_knn.fit(X_train,y_train)

In [47]:
pred_knn=model_knn.predict(X_test)

In [48]:
# Report...
classification_rep = classification_report(y_test, pred_knn)
print("Classification Report:\n", classification_rep)

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.70      0.71        86
           1       0.74      0.77      0.75        98

    accuracy                           0.73       184
   macro avg       0.73      0.73      0.73       184
weighted avg       0.73      0.73      0.73       184



Model-5(Naive Bayes)

In [49]:
from sklearn.naive_bayes import GaussianNB

In [50]:
model_nb=GaussianNB()

In [51]:
model_nb.fit(X_train,y_train)

In [52]:
pred_nb=model_nb.predict(X_test)

In [53]:
# Report...
classification_rep = classification_report(y_test, pred_nb)
print("Classification Report:\n", classification_rep)

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.86      0.87        86
           1       0.88      0.90      0.89        98

    accuracy                           0.88       184
   macro avg       0.88      0.88      0.88       184
weighted avg       0.88      0.88      0.88       184



Model-6(LGBM)

In [54]:
from lightgbm import LGBMClassifier

In [55]:
model_lgbm=LGBMClassifier()

In [56]:
model_lgbm.fit(X_train,y_train)

[LightGBM] [Info] Number of positive: 410, number of negative: 324
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000184 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 368
[LightGBM] [Info] Number of data points in the train set: 734, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.558583 -> initscore=0.235414
[LightGBM] [Info] Start training from score 0.235414


In [57]:
pred_lgbm=model_lgbm.predict(X_test)

In [58]:
# Report...
classification_rep = classification_report(y_test, pred_lgbm)
print("Classification Report:\n", classification_rep)

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.84      0.85        86
           1       0.86      0.88      0.87        98

    accuracy                           0.86       184
   macro avg       0.86      0.86      0.86       184
weighted avg       0.86      0.86      0.86       184



Model-7(XG Boost)

In [59]:
from xgboost import XGBClassifier as XGB

In [60]:
model_xgb=XGB(n_estimators=10,max_depth=5, learning_rate=0.2)

In [61]:
model_xgb.fit(X_train,y_train)

In [62]:
pred_xgb=model_xgb.predict(X_test)

In [63]:
# Report...
classification_rep = classification_report(y_test, pred_xgb)
print("Classification Report:\n", classification_rep)

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.79      0.82        86
           1       0.83      0.89      0.86        98

    accuracy                           0.84       184
   macro avg       0.84      0.84      0.84       184
weighted avg       0.84      0.84      0.84       184



Ensemble_model(choosing best ones and combining them..

Best models so far are...
1. Random Forest.
2.Naive Bayes.
3. LGBM.
4. XG Boost

In [64]:
from sklearn.ensemble import VotingClassifier

In [65]:
model_ensemble=VotingClassifier([('xgb',XGB(n_estimators=400, max_depth=5)),
                       ('rf',RandomForestClassifier(random_state=0)),
                       ('lgbm',LGBMClassifier()),
                       ('gnb',GaussianNB())])

In [66]:
model_ensemble.fit(X_train,y_train)

[LightGBM] [Info] Number of positive: 410, number of negative: 324
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000084 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 368
[LightGBM] [Info] Number of data points in the train set: 734, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.558583 -> initscore=0.235414
[LightGBM] [Info] Start training from score 0.235414


In [67]:
pred_ensemble=model_ensemble.predict(X_test)

In [68]:
# Report...
classification_rep = classification_report(y_test, pred_ensemble)
print("Classification Report:\n", classification_rep)

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.87      0.86        86
           1       0.89      0.87      0.88        98

    accuracy                           0.87       184
   macro avg       0.87      0.87      0.87       184
weighted avg       0.87      0.87      0.87       184



Using Deep Learning Models...

In [69]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [70]:


# Define a simple feedforward neural network
model_ann = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(15,)),
    # layers.Dense(64, activation='relu'),# Adjust input shape based on your features
    layers.Dense(2, activation='sigmoid')  # For binary classification
])


# Compile the model
model_ann.compile(optimizer='adam',  # You can adjust the optimizer and learning rate
              loss='binary_crossentropy',  # Binary Cross-Entropy Loss
              metrics=['accuracy'],
              )  # You can add more metrics as needed

# Print the model summary
model_ann.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               4096      
                                                                 
 dense_1 (Dense)             (None, 2)                 514       
                                                                 
Total params: 4610 (18.01 KB)
Trainable params: 4610 (18.01 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [71]:

import numpy as np


num_samples = len(y_train)
target_matrix = np.zeros((num_samples, 2))  # Initialize with zeros, assuming two classes

# Fill the target matrix with binary labels
target_matrix[np.arange(num_samples), y_train] = 1

# Now, target_matrix contains the two-dimensional target for neural network training
print(target_matrix)

[[1. 0.]
 [0. 1.]
 [0. 1.]
 ...
 [0. 1.]
 [0. 1.]
 [0. 1.]]


In [72]:

# You can now load your dataset, split it into training and testing sets,
# and train the model using your data. Below is a sample code snippet
# to show how you might train the model:

# Sample data (replace with your own data)
import numpy as np


# Train the model
model_ann.fit(X_train,target_matrix, epochs=100, batch_size=32, validation_split=0.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7e0928424ca0>

In [73]:
pred_ann=model_ann.predict(X_test)



In [74]:
pred_ann

array([[6.66653458e-03, 9.87623513e-01],
       [1.99081168e-01, 6.84882820e-01],
       [5.62518060e-01, 3.27618271e-01],
       [1.24918237e-01, 8.14137816e-01],
       [3.36355716e-02, 9.44718122e-01],
       [3.30876887e-01, 5.25066674e-01],
       [1.83000356e-01, 6.15132868e-01],
       [3.69567759e-02, 9.35609877e-01],
       [1.25077851e-02, 9.82550085e-01],
       [8.07039253e-03, 9.86000001e-01],
       [7.40321027e-03, 9.81894553e-01],
       [1.30773306e-01, 8.12420249e-01],
       [7.55318761e-01, 1.77123919e-01],
       [1.07183848e-02, 9.86948431e-01],
       [9.88620985e-03, 9.85090971e-01],
       [2.67806407e-02, 9.62934017e-01],
       [2.09063869e-02, 9.63049948e-01],
       [7.63587374e-03, 9.83598709e-01],
       [5.42722642e-02, 8.50750327e-01],
       [9.51685786e-01, 2.10423861e-02],
       [8.55279039e-04, 9.98917758e-01],
       [7.44220197e-01, 1.79166853e-01],
       [8.23232066e-03, 9.88943875e-01],
       [7.52062798e-01, 1.45167902e-01],
       [6.653667

In [75]:
# Report...
classification_rep = classification_report(y_test, np.argmax(pred_ann,axis=1))
print("Classification Report:\n", classification_rep)

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.73      0.80        86
           1       0.80      0.92      0.85        98

    accuracy                           0.83       184
   macro avg       0.84      0.83      0.83       184
weighted avg       0.84      0.83      0.83       184

