# HEART DISEASE PREDICTION

### "The objective of a heart disease prediction machine learning project is to develop a model that accurately identifies individuals at risk of heart disease, enabling early detection, personalized care, cost reduction, and public health insights."

In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sbn
import pickle

#### load data from data source

In [33]:
df = pd.read_csv("/home/aditi/Downloads/uncleaned.csv")
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,62.0,M,ASY,150.0,278.0,0.0,Normal,125.0,Y,0.0,Flat,1.0
1,54.0,F,ATA,150.0,236.0,0.0,Normal,171.0,N,0.0,Up,0.0
2,31.0,M,ASY,135.0,0.0,1.0,Normal,140.0,Y,1.0,Flat,1.0
3,55.0,M,ASY,135.0,0.0,0.0,Normal,130.0,Y,2.0,Flat,1.0
4,53.0,M,ASY,100.0,312.0,0.0,Normal,147.0,Y,1.6,Flat,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
49995,44.0,M,ASY,184.0,336.0,0.0,ST,140.0,Y,0.0,Up,1.0
49996,31.0,M,ASY,95.0,199.0,1.0,Normal,189.0,,0.3,Up,0.0
49997,41.0,M,ATA,140.0,295.0,0.0,Normal,153.0,N,0.0,Up,0.0
49998,48.0,F,ASY,132.0,387.0,0.0,Normal,148.0,N,0.0,Flat,0.0


### EDA

In [34]:
df["HeartDisease"].value_counts()

HeartDisease
1.0    27642
0.0    22276
Name: count, dtype: int64

In [35]:
print("DATA INFORMATION :")
df.info()

DATA INFORMATION :
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             49964 non-null  float64
 1   Sex             49936 non-null  object 
 2   ChestPainType   49920 non-null  object 
 3   RestingBP       49913 non-null  float64
 4   Cholesterol     49967 non-null  float64
 5   FastingBS       49988 non-null  float64
 6   RestingECG      49944 non-null  object 
 7   MaxHR           49970 non-null  float64
 8   ExerciseAngina  49953 non-null  object 
 9   Oldpeak         49993 non-null  float64
 10  ST_Slope        49933 non-null  object 
 11  HeartDisease    49918 non-null  float64
dtypes: float64(7), object(5)
memory usage: 4.6+ MB


In [36]:
print("DATA DESCRIPTION :")
df.describe()

DATA DESCRIPTION :


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,49964.0,49913.0,49967.0,49988.0,49970.0,49993.0,49918.0
mean,52.818369,132.584297,198.550423,0.220233,137.212928,0.868668,0.553748
std,10.902742,18.307277,110.258957,0.414408,25.438317,1.027399,0.497108
min,31.0,95.0,0.0,0.0,71.0,-1.0,0.0
25%,45.0,120.0,168.0,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.5,1.0
75%,61.0,140.0,268.0,0.0,157.0,1.5,1.0
max,75.0,190.0,468.0,1.0,190.0,4.2,1.0


#### CHECKING NULL VALUES

In [37]:
df.isna().sum()

Age               36
Sex               64
ChestPainType     80
RestingBP         87
Cholesterol       33
FastingBS         12
RestingECG        56
MaxHR             30
ExerciseAngina    47
Oldpeak            7
ST_Slope          67
HeartDisease      82
dtype: int64

In [38]:
df = df.dropna()

In [39]:
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [40]:
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,62.0,M,ASY,150.0,278.0,0.0,Normal,125.0,Y,0.0,Flat,1.0
1,54.0,F,ATA,150.0,236.0,0.0,Normal,171.0,N,0.0,Up,0.0
2,31.0,M,ASY,135.0,0.0,1.0,Normal,140.0,Y,1.0,Flat,1.0
3,55.0,M,ASY,135.0,0.0,0.0,Normal,130.0,Y,2.0,Flat,1.0
4,53.0,M,ASY,100.0,312.0,0.0,Normal,147.0,Y,1.6,Flat,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
49992,62.0,F,ASY,160.0,0.0,0.0,Normal,96.0,Y,1.7,Flat,1.0
49994,57.0,M,ATA,120.0,296.0,0.0,Normal,140.0,N,0.1,Up,0.0
49995,44.0,M,ASY,184.0,336.0,0.0,ST,140.0,Y,0.0,Up,1.0
49997,41.0,M,ATA,140.0,295.0,0.0,Normal,153.0,N,0.0,Up,0.0


### LABEL ENCODING

In [74]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

cols = ["Sex", "ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope"]
df[cols] = df[cols].apply(LabelEncoder().fit_transform)

with open('LabelEncode.pkl', 'wb') as file:
    pickle.dump(df[cols], file)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[cols] = df[cols].apply(LabelEncoder().fit_transform)


In [42]:
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,62.0,1,0,150.0,278.0,0.0,1,125.0,1,0.0,1,1.0
1,54.0,0,1,150.0,236.0,0.0,1,171.0,0,0.0,2,0.0
2,31.0,1,0,135.0,0.0,1.0,1,140.0,1,1.0,1,1.0
3,55.0,1,0,135.0,0.0,0.0,1,130.0,1,2.0,1,1.0
4,53.0,1,0,100.0,312.0,0.0,1,147.0,1,1.6,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
49992,62.0,0,0,160.0,0.0,0.0,1,96.0,1,1.7,1,1.0
49994,57.0,1,1,120.0,296.0,0.0,1,140.0,0,0.1,2,0.0
49995,44.0,1,0,184.0,336.0,0.0,2,140.0,1,0.0,2,1.0
49997,41.0,1,1,140.0,295.0,0.0,1,153.0,0,0.0,2,0.0


In [66]:
df["Sex"].value_counts()

Sex
1    39266
0    10285
Name: count, dtype: int64

In [67]:
df["ChestPainType"].value_counts()

ChestPainType
0    27118
2    10688
1     9286
3     2459
Name: count, dtype: int64

In [68]:
df["RestingECG"].value_counts()

RestingECG
1    29890
0    10317
2     9344
Name: count, dtype: int64

In [69]:
df["ExerciseAngina"].value_counts()

ExerciseAngina
0    29061
1    20490
Name: count, dtype: int64

In [70]:
df["ST_Slope"].value_counts()

ST_Slope
1    24615
2    21788
0     3148
Name: count, dtype: int64

In [43]:
df.corr()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
Age,1.0,0.01383,-0.082721,0.139792,-0.075544,0.156942,0.009391,-0.217792,0.162166,0.187943,-0.202814,0.230242
Sex,0.01383,1.0,-0.112279,0.014677,-0.142315,0.102782,0.052167,-0.117251,0.1452,0.075108,-0.113929,0.237377
ChestPainType,-0.082721,-0.112279,1.0,-0.018413,0.108405,-0.08276,-0.015884,0.195173,-0.28756,-0.185077,0.238131,-0.351894
RestingBP,0.139792,0.014677,-0.018413,1.0,0.046036,0.060831,0.00772,-0.064074,0.120921,0.103855,-0.078709,0.11232
Cholesterol,-0.075544,-0.142315,0.108405,0.046036,1.0,-0.280504,-0.177194,0.190318,-0.038776,0.011006,0.120495,-0.229189
FastingBS,0.156942,0.102782,-0.08276,0.060831,-0.280504,1.0,0.109319,-0.136218,0.067419,0.068787,-0.166308,0.253588
RestingECG,0.009391,0.052167,-0.015884,0.00772,-0.177194,0.109319,1.0,-0.092489,0.046887,-0.008498,-0.028405,0.056796
MaxHR,-0.217792,-0.117251,0.195173,-0.064074,0.190318,-0.136218,-0.092489,1.0,-0.327915,-0.178687,0.321563,-0.348655
ExerciseAngina,0.162166,0.1452,-0.28756,0.120921,-0.038776,0.067419,0.046887,-0.327915,1.0,0.381459,-0.416649,0.465825
Oldpeak,0.187943,0.075108,-0.185077,0.103855,0.011006,0.068787,-0.008498,-0.178687,0.381459,1.0,-0.445392,0.392392


In [44]:
x = df.drop("HeartDisease", axis = 1)
y = df["HeartDisease"]

In [45]:
# split x and y into training and testing dataset

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=12345)

### MODEL BUILDING

In [46]:
def build_lg():
    from sklearn.linear_model import LogisticRegressionCV
    model = LogisticRegressionCV()
    model.fit(x_train, y_train)

    return model

In [47]:
def build_knn():
    from sklearn.neighbors import KNeighborsClassifier
    model = KNeighborsClassifier()
    model.fit(x_train, y_train)

    return model

In [48]:
def build_svm():
    from sklearn.svm import SVC
    model = SVC()
    model.fit(x_train, y_train)

    return model

In [49]:
def build_nb():
    from sklearn.naive_bayes import GaussianNB
    model = GaussianNB()
    model.fit(x_train, y_train)

    return model

In [50]:
def build_rf():
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier()
    model.fit(x_train, y_train)

    return model

In [51]:
def build_dt():
    from sklearn.tree import DecisionTreeClassifier
    model = DecisionTreeClassifier()
    model.fit(x_train, y_train)

    return model

In [52]:
def build_cb():
    from catboost import CatBoostClassifier
    model = CatBoostClassifier(verbose = True)
    model.fit(x_train, y_train)

    with open('Catboost.pkl', 'wb') as file:
        pickle.dump(model, file)

    return model

In [53]:
def build_xgb():
    from xgboost import XGBClassifier
    model = XGBClassifier()
    model.fit(x_train, y_train)

    return model

In [54]:
model_lg = build_lg()
model_nb = build_nb()
model_knn = build_knn()
model_svm = build_svm()
model_rf = build_rf()
model_dt = build_dt()
model_cb = build_cb()
model_xgb = build_xgb()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Learning rate set to 0.046835
0:	learn: 0.6558368	total: 52.7ms	remaining: 52.7s
1:	learn: 0.6237724	total: 56.1ms	remaining: 28s
2:	learn: 0.5943905	total: 59ms	remaining: 19.6s
3:	learn: 0.5674581	total: 62.1ms	remaining: 15.5s
4:	learn: 0.5431763	total: 64.9ms	remaining: 12.9s
5:	learn: 0.5227214	total: 67.7ms	remaining: 11.2s
6:	learn: 0.5052580	total: 70.2ms	remaining: 9.96s
7:	learn: 0.4889975	total: 76.7ms	remaining: 9.51s
8:	learn: 0.4745081	total: 79.5ms	remaining: 8.75s
9:	learn: 0.4616954	total: 82.5ms	remaining: 8.16s
10:	learn: 0.4499043	total: 85.4ms	remaining: 7.67s
11:	learn: 0.4395536	total: 88.1ms	remaining: 7.26s
12:	learn: 0.4306759	total: 90.9ms	remaining: 6.9s
13:	learn: 0.4226496	total: 93.7ms	remaining: 6.6s
14:	learn: 0.4152870	total: 96.7ms	remaining: 6.35s
15:	learn: 0.4085034	total: 99.5ms	remaining: 6.12s
16:	learn: 0.4027508	total: 102ms	remaining: 5.91s
17:	learn: 0.3973591	total: 105ms	remaining: 5.71s
18:	learn: 0.3932257	total: 107ms	remaining: 5.54s
1

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


### MODEL EVALUATION

In [55]:
def evaluate(model):
    y_true = y_test
    y_pred = model.predict(x_test)

    from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

    y_true = y_test
    y_pred = model.predict(x_test)
    cm = confusion_matrix(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    return accuracy, precision, recall, f1

In [56]:
print("LOGISTIC REGRESSION :")
evaluate(model_lg)

LOGISTIC REGRESSION :


(0.8420556975649133, 0.8527187612531509, 0.86370820668693, 0.8581783039381493)

In [57]:
print("NAIVE BAYES :")
evaluate(model_nb)

NAIVE BAYES :


(0.8366070227364456,
 0.8512727272727273,
 0.8538601823708206,
 0.8525644916540212)

In [58]:
print("K NEAREST NEIGHBOUR :")
evaluate(model_knn)

K NEAREST NEIGHBOUR :


(0.6823624377774788,
 0.7116616314199395,
 0.7159878419452887,
 0.7138181818181818)

In [59]:
print("SUPPORT VECTOR MACHINE :")
evaluate(model_svm)

SUPPORT VECTOR MACHINE :


(0.7657742499663662,
 0.7773359840954275,
 0.8081458966565349,
 0.7924415832141154)

In [60]:
print("RANDOM FOREST :")
evaluate(model_rf)

RANDOM FOREST :


(0.8515404278218754,
 0.8593264867446859,
 0.8748936170212765,
 0.8670401831435628)

In [61]:
print("DECISION TREES :")
evaluate(model_dt)

DECISION TREES :


(0.7831292883088927,
 0.8030541752514847,
 0.8055927051671733,
 0.8043214372420491)

In [62]:
print("CatBoost :")
evaluate(model_cb)

CatBoost :


(0.8600834118121888,
 0.8652086057292285,
 0.8849848024316109,
 0.8749849741555475)

In [63]:
print("XgBOOST :")
evaluate(model_xgb)

XgBOOST :


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


(0.8536929907170725, 0.8615825962228066, 0.8763525835866262, 0.868904827918751)

### MODEL REPORT

In [64]:
# create a list of functions
model_functions = [
    {"name": "Logistic Regression", "function": build_lg},
    {"name": "Naive Bayes", "function" : build_nb },
    {"name": "K Nearest Neighbour", "function" : build_knn},
    {"name": "Support Vector Machine", "function" : build_svm},
    {"name": "Decision Trees", "function" : build_dt},
    {"name": "Random Forest", "function": build_rf},
    {"name": "CatBoost", "function": build_cb}, 
    {"name": "XgBoost", "function" : build_xgb},
]

# empty list to collect the modelwise report
model_evaluation_report = []

# iterate over the list, create model and evaluate the model
for model_info in model_functions:
    model = model_info["function"]()
    metrics = evaluate(model)
    model_evaluation_report.append({
        "name": model_info["name"],
        "accuracy": metrics[0],
        "precision": metrics[1], 
        "recall": metrics[2],
        "f1": metrics[3]
    })

# create a data frame of the result
df_result = pd.DataFrame(model_evaluation_report)
df_result

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Learning rate set to 0.046835
0:	learn: 0.6558368	total: 4.25ms	remaining: 4.25s
1:	learn: 0.6237724	total: 8.88ms	remaining: 4.43s
2:	learn: 0.5943905	total: 13.5ms	remaining: 4.49s
3:	learn: 0.5674581	total: 21.8ms	remaining: 5.44s
4:	learn: 0.5431763	total: 26.9ms	remaining: 5.34s
5:	learn: 0.5227214	total: 31.4ms	remaining: 5.21s
6:	learn: 0.5052580	total: 35.7ms	remaining: 5.07s
7:	learn: 0.4889975	total: 40.3ms	remaining: 5s
8:	learn: 0.4745081	total: 44.5ms	remaining: 4.9s
9:	learn: 0.4616954	total: 49ms	remaining: 4.85s
10:	learn: 0.4499043	total: 54.5ms	remaining: 4.9s
11:	learn: 0.4395536	total: 60.3ms	remaining: 4.96s
12:	learn: 0.4306759	total: 65.5ms	remaining: 4.97s
13:	learn: 0.4226496	total: 69.9ms	remaining: 4.93s
14:	learn: 0.4152870	total: 76.2ms	remaining: 5s
15:	learn: 0.4085034	total: 81.2ms	remaining: 5s
16:	learn: 0.4027508	total: 92.8ms	remaining: 5.36s
17:	learn: 0.3973591	total: 101ms	remaining: 5.5s
18:	learn: 0.3932257	total: 105ms	remaining: 5.43s
19:	lear

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


Unnamed: 0,name,accuracy,precision,recall,f1
0,Logistic Regression,0.842056,0.852719,0.863708,0.858178
1,Naive Bayes,0.836607,0.851273,0.85386,0.852564
2,K Nearest Neighbour,0.682362,0.711662,0.715988,0.713818
3,Support Vector Machine,0.765774,0.777336,0.808146,0.792442
4,Decision Trees,0.781918,0.801233,0.805714,0.803468
5,Random Forest,0.850397,0.858869,0.87307,0.865911
6,CatBoost,0.860083,0.865209,0.884985,0.874985
7,XgBoost,0.853693,0.861583,0.876353,0.868905
