### Heart Disease Prediction

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


from sklearn.metrics import mean_squared_error, accuracy_score

## Data processing

In [2]:
df = pd.read_csv("data.csv")

In [3]:
df.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [5]:
df.tail()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0
302,57,0,1,130,236,0,0,174,0,0.0,1,1,2,0


In [6]:
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [7]:
# df.hist(bins=50, figsize=(10, 8))

In [8]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [9]:
# removing null value
drop_column = ['ca', 'slope', 'thal']
a = df.dropna( subset=drop_column)
a.shape
df.drop(drop_column,axis = 1) 

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,target
0,63,1,3,145,233,1,0,150,0,2.3,1
1,37,1,2,130,250,0,1,187,0,3.5,1
2,41,0,1,130,204,0,0,172,0,1.4,1
3,56,1,1,120,236,0,1,178,0,0.8,1
4,57,0,0,120,354,0,1,163,1,0.6,1
...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,0
299,45,1,3,110,264,0,1,132,0,1.2,0
300,68,1,0,144,193,1,1,141,0,3.4,0
301,57,1,0,130,131,0,1,115,1,1.2,0


In [10]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [12]:
# missing values

df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [13]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [14]:
df["cp"].value_counts()

cp
0    143
2     87
1     50
3     23
Name: count, dtype: int64

## Dummies using pandas to convert catagorical values to one-hot encoding

In [15]:
df = pd.get_dummies(df, columns=["cp", "restecg"])
df.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,slope,ca,thal,target,cp_0,cp_1,cp_2,cp_3,restecg_0,restecg_1,restecg_2
0,63,1,145,233,1,150,0,2.3,0,0,1,1,False,False,False,True,True,False,False
1,37,1,130,250,0,187,0,3.5,0,0,2,1,False,False,True,False,False,True,False
2,41,0,130,204,0,172,0,1.4,2,0,2,1,False,True,False,False,True,False,False
3,56,1,120,236,0,178,0,0.8,2,0,2,1,False,True,False,False,False,True,False
4,57,0,120,354,0,163,1,0.6,2,0,2,1,True,False,False,False,False,True,False


In [16]:
df.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,slope,ca,thal,target,cp_0,cp_1,cp_2,cp_3,restecg_0,restecg_1,restecg_2
0,63,1,145,233,1,150,0,2.3,0,0,1,1,False,False,False,True,True,False,False
1,37,1,130,250,0,187,0,3.5,0,0,2,1,False,False,True,False,False,True,False
2,41,0,130,204,0,172,0,1.4,2,0,2,1,False,True,False,False,True,False,False
3,56,1,120,236,0,178,0,0.8,2,0,2,1,False,True,False,False,False,True,False
4,57,0,120,354,0,163,1,0.6,2,0,2,1,True,False,False,False,False,True,False


In [17]:
numerical_cols = ["age","trestbps","chol","thalach","oldpeak"]
cat_cols = list(set(df.columns) - set(numerical_cols) - {'target'})

In [18]:
df["oldpeak"].value_counts()

oldpeak
0.0    99
1.2    17
1.0    14
0.6    14
0.8    13
1.4    13
0.2    12
1.6    11
1.8    10
2.0     9
0.4     9
0.1     7
2.8     6
2.6     6
1.9     5
1.5     5
3.0     5
0.5     5
2.2     4
3.6     4
0.9     3
3.4     3
4.0     3
2.4     3
0.3     3
3.2     2
2.3     2
1.1     2
4.2     2
2.5     2
1.3     1
3.5     1
0.7     1
3.1     1
6.2     1
5.6     1
2.9     1
2.1     1
3.8     1
4.4     1
Name: count, dtype: int64

In [19]:
df.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,slope,ca,thal,target,cp_0,cp_1,cp_2,cp_3,restecg_0,restecg_1,restecg_2
0,63,1,145,233,1,150,0,2.3,0,0,1,1,False,False,False,True,True,False,False
1,37,1,130,250,0,187,0,3.5,0,0,2,1,False,False,True,False,False,True,False
2,41,0,130,204,0,172,0,1.4,2,0,2,1,False,True,False,False,True,False,False
3,56,1,120,236,0,178,0,0.8,2,0,2,1,False,True,False,False,False,True,False
4,57,0,120,354,0,163,1,0.6,2,0,2,1,True,False,False,False,False,True,False


In [20]:
cat_cols

['fbs',
 'cp_0',
 'sex',
 'thal',
 'cp_1',
 'cp_2',
 'restecg_2',
 'slope',
 'ca',
 'cp_3',
 'exang',
 'restecg_1',
 'restecg_0']

In [21]:
numerical_cols

['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Splitting

In [22]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [23]:
print(f"Rows in train set: {len(df_train)}\nRows in test set: {len(df_test)}\n")

Rows in train set: 242
Rows in test set: 61



# Standardization




In [24]:
scaler = StandardScaler()
def get_features_and_target_arrays(df,numerical_cols,cat_cols,scaler):
    X_numeric_scaled = scaler.fit_transform(df[numerical_cols])
    X_catagorical = df[cat_cols].to_numpy()
    X = np.hstack((X_catagorical,X_numeric_scaled))
    y = df["target"]
    return X , y

In [25]:
X_train , y_train = get_features_and_target_arrays(df_train,cat_cols,numerical_cols,scaler)

## Model fitting & evaluation 

In [26]:
# logistic regression
lr = LogisticRegression()
lr.fit(X_train,y_train)
X_test , y_test = get_features_and_target_arrays(df_test,cat_cols,numerical_cols,scaler)
Y_lr_pred = lr.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
print("The mean squared error is :" ,mean_squared_error(y_test,Y_lr_pred))
score_lr = round(accuracy_score(Y_lr_pred,y_test)*100,2)
print("The accuracy score achieved using Logistic Regression is: "+str(score_lr)+" %")

The mean squared error is : 0.11475409836065574
The accuracy score achieved using Logistic Regression is: 88.52 %


In [28]:
# Decision Tree Classifier

In [29]:
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)
Y_dt_pred = dt.predict(X_test)

In [30]:
print("The mean squared error is :" ,mean_squared_error(y_test,Y_dt_pred))
score_dt = round(accuracy_score(Y_dt_pred,y_test)*100,2)
print("The accuracy score achieved using Decision Tree Classifier is: "+str(score_dt)+" %")

The mean squared error is : 0.14754098360655737
The accuracy score achieved using Decision Tree Classifier is: 85.25 %


In [31]:
# Random Forest Classifier

In [32]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
Y_rf_pred = rf.predict(X_test)

In [33]:
print("The mean squared error is :" ,mean_squared_error(y_test,Y_rf_pred))
score_rf = round(accuracy_score(Y_rf_pred,y_test)*100,2)
print("The accuracy score achieved using Random Forest Classifier is: "+str(score_rf)+" %")


The mean squared error is : 0.13114754098360656
The accuracy score achieved using Random Forest Classifier is: 86.89 %


In [34]:
# SVM Classifier

In [35]:
sv =  SVC()
sv.fit(X_train,y_train)
Y_sv_pred = sv.predict(X_test)

In [36]:
print("The mean squared error is :" ,mean_squared_error(y_test,Y_sv_pred))
score_sv = round(accuracy_score(Y_sv_pred,y_test)*100,2)
print("The accuracy score achieved using SVM Classifier is: "+str(score_sv)+" %")


The mean squared error is : 0.29508196721311475
The accuracy score achieved using SVM Classifier is: 70.49 %


In [37]:
# K Nearest Neighbors

In [38]:
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train,y_train)
Y_knn_pred = knn.predict(X_test)

In [39]:
print("The mean squared error is :" ,mean_squared_error(y_test,Y_knn_pred))
score_knn = round(accuracy_score(Y_knn_pred,y_test)*100,2)
print("The accuracy score achieved using KNN is: "+str(score_knn)+" %")

The mean squared error is : 0.2786885245901639
The accuracy score achieved using KNN is: 72.13 %
