In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#for encoding
from sklearn.preprocessing import LabelEncoder
#for splitting
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

#for scores and evaluation 
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import LeaveOneOut, cross_val_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(r"D:\Aditya\data\Heart Disease\heart-2.csv")

In [3]:
df.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,Male,asymptomatic,145,233,high,normal,150,no,2.3,0,0,fixed deffect,1
1,37,Male,non anginal pain,130,250,normal,S-T abnormanilty,187,no,3.5,0,0,reversible deffect,1
2,41,Female,atypical angina,130,204,normal,normal,172,no,1.4,2,0,reversible deffect,1
3,56,Male,atypical angina,120,236,normal,S-T abnormanilty,178,no,0.8,2,0,reversible deffect,1
4,57,Female,typical angina,120,354,normal,S-T abnormanilty,163,yes,0.6,2,0,reversible deffect,1
5,57,Male,typical angina,140,192,normal,S-T abnormanilty,148,no,0.4,1,0,fixed deffect,1
6,56,Female,atypical angina,140,294,normal,normal,153,no,1.3,1,0,reversible deffect,1
7,44,Male,atypical angina,120,263,normal,S-T abnormanilty,173,no,0.0,2,0,deffect-3,1
8,52,Male,non anginal pain,172,199,high,S-T abnormanilty,162,no,0.5,2,0,deffect-3,1
9,57,Male,non anginal pain,150,168,normal,S-T abnormanilty,174,no,1.6,2,0,reversible deffect,1


In [4]:
df.shape

(303, 14)

In [5]:
#check if data is imbalanced
df['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

In [6]:
df.describe()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,slope,ca,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,131.623762,246.264026,149.646865,1.039604,1.39934,0.729373,0.544554
std,9.082101,17.538143,51.830751,22.905161,1.161075,0.616226,1.022606,0.498835
min,29.0,94.0,126.0,71.0,0.0,0.0,0.0,0.0
25%,47.5,120.0,211.0,133.5,0.0,1.0,0.0,0.0
50%,55.0,130.0,240.0,153.0,0.8,1.0,0.0,1.0
75%,61.0,140.0,274.5,166.0,1.6,2.0,1.0,1.0
max,77.0,200.0,564.0,202.0,6.2,2.0,4.0,1.0


In [8]:
df.describe(include = [np.object])

Unnamed: 0,sex,cp,fbs,restecg,exang,thal
count,303,303,303,303,303,303
unique,2,4,2,3,2,4
top,Male,typical angina,normal,S-T abnormanilty,no,reversible deffect
freq,207,143,258,152,204,166


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    object 
 2   cp        303 non-null    object 
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    object 
 6   restecg   303 non-null    object 
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    object 
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    object 
 13  target    303 non-null    int64  
dtypes: float64(1), int64(7), object(6)
memory usage: 33.3+ KB


In [10]:
#separating categorical and numerical attributes
cat, num = [], []

for i in df.columns:
    c = df.dtypes[i]
    if c == 'object':
        cat.append(i)
    else:
        num.append(i)

print('Categorical Values: {}\n'.format(len(cat)), cat, '\n')
print('Numerical Values: {}\n'.format(len(num)), num, '\n')

Categorical Values: 6
 ['sex', 'cp', 'fbs', 'restecg', 'exang', 'thal'] 

Numerical Values: 8
 ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca', 'target'] 



In [12]:
corr = df.corr()
corr['target'].sort_values(ascending = False)

target      1.000000
thalach     0.421741
slope       0.345877
chol       -0.085239
trestbps   -0.144931
age        -0.225439
ca         -0.391724
oldpeak    -0.430696
Name: target, dtype: float64

In [13]:
# encoding categorical attributes - no one hot encoding for features having 10+ unique values (increases col and computation) or having <=2 unique values (causes dummy variable problem - introduces multicolinearity)
to_one_hot = [col for col in cat if df[col].nunique() <= 10 and df[col].nunique() > 2]
to_label = [col for col in cat if not col in to_one_hot]

print('to_one_hot: {}'.format(to_one_hot))
print('to_label: {}'.format(to_label))

to_one_hot: ['cp', 'restecg', 'thal']
to_label: ['sex', 'fbs', 'exang']


In [14]:
one_hot_encoded = pd.get_dummies(df[to_one_hot])
one_hot_encoded

Unnamed: 0,cp_asymptomatic,cp_atypical angina,cp_non anginal pain,cp_typical angina,restecg_2,restecg_S-T abnormanilty,restecg_normal,thal_deffect-3,thal_fixed deffect,thal_normal,thal_reversible deffect
0,1,0,0,0,0,0,1,0,1,0,0
1,0,0,1,0,0,1,0,0,0,0,1
2,0,1,0,0,0,0,1,0,0,0,1
3,0,1,0,0,0,1,0,0,0,0,1
4,0,0,0,1,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
298,0,0,0,1,0,1,0,1,0,0,0
299,1,0,0,0,0,1,0,1,0,0,0
300,0,0,0,1,0,1,0,1,0,0,0
301,0,0,0,1,0,1,0,1,0,0,0


In [15]:
label_encoded = []

for i in to_label:
    #creating a different encoder for each column
    le = LabelEncoder()
    #creating new df for each encoded column
    col_df = pd.DataFrame(le.fit_transform(df[i]), columns = [i])
    #add the col_df to list
    label_encoded.append(col_df)
    
label_encoded = pd.concat(label_encoded, axis = 1)
label_encoded

Unnamed: 0,sex,fbs,exang
0,1,0,0
1,1,1,0
2,0,1,0
3,1,1,0
4,0,1,1
...,...,...,...
298,0,1,1
299,1,1,0
300,1,0,0
301,1,1,1


In [16]:
X = df.copy()
X.drop(cat, axis = 1, inplace = True)
X = pd.concat([X, one_hot_encoded, label_encoded], axis = 1)
X

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,slope,ca,target,cp_asymptomatic,cp_atypical angina,...,restecg_2,restecg_S-T abnormanilty,restecg_normal,thal_deffect-3,thal_fixed deffect,thal_normal,thal_reversible deffect,sex,fbs,exang
0,63,145,233,150,2.3,0,0,1,1,0,...,0,0,1,0,1,0,0,1,0,0
1,37,130,250,187,3.5,0,0,1,0,0,...,0,1,0,0,0,0,1,1,1,0
2,41,130,204,172,1.4,2,0,1,0,1,...,0,0,1,0,0,0,1,0,1,0
3,56,120,236,178,0.8,2,0,1,0,1,...,0,1,0,0,0,0,1,1,1,0
4,57,120,354,163,0.6,2,0,1,0,0,...,0,1,0,0,0,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,140,241,123,0.2,1,0,0,0,0,...,0,1,0,1,0,0,0,0,1,1
299,45,110,264,132,1.2,1,0,0,1,0,...,0,1,0,1,0,0,0,1,1,0
300,68,144,193,141,3.4,1,2,0,0,0,...,0,1,0,1,0,0,0,1,0,0
301,57,130,131,115,1.2,1,1,0,0,0,...,0,1,0,1,0,0,0,1,1,1


In [17]:
X.drop(['target'], axis = 1, inplace = True)
y = df['target']
X

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,slope,ca,cp_asymptomatic,cp_atypical angina,cp_non anginal pain,...,restecg_2,restecg_S-T abnormanilty,restecg_normal,thal_deffect-3,thal_fixed deffect,thal_normal,thal_reversible deffect,sex,fbs,exang
0,63,145,233,150,2.3,0,0,1,0,0,...,0,0,1,0,1,0,0,1,0,0
1,37,130,250,187,3.5,0,0,0,0,1,...,0,1,0,0,0,0,1,1,1,0
2,41,130,204,172,1.4,2,0,0,1,0,...,0,0,1,0,0,0,1,0,1,0
3,56,120,236,178,0.8,2,0,0,1,0,...,0,1,0,0,0,0,1,1,1,0
4,57,120,354,163,0.6,2,0,0,0,0,...,0,1,0,0,0,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,140,241,123,0.2,1,0,0,0,0,...,0,1,0,1,0,0,0,0,1,1
299,45,110,264,132,1.2,1,0,1,0,0,...,0,1,0,1,0,0,0,1,1,0
300,68,144,193,141,3.4,1,2,0,0,0,...,0,1,0,1,0,0,0,1,0,0
301,57,130,131,115,1.2,1,1,0,0,0,...,0,1,0,1,0,0,0,1,1,1


In [19]:
train_x, test_x, train_y, test_y = train_test_split(X, y, random_state = 101, test_size = 0.2)

In [20]:
##decision tree classification

dt = DecisionTreeClassifier()
#fit the model with train data
dt.fit(train_x, train_y)
#predict the values using test data
pred_dt = dt.predict(test_x)

print('Predicted: ', pred_dt[:10])

print('Actual: ', test_y[:10].to_list())

Predicted:  [0 1 1 1 0 1 0 0 0 1]
Actual:  [0, 1, 1, 1, 0, 1, 1, 0, 0, 1]


In [21]:
##random forest classification

rf = RandomForestClassifier()
rf.fit(train_x, train_y)
pred_rd = rf.predict(test_x)

print('Predicted: ', pred_rd[:10])
print('Actual: ', test_y[:10].to_list())

Predicted:  [0 1 1 1 0 1 1 0 0 1]
Actual:  [0, 1, 1, 1, 0, 1, 1, 0, 0, 1]


In [22]:
#logistic regression

log_reg = LogisticRegression()
log_reg.fit(train_x, train_y)
log_reg_pred = log_reg.predict(test_x)

print('Predicted: ', log_reg_pred[:10])
print('Actual: ', test_y[:10].to_list())

Predicted:  [0 1 1 1 0 1 1 0 0 1]
Actual:  [0, 1, 1, 1, 0, 1, 1, 0, 0, 1]


In [23]:
#K-Nearest-Neighbors classification

knn = KNeighborsClassifier(n_neighbors = 3, metric = 'minkowski')
knn.fit(train_x, train_y)
knn_pred = knn.predict(test_x)

print('Predicted: ', knn_pred[:10])
print('Actual: ', test_y[:10].to_list())

Predicted:  [0 1 1 1 0 1 1 0 0 1]
Actual:  [0, 1, 1, 1, 0, 1, 1, 0, 0, 1]


In [24]:
#xgb

xgb = XGBClassifier(n_estimators = 1000, learning_rate = 0.05)
xgb.fit(train_x, train_y, early_stopping_rounds = 5, eval_set = [(test_x, test_y)], verbose = False)
xgb_pred = xgb.predict(test_x)

print('Predicted: ', xgb_pred[:10])
print('Actual: ', test_y[:10].to_list())

Predicted:  [0 1 1 1 0 1 1 0 0 1]
Actual:  [0, 1, 1, 1, 0, 1, 1, 0, 0, 1]


In [25]:
cm_rf = confusion_matrix(test_y, pred_rd)
cm_dt = confusion_matrix(test_y, pred_dt)
cm_log_reg = confusion_matrix(test_y, log_reg_pred)
cm_knn = confusion_matrix(test_y, knn_pred)
cm_xgb = confusion_matrix(test_y, xgb_pred)

print('rf:\n', cm_rf, '\n\n', 'dt:\n', cm_dt, '\n\n', 'log_reg:\n', cm_log_reg, '\n\n', 'knn:\n', cm_knn, '\n\n', 'xgb:\n', cm_xgb)

rf:
 [[24  7]
 [ 1 29]] 

 dt:
 [[23  8]
 [ 2 28]] 

 log_reg:
 [[24  7]
 [ 2 28]] 

 knn:
 [[20 11]
 [ 7 23]] 

 xgb:
 [[22  9]
 [ 2 28]]


In [28]:
#printing accuracy scores
acc_rf = accuracy_score(test_y, pred_rd)
acc_dt = accuracy_score(test_y, pred_dt)
acc_log_reg = accuracy_score(test_y, log_reg_pred)
acc_knn = accuracy_score(test_y, knn_pred)
acc_xgb = accuracy_score(test_y, xgb_pred)

print('rf:\n', acc_rf, '\n\n', 'dt:\n', acc_dt, '\n\n', 'log_reg:\n', acc_log_reg, '\n\n', 'knn:\n', acc_knn, '\n\n', 'xgb:\n', acc_xgb)

rf:
 0.8688524590163934 

 dt:
 0.8360655737704918 

 log_reg:
 0.8524590163934426 

 knn:
 0.7049180327868853 

 xgb:
 0.819672131147541
