<h1 align = 'center'> Predictive Heart Disease Modelling using EMR Data</h1>

<b>Avinash Pasupulate</b>
<br><a href="mailto:avinash.pasupulate@gmail.com">avinash.pasupulate@gmail.com</a>
<br><a href="https://www.linkedin.com/in/avinashpasupulate/">LinkedIn</a>

In [331]:
#importing required packages
import re
import os
import copy
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [332]:
#list the contents of the data folder that sits in the current working directory
cwd = os.getcwd()
data_dir = cwd+r'/data'
os.listdir(data_dir)

['cleveland.data',
 'Index.1',
 'cleve.mod',
 'heart-disease.expense',
 'heart-disease.delay',
 'bak',
 'hungarian.data',
 'new.data',
 'switzerland.data',
 'ask-detrano',
 'heart-disease.cost',
 'long-beach-va.data',
 'processed.cleveland.data',
 'reprocessed.hungarian.data',
 'processed.switzerland.data',
 'heart-disease.README',
 'heart-disease.group',
 'Index',
 'heart-disease.names',
 'processed.hungarian.data',
 'processed.va.data']

In [333]:
#the model is built on the Cleveland dataset (cleve.mod)
#lines starting with '%' carry the data description; echo them without the trailing newline
with open(cwd+r'/data/cleve.mod', 'r') as f:
    lines = f.readlines()

for description_line in (ln for ln in lines if ln.startswith('%')):
    print(description_line.rstrip('\n'))

% John Gennari
% 3/13/90
%
% This is Dr. Detrano's database modified to be a real MIXED dataset.
%
% Attributes: 8 symbolic, 6 numeric.
%  Age; sex; chest pain type (angina, abnang, notang, asympt)
%  Trestbps (resting blood pres); cholesteral; fasting blood sugar < 120
%  (true or false); resting ecg (norm, abn, hyper); max heart rate; 
%  exercise induced angina (true or false); oldpeak; slope (up, flat, down)
%  number of vessels colored (???); thal (norm, fixed, rever). Finally, the
%  class is either healthy (buff) or with heart-disease (sick).
%
% Original atts: 
%   age; sex (1,0); cp (1-4); trestbps; chol; fbs (1,0); restecg (0,1,2); 
%   thalach; exang (1,0); oldpeak; slope (1,2,3); ca; thal (3,6,7);
%   class att: 0 is healthy, 1,2,3,4 is sick.


In [334]:
#extract the data rows from the text file
#l[0] holds the space-separated column names; every later entry is one line of the file
l = ['age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal target target1']
with open(cwd+r'/data/cleve.mod', 'r') as f:
    lines = f.readlines()

#skip the '%' description lines and collapse each run of whitespace to a single space
#the pattern is a raw string: '\s' inside a plain literal is an invalid escape sequence
#blank lines are kept as empty entries, exactly as before - the old len(line)==0 test
#never matched because readlines() keeps the trailing newline - and the rows are
#trimmed by position in a later cell
l.extend(re.sub(r'\s+', ' ', line.rstrip('\n'))
         for line in lines
         if not line.startswith('%'))

In [335]:
#split every space-separated entry into its fields, one row per entry
df = pd.DataFrame([entry.split(' ') for entry in l])
#use the first row (the header entry) as column labels; the row itself stays in the frame for now
df.columns = df.iloc[0]

In [336]:
#discard the first two rows (the header row and the row right after it) and the final row,
#then renumber the remaining rows from 0
#NOTE(review): rows are removed by position - presumably the 2nd and last entries are non-data lines; confirm against the file
df = df.iloc[2:-1].reset_index(drop = True)

#dropping target1 column
df = df.drop('target1', axis = 1)

In [337]:
#preview the first rows of the trimmed frame
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,male,angina,145.0,233.0,true,hyp,150.0,fal,2.3,down,0.0,fix,buff
1,67.0,male,asympt,160.0,286.0,fal,hyp,108.0,true,1.5,flat,3.0,norm,sick
2,67.0,male,asympt,120.0,229.0,fal,hyp,129.0,true,2.6,flat,2.0,rev,sick
3,37.0,male,notang,130.0,250.0,fal,norm,187.0,fal,3.5,down,0.0,norm,buff
4,41.0,fem,abnang,130.0,204.0,fal,hyp,172.0,fal,1.4,up,0.0,norm,buff


In [338]:
#mark '?' placeholders as missing, then give the columns proper dtypes
df.replace('?', np.nan, inplace = True)

#measurement columns become floats (errors = 'ignore' leaves a column as-is if the cast fails)
for numeric_col in ('age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca'):
    df[numeric_col] = df[numeric_col].astype(float, errors = 'ignore')

#label columns become integer category codes
#codes follow the sorted category order: buff -> 0 / sick -> 1, fem -> 0 / male -> 1, fal -> 0 / true -> 1
#NOTE(review): a missing value in one of these columns would get code -1 - confirm none are present
for coded_col in ('target', 'sex', 'fbs'):
    as_category = df[coded_col].astype('category', errors = 'ignore')
    df[coded_col] = as_category.cat.codes.astype(int)

In [339]:
#check the column dtypes after the conversions above
df.dtypes

0
age         float64
sex           int64
cp           object
trestbps    float64
chol        float64
fbs           int64
restecg      object
thalach     float64
exang        object
oldpeak     float64
slope        object
ca          float64
thal         object
target        int64
dtype: object

In [340]:
#converting to dummy data: the remaining object (string) columns are one-hot encoded,
#and drop_first = True omits the first level of each so the dummies are not redundant
#NOTE(review): the dummy dtype depends on the pandas version (uint8 in the output shown here, bool in pandas >= 2.0) - confirm
#NOTE(review): a missing value in an encoded column presumably yields all-zero dummies, same as the dropped level - confirm
mod_df = pd.get_dummies(df, drop_first = True)

In [341]:
#list the column names produced by the encoding
mod_df.columns

Index(['age', 'sex', 'trestbps', 'chol', 'fbs', 'thalach', 'oldpeak', 'ca',
       'target', 'cp_angina', 'cp_asympt', 'cp_notang', 'restecg_hyp',
       'restecg_norm', 'exang_true', 'slope_flat', 'slope_up', 'thal_norm',
       'thal_rev'],
      dtype='object')

In [342]:
#preview the first rows of the encoded frame
mod_df.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,oldpeak,ca,target,cp_angina,cp_asympt,cp_notang,restecg_hyp,restecg_norm,exang_true,slope_flat,slope_up,thal_norm,thal_rev
0,63.0,1,145.0,233.0,1,150.0,2.3,0.0,0,1,0,0,1,0,0,0,0,0,0
1,67.0,1,160.0,286.0,0,108.0,1.5,3.0,1,0,1,0,1,0,1,1,0,1,0
2,67.0,1,120.0,229.0,0,129.0,2.6,2.0,1,0,1,0,1,0,1,1,0,0,1
3,37.0,1,130.0,250.0,0,187.0,3.5,0.0,0,0,0,1,0,1,0,0,0,1,0
4,41.0,0,130.0,204.0,0,172.0,1.4,0.0,0,0,0,0,1,0,0,0,1,1,0


In [343]:
#replacing null values in the data frame
#a missing 'ca' is filled with the rounded mean 'ca' of the rows in the same target class
#the mask looks at 'ca' only: with the previous any-null-in-row mask, a null in some
#other column would have overwritten a perfectly valid 'ca' value
#NOTE(review): the fill value depends on the target label and is computed before the
#train/test split, so label information flows into a feature - confirm this is intended
ca_missing = mod_df['ca'].isnull()
for target_class in (0, 1):
    in_class = mod_df['target'] == target_class
    to_fill = ca_missing & in_class
    #only compute the class mean when there is something to fill
    if to_fill.any():
        mod_df.loc[to_fill, 'ca'] = round(mod_df.loc[in_class, 'ca'].mean())


In [344]:
#check the dtypes of the encoded frame
mod_df.dtypes

age             float64
sex               int64
trestbps        float64
chol            float64
fbs               int64
thalach         float64
oldpeak         float64
ca              float64
target            int64
cp_angina         uint8
cp_asympt         uint8
cp_notang         uint8
restecg_hyp       uint8
restecg_norm      uint8
exang_true        uint8
slope_flat        uint8
slope_up          uint8
thal_norm         uint8
thal_rev          uint8
dtype: object

In [345]:
#hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable
#features are every column except 'target', labels are the 'target' column
x_train, x_test, y_train, y_test = train_test_split(
    mod_df.drop('target', axis = 1),
    mod_df['target'],
    test_size = 0.2,
    random_state = 28,
)