<h1>Data Processing</h1>
<h3>By Abby Roberts</h3>

<h2>Import Libraries</h2>

In [1]:
import numpy as np
import pandas as pd

<h2>Import Data</h2>

In [2]:
filepath = '../../Data/original/heart-disease.csv'
df = pd.read_csv(filepath)

In [3]:
"""
Columns: 
age, 
sex, 
chest pain (cp), 
resting blood pressure (trestbps), 
cholesterol (chol), 
fasting blood sugar (fbs), 
resting electrocardiographic result (restecg), 
maximum heart rate (thalach), 
exercise-induced angina (exang), 
ST depression (old peak), 
slope of peak ST segment (slope), 
number of major vessels (ca), 
thallium stress result (thal),
diagnosis of heart disease (num)
"""
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3,2
2,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0,3,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,3,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    object 
 12  thal      303 non-null    object 
 13  num       303 non-null    int64  
dtypes: float64(1), int64(11), object(2)
memory usage: 33.3+ KB


<h2>Clean Data</h2>
<h3>Add target</h3>

In [5]:
# add target variable - if disease is greater than 1, has disease
df['target'] = np.where(df['num']==0, 0, 1)

In [6]:
# check value counts - should have 165 CVD, 138 normal
df['target'].value_counts()

target
0    164
1    139
Name: count, dtype: int64

<h3>Remove bad entries</h3>

In [7]:
df['ca'].value_counts()

ca
0    176
1     65
2     38
3     20
?      4
Name: count, dtype: int64

In [8]:
# remove ? rows
df = df[df['ca']!='?'].copy()

In [9]:
df['thal'].value_counts()

thal
3    164
7    115
6     18
?      2
Name: count, dtype: int64

In [10]:
# remove ? rows
df = df[df['thal']!='?'].copy()

<h3>Add dummies</h3>

In [11]:
# make sex into actual names for easier understanding
df['sex'] = np.where(df['sex']==1, 'male', 'female')

# add dummies
df_sex_dummies = pd.get_dummies(df['sex'], prefix='sex')
df = pd.concat([df, df_sex_dummies], axis=1)

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num,target,sex_female,sex_male
0,63,male,1,145,233,1,2,150,0,2.3,3,0,6,0,0,False,True
1,67,male,4,160,286,0,2,108,1,1.5,2,3,3,2,1,False,True
2,67,male,4,120,229,0,2,129,1,2.6,2,2,7,1,1,False,True
3,37,male,3,130,250,0,0,187,0,3.5,3,0,3,0,0,False,True
4,41,female,2,130,204,0,2,172,0,1.4,1,0,3,0,0,True,False


In [12]:
# make chest pain into actual diagnosis names for easier understanding
df['cp'] = np.select([df['cp']==1, df['cp']==2, df['cp']==3, df['cp']==4],
                     ['typical angina', 'atypical angina', 'non-anginal pain', 'asymptomatic'])

# add dummies
df_cp_dummies = pd.get_dummies(df['cp'], prefix='cp')
df = pd.concat([df, df_cp_dummies], axis=1)

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,...,ca,thal,num,target,sex_female,sex_male,cp_asymptomatic,cp_atypical angina,cp_non-anginal pain,cp_typical angina
0,63,male,typical angina,145,233,1,2,150,0,2.3,...,0,6,0,0,False,True,False,False,False,True
1,67,male,asymptomatic,160,286,0,2,108,1,1.5,...,3,3,2,1,False,True,True,False,False,False
2,67,male,asymptomatic,120,229,0,2,129,1,2.6,...,2,7,1,1,False,True,True,False,False,False
3,37,male,non-anginal pain,130,250,0,0,187,0,3.5,...,0,3,0,0,False,True,False,False,True,False
4,41,female,atypical angina,130,204,0,2,172,0,1.4,...,0,3,0,0,True,False,False,True,False,False


In [13]:
# add marker for bad blood pressure (>=130)
df['trestbps_concerning'] = np.where(df['chol']>=130, 1, 0)

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,...,thal,num,target,sex_female,sex_male,cp_asymptomatic,cp_atypical angina,cp_non-anginal pain,cp_typical angina,trestbps_concerning
0,63,male,typical angina,145,233,1,2,150,0,2.3,...,6,0,0,False,True,False,False,False,True,1
1,67,male,asymptomatic,160,286,0,2,108,1,1.5,...,3,2,1,False,True,True,False,False,False,1
2,67,male,asymptomatic,120,229,0,2,129,1,2.6,...,7,1,1,False,True,True,False,False,False,1
3,37,male,non-anginal pain,130,250,0,0,187,0,3.5,...,3,0,0,False,True,False,False,True,False,1
4,41,female,atypical angina,130,204,0,2,172,0,1.4,...,3,0,0,True,False,False,True,False,False,1


In [14]:
# add marker for bad cholesterol (>=200)
df['chol_concerning'] = np.where(df['chol']>=200, 1, 0)

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,...,num,target,sex_female,sex_male,cp_asymptomatic,cp_atypical angina,cp_non-anginal pain,cp_typical angina,trestbps_concerning,chol_concerning
0,63,male,typical angina,145,233,1,2,150,0,2.3,...,0,0,False,True,False,False,False,True,1,1
1,67,male,asymptomatic,160,286,0,2,108,1,1.5,...,2,1,False,True,True,False,False,False,1,1
2,67,male,asymptomatic,120,229,0,2,129,1,2.6,...,1,1,False,True,True,False,False,False,1,1
3,37,male,non-anginal pain,130,250,0,0,187,0,3.5,...,0,0,False,True,False,False,True,False,1,1
4,41,female,atypical angina,130,204,0,2,172,0,1.4,...,0,0,True,False,False,True,False,False,1,1


In [15]:
# make resting ecg into actual diagnosis names for easier understanding
df['restecg'] = np.select([df['restecg']==0, df['restecg']==1, df['restecg']==2],
                          ['no issues', 'non-normal heart beat', 'enlarged left ventricular'])

# add dummies
df_restecg_dummies = pd.get_dummies(df['restecg'], prefix='restecg')
df = pd.concat([df, df_restecg_dummies], axis=1)

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,...,sex_male,cp_asymptomatic,cp_atypical angina,cp_non-anginal pain,cp_typical angina,trestbps_concerning,chol_concerning,restecg_enlarged left ventricular,restecg_no issues,restecg_non-normal heart beat
0,63,male,typical angina,145,233,1,enlarged left ventricular,150,0,2.3,...,True,False,False,False,True,1,1,True,False,False
1,67,male,asymptomatic,160,286,0,enlarged left ventricular,108,1,1.5,...,True,True,False,False,False,1,1,True,False,False
2,67,male,asymptomatic,120,229,0,enlarged left ventricular,129,1,2.6,...,True,True,False,False,False,1,1,True,False,False
3,37,male,non-anginal pain,130,250,0,no issues,187,0,3.5,...,True,False,False,True,False,1,1,False,True,False
4,41,female,atypical angina,130,204,0,enlarged left ventricular,172,0,1.4,...,False,False,True,False,False,1,1,True,False,False


In [16]:
# add marker for bad maximum heart rate (>=140)
df['thalach_concerning'] = np.where(df['thalach']>=140, 1, 0)

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,...,cp_asymptomatic,cp_atypical angina,cp_non-anginal pain,cp_typical angina,trestbps_concerning,chol_concerning,restecg_enlarged left ventricular,restecg_no issues,restecg_non-normal heart beat,thalach_concerning
0,63,male,typical angina,145,233,1,enlarged left ventricular,150,0,2.3,...,False,False,False,True,1,1,True,False,False,1
1,67,male,asymptomatic,160,286,0,enlarged left ventricular,108,1,1.5,...,True,False,False,False,1,1,True,False,False,0
2,67,male,asymptomatic,120,229,0,enlarged left ventricular,129,1,2.6,...,True,False,False,False,1,1,True,False,False,0
3,37,male,non-anginal pain,130,250,0,no issues,187,0,3.5,...,False,False,True,False,1,1,False,True,False,1
4,41,female,atypical angina,130,204,0,enlarged left ventricular,172,0,1.4,...,False,True,False,False,1,1,True,False,False,1


In [17]:
# make slope into actual diagnosis names for easier understanding
df['slope'] = np.select([df['slope']==1, df['slope']==2],
                        ['typical', 'unhealthy'], default='upsloping')

# add dummies
df_slope_dummies = pd.get_dummies(df['slope'], prefix='slope')
df = pd.concat([df, df_slope_dummies], axis=1)

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,...,cp_typical angina,trestbps_concerning,chol_concerning,restecg_enlarged left ventricular,restecg_no issues,restecg_non-normal heart beat,thalach_concerning,slope_typical,slope_unhealthy,slope_upsloping
0,63,male,typical angina,145,233,1,enlarged left ventricular,150,0,2.3,...,True,1,1,True,False,False,1,False,False,True
1,67,male,asymptomatic,160,286,0,enlarged left ventricular,108,1,1.5,...,False,1,1,True,False,False,0,False,True,False
2,67,male,asymptomatic,120,229,0,enlarged left ventricular,129,1,2.6,...,False,1,1,True,False,False,0,False,True,False
3,37,male,non-anginal pain,130,250,0,no issues,187,0,3.5,...,False,1,1,False,True,False,1,False,False,True
4,41,female,atypical angina,130,204,0,enlarged left ventricular,172,0,1.4,...,False,1,1,True,False,False,1,True,False,False


In [18]:
# add marker for number of major vessels colored by flouroscopy (==0)
df['ca_concerning'] = np.where(df['ca']=='0', 1, 0)

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,...,trestbps_concerning,chol_concerning,restecg_enlarged left ventricular,restecg_no issues,restecg_non-normal heart beat,thalach_concerning,slope_typical,slope_unhealthy,slope_upsloping,ca_concerning
0,63,male,typical angina,145,233,1,enlarged left ventricular,150,0,2.3,...,1,1,True,False,False,1,False,False,True,1
1,67,male,asymptomatic,160,286,0,enlarged left ventricular,108,1,1.5,...,1,1,True,False,False,0,False,True,False,0
2,67,male,asymptomatic,120,229,0,enlarged left ventricular,129,1,2.6,...,1,1,True,False,False,0,False,True,False,0
3,37,male,non-anginal pain,130,250,0,no issues,187,0,3.5,...,1,1,False,True,False,1,False,False,True,1
4,41,female,atypical angina,130,204,0,enlarged left ventricular,172,0,1.4,...,1,1,True,False,False,1,True,False,False,1


In [19]:
# make thalium stress level into actual diagnosis names for easier understanding
df['thal'] = np.select([df['thal']=='3', df['thal']=='6', df['thal']=='7'],
                        ['normal', 'fixed defect', 'reversable defect'])

# add dummies
df_thal_dummies = pd.get_dummies(df['thal'], prefix='thal')
df = pd.concat([df, df_thal_dummies], axis=1)

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,...,restecg_no issues,restecg_non-normal heart beat,thalach_concerning,slope_typical,slope_unhealthy,slope_upsloping,ca_concerning,thal_fixed defect,thal_normal,thal_reversable defect
0,63,male,typical angina,145,233,1,enlarged left ventricular,150,0,2.3,...,False,False,1,False,False,True,1,True,False,False
1,67,male,asymptomatic,160,286,0,enlarged left ventricular,108,1,1.5,...,False,False,0,False,True,False,0,False,True,False
2,67,male,asymptomatic,120,229,0,enlarged left ventricular,129,1,2.6,...,False,False,0,False,True,False,0,False,False,True
3,37,male,non-anginal pain,130,250,0,no issues,187,0,3.5,...,True,False,1,False,False,True,1,False,True,False
4,41,female,atypical angina,130,204,0,enlarged left ventricular,172,0,1.4,...,False,False,1,True,False,False,1,False,True,False


<h3>Update Data Types</h3>

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 297 entries, 0 to 301
Data columns (total 34 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   age                                297 non-null    int64  
 1   sex                                297 non-null    object 
 2   cp                                 297 non-null    object 
 3   trestbps                           297 non-null    int64  
 4   chol                               297 non-null    int64  
 5   fbs                                297 non-null    int64  
 6   restecg                            297 non-null    object 
 7   thalach                            297 non-null    int64  
 8   exang                              297 non-null    int64  
 9   oldpeak                            297 non-null    float64
 10  slope                              297 non-null    object 
 11  ca                                 297 non-null    object 
 12 

In [21]:
# change ca to int
df['ca'] = df['ca'].astype(int)

In [22]:
# drop text columns
df.drop(columns=['sex', 'cp', 'restecg', 'slope', 'thal'], inplace=True)

In [23]:
# drop heart disease rating - only using target
df.drop(columns=['num'], inplace=True)

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 297 entries, 0 to 301
Data columns (total 28 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   age                                297 non-null    int64  
 1   trestbps                           297 non-null    int64  
 2   chol                               297 non-null    int64  
 3   fbs                                297 non-null    int64  
 4   thalach                            297 non-null    int64  
 5   exang                              297 non-null    int64  
 6   oldpeak                            297 non-null    float64
 7   ca                                 297 non-null    int32  
 8   target                             297 non-null    int32  
 9   sex_female                         297 non-null    bool   
 10  sex_male                           297 non-null    bool   
 11  cp_asymptomatic                    297 non-null    bool   
 12 

In [25]:
df.head()

Unnamed: 0,age,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target,sex_female,...,restecg_no issues,restecg_non-normal heart beat,thalach_concerning,slope_typical,slope_unhealthy,slope_upsloping,ca_concerning,thal_fixed defect,thal_normal,thal_reversable defect
0,63,145,233,1,150,0,2.3,0,0,False,...,False,False,1,False,False,True,1,True,False,False
1,67,160,286,0,108,1,1.5,3,1,False,...,False,False,0,False,True,False,0,False,True,False
2,67,120,229,0,129,1,2.6,2,1,False,...,False,False,0,False,True,False,0,False,False,True
3,37,130,250,0,187,0,3.5,0,0,False,...,True,False,1,False,False,True,1,False,True,False
4,41,130,204,0,172,0,1.4,0,0,True,...,False,False,1,True,False,False,1,False,True,False


<h2>Save the dataframe</h2>

In [26]:
# save dataframe to csv
df.to_csv('../../Data/processed/df_preprocessed.csv', index=False)