1. Perform Feature Preprocessing on the Dataset
2. Convert Categorical Features from the Original Dataset into One-Hot Encoding for Easier Processing by Machine Learning Models

# Import packages and load the dataset

In [1]:
import warnings
# Blanket filter: suppresses every warning category for the rest of the session,
# which includes pandas' deprecation and SettingWithCopy warnings.
# NOTE(review): consider narrowing this to specific categories so genuine
# problems in later cells stay visible.
warnings.filterwarnings("ignore")
import pandas as pd

In [2]:
# Read the dataset from heart.csv (relative path, resolved against the
# current working directory) into a DataFrame.
df = pd.read_csv("heart.csv")
# Display (n_rows, n_columns) as a quick sanity check on the load.
df.shape

(1025, 14)

In [3]:
# Preview the first five rows to confirm the file parsed into the expected columns.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


# Check the Data Type of Each Feature Column

In [4]:
# Inspect the dtype of every column; at this point the categorical features
# are still stored as integer codes.
df.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

# Rename abbreviated column names to full feature names

In [5]:
# Replace the abbreviated headers with descriptive names. The assignment is
# positional, so the list order must match the column order of the loaded
# frame; the original abbreviation is noted next to each new name.
df.columns = [
    'age',                           # age
    'sex',                           # sex
    'chest_pain_type',               # cp
    'resting_blood_pressure',        # trestbps
    'cholestoral',                   # chol
    'fasting_blood_sugar',           # fbs
    'resting_electrocardiographic',  # restecg
    'maximum_heart_rate',            # thalach
    'exercise_induced_angina',       # exang
    'ST_depression',                 # oldpeak
    'ST_slope',                      # slope
    'major_vessels_num',             # ca
    'thalassemia',                   # thal
    'target',                        # target
]
df.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholestoral,fasting_blood_sugar,resting_electrocardiographic,maximum_heart_rate,exercise_induced_angina,ST_depression,ST_slope,major_vessels_num,thalassemia,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


# Convert Categorical Features from Integer Encoding to Their Corresponding Strings

In [6]:
# Decode the integer-coded categorical columns into human-readable labels so
# that a subsequent one-hot encoding yields self-describing column names.
#
# The whole mapping is applied in one DataFrame.replace call with a nested
# {column: {code: label}} dict instead of chained indexing
# (df[col][mask] = value). Chained assignment writes through an intermediate
# object, triggers SettingWithCopyWarning, and silently does nothing when
# pandas Copy-on-Write is active. Codes that have no entry in a column's
# mapping are left unchanged, and re-running the cell is a no-op because the
# integer keys no longer match the already-decoded strings.
df = df.replace({
    'sex': {
        0: 'female',
        1: 'male',
    },
    'chest_pain_type': {
        0: 'typical angina',
        1: 'atypical angina',
        2: 'non-anginal pain',
        3: 'asymptomatic',
    },
    'fasting_blood_sugar': {
        0: 'lower than 120mg/ml',
        1: 'greater than 120mg/ml',
    },
    'resting_electrocardiographic': {
        0: 'normal',
        1: 'ST-T wave abnormality',
        2: 'left ventricular hypertrophy',
    },
    'exercise_induced_angina': {
        0: 'no',
        1: 'yes',
    },
    'ST_slope': {
        0: 'upsloping',
        1: 'flat',
        2: 'downsloping',
    },
    'thalassemia': {
        0: 'unknown',
        1: 'normal',
        2: 'fixed defect',
        3: 'reversable defect',
    },
})

df.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholestoral,fasting_blood_sugar,resting_electrocardiographic,maximum_heart_rate,exercise_induced_angina,ST_depression,ST_slope,major_vessels_num,thalassemia,target
0,52,male,typical angina,125,212,lower than 120mg/ml,ST-T wave abnormality,168,no,1.0,downsloping,2,reversable defect,0
1,53,male,typical angina,140,203,greater than 120mg/ml,normal,155,yes,3.1,upsloping,0,reversable defect,0
2,70,male,typical angina,145,174,lower than 120mg/ml,ST-T wave abnormality,125,yes,2.6,upsloping,0,reversable defect,0
3,61,male,typical angina,148,203,lower than 120mg/ml,ST-T wave abnormality,161,no,0.0,downsloping,1,reversable defect,0
4,62,female,typical angina,138,294,greater than 120mg/ml,ST-T wave abnormality,106,no,1.9,flat,3,fixed defect,0


In [7]:
# List the column labels to confirm the descriptive names are in place.
df.columns

Index(['age', 'sex', 'chest_pain_type', 'resting_blood_pressure',
       'cholestoral', 'fasting_blood_sugar', 'resting_electrocardiographic',
       'maximum_heart_rate', 'exercise_induced_angina', 'ST_depression',
       'ST_slope', 'major_vessels_num', 'thalassemia', 'target'],
      dtype='object')

In [8]:
# Re-check dtypes: the decoded categorical columns should now be reported as
# object (string labels), while the numeric features keep their numeric dtypes.
df.dtypes

age                               int64
sex                              object
chest_pain_type                  object
resting_blood_pressure            int64
cholestoral                       int64
fasting_blood_sugar              object
resting_electrocardiographic     object
maximum_heart_rate                int64
exercise_induced_angina          object
ST_depression                   float64
ST_slope                         object
major_vessels_num                 int64
thalassemia                      object
target                            int64
dtype: object

# Convert categorical feature values to one-hot encoding

In [9]:
# One-hot encode the frame. No `columns` argument is given, so get_dummies
# expands the string-valued (object) columns into one indicator column per
# label, named "<column>_<label>", and passes the numeric columns through.
df = pd.get_dummies(df)
# Show the resulting column labels (numeric columns first, then indicators).
df.columns

Index(['age', 'resting_blood_pressure', 'cholestoral', 'maximum_heart_rate',
       'ST_depression', 'major_vessels_num', 'target', 'sex_female',
       'sex_male', 'chest_pain_type_asymptomatic',
       'chest_pain_type_atypical angina', 'chest_pain_type_non-anginal pain',
       'chest_pain_type_typical angina',
       'fasting_blood_sugar_greater than 120mg/ml',
       'fasting_blood_sugar_lower than 120mg/ml',
       'resting_electrocardiographic_ST-T wave abnormality',
       'resting_electrocardiographic_left ventricular hypertrophy',
       'resting_electrocardiographic_normal', 'exercise_induced_angina_no',
       'exercise_induced_angina_yes', 'ST_slope_downsloping', 'ST_slope_flat',
       'ST_slope_upsloping', 'thalassemia_fixed defect', 'thalassemia_normal',
       'thalassemia_reversable defect', 'thalassemia_unknown'],
      dtype='object')

In [10]:
# Preview the first five rows of the one-hot encoded frame.
df.head()

Unnamed: 0,age,resting_blood_pressure,cholestoral,maximum_heart_rate,ST_depression,major_vessels_num,target,sex_female,sex_male,chest_pain_type_asymptomatic,...,resting_electrocardiographic_normal,exercise_induced_angina_no,exercise_induced_angina_yes,ST_slope_downsloping,ST_slope_flat,ST_slope_upsloping,thalassemia_fixed defect,thalassemia_normal,thalassemia_reversable defect,thalassemia_unknown
0,52,125,212,168,1.0,2,0,False,True,False,...,False,True,False,True,False,False,False,False,True,False
1,53,140,203,155,3.1,0,0,False,True,False,...,True,False,True,False,False,True,False,False,True,False
2,70,145,174,125,2.6,0,0,False,True,False,...,False,False,True,False,False,True,False,False,True,False
3,61,148,203,161,0.0,1,0,False,True,False,...,False,True,False,True,False,False,False,False,True,False
4,62,138,294,106,1.9,3,0,True,False,False,...,False,True,False,False,True,False,True,False,False,False


In [11]:
# Row count should be unchanged; the column count grows because each
# categorical column has been expanded into one indicator per label.
df.shape

(1025, 27)

In [12]:
# Show the first row as a Series so every column is listed vertically; the
# wide head() display elides the middle columns.
df.iloc[0]

age                                                             52
resting_blood_pressure                                         125
cholestoral                                                    212
maximum_heart_rate                                             168
ST_depression                                                  1.0
major_vessels_num                                                2
target                                                           0
sex_female                                                   False
sex_male                                                      True
chest_pain_type_asymptomatic                                 False
chest_pain_type_atypical angina                              False
chest_pain_type_non-anginal pain                             False
chest_pain_type_typical angina                                True
fasting_blood_sugar_greater than 120mg/ml                    False
fasting_blood_sugar_lower than 120mg/ml                       

# Export the Processed Dataset to a CSV File

In [13]:
# Write the processed frame to process_heart.csv in the working directory.
# index=False omits the row index, so the file holds only the feature and
# target columns.
df.to_csv('process_heart.csv', index=False)