In [1]:
# Import modules
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the heart attack prediction dataset, using 'Patient ID' as the row index
dataset_path = Path("Resources/heart_attack_prediction_dataset.csv")
heart_attack_prediction_df = pd.read_csv(dataset_path, index_col="Patient ID")

# Preview the first rows of the loaded DataFrame
heart_attack_prediction_df.head()

Unnamed: 0_level_0,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BMW7812,67,Male,208,158/88,72,0,0,1,0,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
CZE1114,21,Male,389,165/93,98,1,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
BNI9906,21,Female,324,174/99,72,1,0,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
JLN3497,84,Male,383,163/100,73,1,1,1,0,1,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
GFO8847,66,Male,318,91/88,93,1,1,1,1,0,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0


In [3]:
# Review data types and non-null counts per column with .info()
# Note: .info() only prints a summary; it does not itself validate or enforce anything
heart_attack_prediction_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8763 entries, BMW7812 to ZWN9666
Data columns (total 25 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Age                              8763 non-null   int64  
 1   Sex                              8763 non-null   object 
 2   Cholesterol                      8763 non-null   int64  
 3   Blood Pressure                   8763 non-null   object 
 4   Heart Rate                       8763 non-null   int64  
 5   Diabetes                         8763 non-null   int64  
 6   Family History                   8763 non-null   int64  
 7   Smoking                          8763 non-null   int64  
 8   Obesity                          8763 non-null   int64  
 9   Alcohol Consumption              8763 non-null   int64  
 10  Exercise Hours Per Week          8763 non-null   float64
 11  Diet                             8763 non-null   object 
 12  Previous Heart P

In [4]:
# Check for duplicate rows
# DataFrame.duplicated() compares column values only; the 'Patient ID' index is not part of the comparison
duplicate_mask = heart_attack_prediction_df.duplicated()
duplicate = heart_attack_prediction_df.loc[duplicate_mask]
duplicate

Unnamed: 0_level_0,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [5]:
# Drop unnecessary columns
# Drop 'Country' column, as one-hot encoding will result in 20 additional features which are too granular to add predictive value to model
# NOTE(review): 'Blood Pressure' holds "systolic/diastolic" strings (e.g. "158/88") and is dropped rather than parsed into numeric features - confirm this is intended
# Reassign instead of using inplace=True, and ignore labels that are already gone, so re-running this cell does not raise a KeyError
heart_attack_prediction_df = heart_attack_prediction_df.drop(
    columns=['Blood Pressure', 'Country'],
    errors='ignore'
)
heart_attack_prediction_df

Unnamed: 0_level_0,Age,Sex,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,...,Stress Level,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Continent,Hemisphere,Heart Attack Risk
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BMW7812,67,Male,208,72,0,0,1,0,0,4.168189,...,9,6.615001,261404,31.251233,286,0,6,South America,Southern Hemisphere,0
CZE1114,21,Male,389,98,1,1,1,1,1,1.813242,...,1,4.963459,285768,27.194973,235,1,7,North America,Northern Hemisphere,0
BNI9906,21,Female,324,72,1,0,0,0,0,2.078353,...,9,9.463426,235282,28.176571,587,4,4,Europe,Northern Hemisphere,0
JLN3497,84,Male,383,73,1,1,1,0,1,9.828130,...,9,7.648981,125640,36.464704,378,3,4,North America,Northern Hemisphere,0
GFO8847,66,Male,318,93,1,1,1,1,0,5.804299,...,6,1.514821,160555,21.809144,231,1,5,Asia,Northern Hemisphere,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MSV9918,60,Male,121,61,1,1,1,0,1,7.917342,...,8,10.806373,235420,19.655895,67,7,7,Asia,Northern Hemisphere,0
QSV6764,28,Female,120,73,1,0,0,1,0,16.558426,...,8,3.833038,217881,23.993866,617,4,9,North America,Northern Hemisphere,0
XKA5925,47,Male,250,105,0,1,1,1,1,3.148438,...,5,2.375214,36998,35.406146,527,4,4,South America,Southern Hemisphere,1
EPE6801,36,Male,178,60,1,0,1,0,0,3.789950,...,5,0.029104,209943,27.294020,114,2,8,South America,Southern Hemisphere,0


In [6]:
# Transform categorical/nominal variables/columns with get_dummies
# dtype=int pins the indicator columns to 0/1 integers: since pandas 2.0 the default dtype is bool,
# which would otherwise display and export the dummy columns as True/False
df = pd.get_dummies(
    heart_attack_prediction_df,
    columns=['Sex', 'Continent', 'Hemisphere'],
    dtype=int
)
df.head()

Unnamed: 0_level_0,Age,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,Diet,...,Sex_Female,Sex_Male,Continent_Africa,Continent_Asia,Continent_Australia,Continent_Europe,Continent_North America,Continent_South America,Hemisphere_Northern Hemisphere,Hemisphere_Southern Hemisphere
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BMW7812,67,208,72,0,0,1,0,0,4.168189,Average,...,0,1,0,0,0,0,0,1,0,1
CZE1114,21,389,98,1,1,1,1,1,1.813242,Unhealthy,...,0,1,0,0,0,0,1,0,1,0
BNI9906,21,324,72,1,0,0,0,0,2.078353,Healthy,...,1,0,0,0,0,1,0,0,1,0
JLN3497,84,383,73,1,1,1,0,1,9.82813,Average,...,0,1,0,0,0,0,1,0,1,0
GFO8847,66,318,93,1,1,1,1,0,5.804299,Unhealthy,...,0,1,0,1,0,0,0,0,1,0


In [7]:
# Apply label encoding to 'Diet' column
# The mapping is ordinal: a higher code means a healthier diet
Diet = {
    "Unhealthy": 0,
    "Average": 1,
    "Healthy": 2,
}
# Only the 'Diet' column is targeted; values not listed in the mapping are left unchanged by replace
df = df.replace(to_replace={"Diet": Diet})
df["Diet"].head()

Patient ID
BMW7812    1
CZE1114    0
BNI9906    2
JLN3497    1
GFO8847    0
Name: Diet, dtype: int64

In [8]:
# Scale numeric columns
# Binary flags, the encoded 'Diet' column and the target are not in this list and are left unscaled
columns_to_scale = [
    'Age',
    'Cholesterol',
    'Heart Rate',
    'Exercise Hours Per Week',
    'Stress Level',
    'Sedentary Hours Per Day',
    'Income',
    'BMI',
    'Triglycerides',
    'Physical Activity Days Per Week',
    'Sleep Hours Per Day',
]
# NOTE(review): the scaler is fit on the full dataset; if a train/test split is made downstream, test-set statistics leak into the scaling - confirm this is acceptable
scaled_data = StandardScaler().fit_transform(df[columns_to_scale])

# Review scaled data
scaled_data

array([[ 6.25557131e-01, -6.41578894e-01, -1.47042098e-01, ...,
        -5.88539270e-01, -1.52884347e+00, -5.14749993e-01],
       [-1.53932232e+00,  1.59689495e+00,  1.11817855e+00, ...,
        -8.16487136e-01, -1.09073833e+00, -1.18227783e-02],
       [-1.53932232e+00,  7.93023127e-01, -1.47042098e-01, ...,
         7.56800093e-01,  2.23577110e-01, -1.52060442e+00],
       ...,
       [-3.15694803e-01, -1.22154025e-01,  1.45881488e+00, ...,
         4.88626134e-01,  2.23577110e-01, -1.52060442e+00],
       [-8.33383367e-01, -1.01259666e+00, -7.30990089e-01, ...,
        -1.35730462e+00, -6.52633182e-01,  4.91104436e-01],
       [-1.35107193e+00,  1.18877541e+00, -1.05510022e-03, ...,
        -1.06231327e+00,  1.53789255e+00, -1.52060442e+00]])

In [9]:
# Create DataFrame of scaled data
# Column order must match the order in which the columns were passed to the scaler
scaled_column_names = [
    'Age',
    'Cholesterol',
    'Heart Rate',
    'Exercise Hours Per Week',
    'Stress Level',
    'Sedentary Hours Per Day',
    'Income',
    'BMI',
    'Triglycerides',
    'Physical Activity Days Per Week',
    'Sleep Hours Per Day',
]
scaled_df = pd.DataFrame(scaled_data, columns=scaled_column_names)

# Replace original columns with scaled columns
# .values discards scaled_df's default integer index so values are assigned by position
# instead of being aligned against df's 'Patient ID' index
for column_name in scaled_column_names:
    df[column_name] = scaled_df[column_name].values

# Review DataFrame
df.head()


Unnamed: 0_level_0,Age,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,Diet,...,Sex_Female,Sex_Male,Continent_Africa,Continent_Asia,Continent_Australia,Continent_Europe,Continent_North America,Continent_South America,Hemisphere_Northern Hemisphere,Hemisphere_Southern Hemisphere
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BMW7812,0.625557,-0.641579,-0.147042,0,0,1,0,0,-1.010838,1,...,0,1,0,0,0,0,0,1,0,1
CZE1114,-1.539322,1.596895,1.118179,1,1,1,1,1,-1.418027,0,...,0,1,0,0,0,0,1,0,1,0
BNI9906,-1.539322,0.793023,-0.147042,1,0,0,0,0,-1.372188,2,...,1,0,0,0,0,1,0,0,1,0
JLN3497,1.425621,1.522691,-0.09838,1,1,1,0,1,-0.032188,1,...,0,1,0,0,0,0,1,0,1,0
GFO8847,0.578495,0.71882,0.874867,1,1,1,1,0,-0.727941,0,...,0,1,0,1,0,0,0,0,1,0


In [10]:
# Move target column to end of DataFrame
target_column = 'Heart Attack Risk'
# pop removes the column from df and returns it as a Series
target_values = df.pop(target_column)
# Re-insert the column after the last remaining column
df.insert(len(df.columns), target_column, target_values)
df.head()

Unnamed: 0_level_0,Age,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,Diet,...,Sex_Male,Continent_Africa,Continent_Asia,Continent_Australia,Continent_Europe,Continent_North America,Continent_South America,Hemisphere_Northern Hemisphere,Hemisphere_Southern Hemisphere,Heart Attack Risk
Patient ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BMW7812,0.625557,-0.641579,-0.147042,0,0,1,0,0,-1.010838,1,...,1,0,0,0,0,0,1,0,1,0
CZE1114,-1.539322,1.596895,1.118179,1,1,1,1,1,-1.418027,0,...,1,0,0,0,0,1,0,1,0,0
BNI9906,-1.539322,0.793023,-0.147042,1,0,0,0,0,-1.372188,2,...,0,0,0,0,1,0,0,1,0,0
JLN3497,1.425621,1.522691,-0.09838,1,1,1,0,1,-0.032188,1,...,1,0,0,0,0,1,0,1,0,0
GFO8847,0.578495,0.71882,0.874867,1,1,1,1,0,-0.727941,0,...,1,0,1,0,0,0,0,1,0,0


In [11]:
# Export DataFrame as csv to Output folder
# Create the folder first: to_csv raises an OSError when the target directory does not exist (e.g. on a fresh checkout)
output_path = Path('Output/preprocessed_heart_attack_prediction_dataset.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
# The 'Patient ID' index is written as the first column (to_csv default index=True)
df.to_csv(output_path)