# Framingham Heart Disease Risk Study
 ## Preprocessing and Feature Engineering

In [17]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [18]:
# Import dataset
path = '../data/interim/data_eda.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,male,age,education,BPMeds,prevalentStroke,totChol,BMI,heartRate,TenYearCHD,diabetes_stage,smoker_class,MAP,hypertension_stage
0,1,39,4,0,0,195.0,26.97,80.0,0,0,0,82.0,0
1,0,46,2,0,0,250.0,28.73,95.0,0,0,0,94.33,0
2,1,48,1,0,0,245.0,25.34,75.0,0,0,2,95.83,0
3,0,61,3,0,0,225.0,28.58,65.0,1,1,3,113.33,1
4,0,46,3,0,0,285.0,23.1,85.0,0,0,3,99.33,0


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3656 entries, 0 to 3655
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   male                3656 non-null   int64  
 1   age                 3656 non-null   int64  
 2   education           3656 non-null   int64  
 3   BPMeds              3656 non-null   int64  
 4   prevalentStroke     3656 non-null   int64  
 5   totChol             3656 non-null   float64
 6   BMI                 3656 non-null   float64
 7   heartRate           3656 non-null   float64
 8   TenYearCHD          3656 non-null   int64  
 9   diabetes_stage      3656 non-null   int64  
 10  smoker_class        3656 non-null   int64  
 11  MAP                 3656 non-null   float64
 12  hypertension_stage  3656 non-null   int64  
dtypes: float64(4), int64(9)
memory usage: 371.4 KB


## Create dummy variables for all categorical columns
- ToDo:
    - `education`
    - `diabetes_stage`
    - `smoker_class`
    - `hypertension_stage`

- Done:
    - `male`
    - `BPMeds`
    - `prevalentStroke`

In [20]:
# Encode education level from 0 to 3 instead of 1 to 4
df['education'] = df['education'] - 1

In [21]:
# Get dummy variables for education, diabetes_stage, smoker_class, hypertension_stage
df = pd.get_dummies(df, columns=['education', 'diabetes_stage', 'smoker_class', 'hypertension_stage'], drop_first=True, dtype=int)

df.head()

Unnamed: 0,male,age,BPMeds,prevalentStroke,totChol,BMI,heartRate,TenYearCHD,MAP,education_1,education_2,education_3,diabetes_stage_1,diabetes_stage_2,smoker_class_1,smoker_class_2,smoker_class_3,hypertension_stage_1,hypertension_stage_2,hypertension_stage_3
0,1,39,0,0,195.0,26.97,80.0,0,82.0,0,0,1,0,0,0,0,0,0,0,0
1,0,46,0,0,250.0,28.73,95.0,0,94.33,1,0,0,0,0,0,0,0,0,0,0
2,1,48,0,0,245.0,25.34,75.0,0,95.83,0,0,0,0,0,0,1,0,0,0,0
3,0,61,0,0,225.0,28.58,65.0,1,113.33,0,1,0,1,0,0,0,1,1,0,0
4,0,46,0,0,285.0,23.1,85.0,0,99.33,0,1,0,0,0,0,0,1,0,0,0


In [23]:
# Change the order of the columns
categorical_columns = ['male', 
                    'BPMeds', 
                    'prevalentStroke', 
                    'education_1',
                    'education_2',
                    'education_3',
                    'diabetes_stage_1',
                    'diabetes_stage_2',
                    'smoker_class_1',
                    'smoker_class_2',
                    'smoker_class_3',
                    'hypertension_stage_1',
                    'hypertension_stage_2',
                    'hypertension_stage_3']

continuous_columns = ['age',
                    'totChol',
                    'BMI',
                    'heartRate',
                    'MAP']

target = ['TenYearCHD']

df = df[categorical_columns + continuous_columns + target]
df.head()

Unnamed: 0,male,BPMeds,prevalentStroke,education_1,education_2,education_3,diabetes_stage_1,diabetes_stage_2,smoker_class_1,smoker_class_2,smoker_class_3,hypertension_stage_1,hypertension_stage_2,hypertension_stage_3,age,totChol,BMI,heartRate,MAP,TenYearCHD
0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,39,195.0,26.97,80.0,82.0,0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,46,250.0,28.73,95.0,94.33,0
2,1,0,0,0,0,0,0,0,0,1,0,0,0,0,48,245.0,25.34,75.0,95.83,0
3,0,0,0,0,1,0,1,0,0,0,1,1,0,0,61,225.0,28.58,65.0,113.33,1
4,0,0,0,0,1,0,0,0,0,0,1,0,0,0,46,285.0,23.1,85.0,99.33,0


## Standardize (scale) the continuous variables

In [32]:
scaler = StandardScaler()
scaled_df = scaler.fit_transform(df[continuous_columns])
scaled_df = pd.DataFrame(scaled_df, columns=continuous_columns)

df[continuous_columns] = scaled_df
df.head()

Unnamed: 0,male,BPMeds,prevalentStroke,education_1,education_2,education_3,diabetes_stage_1,diabetes_stage_2,smoker_class_1,smoker_class_2,smoker_class_3,hypertension_stage_1,hypertension_stage_2,hypertension_stage_3,age,totChol,BMI,heartRate,MAP,TenYearCHD
0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,-1.233351,-0.949714,0.291688,0.35634,-1.199371,0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,-0.415591,0.297729,0.724614,1.608289,-0.349343,0
2,1,0,0,0,0,0,0,0,0,1,0,0,0,0,-0.181945,0.184325,-0.109261,-0.060977,-0.245934,0
3,0,0,0,0,1,0,1,0,0,0,1,1,0,0,1.336754,-0.269291,0.687717,-0.89561,0.960513,1
4,0,0,0,0,1,0,0,0,0,0,1,0,0,0,-0.415591,1.091556,-0.660258,0.773656,-0.004644,0


## Train-Test Split the Data

In [34]:
# Train-test split the data
X = df.drop('TenYearCHD', axis=1)
y = df['TenYearCHD']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
# Check the shape of the data
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2924, 19), (732, 19), (2924,), (732,))

In [36]:
# Save the data
X_train.to_csv('../data/processed/X_train.csv', index=False)
X_test.to_csv('../data/processed/X_test.csv', index=False)
y_train.to_csv('../data/processed/y_train.csv', index=False)
y_test.to_csv('../data/processed/y_test.csv', index=False)

print('Data saved successfully!')

Data saved successfully!
