In [1]:
#imports
import pandas as pd

health = pd.read_csv('cleaned_heart_2020.csv')

In [2]:
health.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,...,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,GenHealth_Number,SleepGroups,BMIGroups
0,0,16.6,1,0,0,3.0,30.0,0,1,55-59,...,1.0,1,Very good,5.0,1,0,1,4,1-6,Thinness
1,0,20.34,0,0,1,0.0,0.0,0,1,80 or older,...,0.0,1,Very good,7.0,0,0,0,4,7-12,Normal
2,0,26.58,1,0,0,20.0,30.0,0,0,65-69,...,1.0,1,Fair,8.0,1,0,0,2,7-12,Overweight
3,0,24.21,0,0,0,0.0,0.0,0,1,75-79,...,0.0,0,Good,6.0,0,0,1,3,1-6,Normal
4,0,23.71,0,0,0,28.0,0.0,1,1,40-44,...,0.0,1,Very good,8.0,0,0,0,4,7-12,Normal


In [3]:
#convert yes/no columns to category type
columns = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth',
       'Asthma', 'KidneyDisease', 'SkinCancer','GenHealth_Number',
       'SleepGroups', 'BMIGroups']
for i in columns:
    health[i] = health[i].astype('category')

In [4]:
health.dtypes

HeartDisease        category
BMI                  float64
Smoking             category
AlcoholDrinking     category
Stroke              category
PhysicalHealth       float64
MentalHealth         float64
DiffWalking         category
Sex                 category
AgeCategory         category
Race                category
Diabetic            category
PhysicalActivity    category
GenHealth           category
SleepTime            float64
Asthma              category
KidneyDisease       category
SkinCancer          category
GenHealth_Number    category
SleepGroups         category
BMIGroups           category
dtype: object

In [5]:
#Create dummy or indicator features for categorical variables
dummy_df = pd.get_dummies(health[['AgeCategory','GenHealth','GenHealth_Number','SleepGroups','BMIGroups', 'Race']], drop_first = True)
df = pd.concat([health, dummy_df], axis=1) #combine the dfs
df = df.drop(['AgeCategory','GenHealth','GenHealth_Number','SleepGroups','BMIGroups','Race'], axis=1) #remove the original columns
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,Diabetic,...,BMIGroups_Obese,BMIGroups_Overweight,BMIGroups_Severe Thinness,BMIGroups_Severely Obese,BMIGroups_Thinness,Race_Asian,Race_Black,Race_Hispanic,Race_Other,Race_White
0,0,16.6,1,0,0,3.0,30.0,0,1,1.0,...,0,0,0,0,1,0,0,0,0,1
1,0,20.34,0,0,1,0.0,0.0,0,1,0.0,...,0,0,0,0,0,0,0,0,0,1
2,0,26.58,1,0,0,20.0,30.0,0,0,1.0,...,0,1,0,0,0,0,0,0,0,1
3,0,24.21,0,0,0,0.0,0.0,0,1,0.0,...,0,0,0,0,0,0,0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,1,0.0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
#Standardize the magnitude of numeric features using a scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[['BMI', 'PhysicalHealth','MentalHealth', 'SleepTime']].values)
scaled_features_df = pd.DataFrame(scaled_features, index=df.index, columns=['BMI_scaled', 'PhysicalHealth_scaled','MentalHealth_scaled', 'SleepTime_scaled'])
scaled_df = df.drop(['BMI', 'PhysicalHealth','MentalHealth', 'SleepTime'], axis=1).join(scaled_features_df) #combine the dfs and drop unscaled columns
scaled_df.head()

Unnamed: 0,HeartDisease,Smoking,AlcoholDrinking,Stroke,DiffWalking,Sex,Diabetic,PhysicalActivity,Asthma,KidneyDisease,...,BMIGroups_Thinness,Race_Asian,Race_Black,Race_Hispanic,Race_Other,Race_White,BMI_scaled,PhysicalHealth_scaled,MentalHealth_scaled,SleepTime_scaled
0,0,1,0,0,0,1,1.0,1,1,0,...,1,0,0,0,0,1,-1.84475,-0.046751,3.281069,-1.460354
1,0,0,0,1,0,1,0.0,1,0,0,...,0,0,0,0,0,1,-1.256338,-0.42407,-0.490039,-0.067601
2,0,1,0,0,0,0,1.0,1,1,0,...,0,0,0,0,0,1,-0.274603,2.091388,3.281069,0.628776
3,0,0,0,0,0,1,0.0,0,0,0,...,0,0,0,0,0,1,-0.647473,-0.42407,-0.490039,-0.763977
4,0,0,0,0,1,1,0.0,1,0,0,...,0,0,0,0,0,1,-0.726138,3.097572,-0.490039,0.628776


In [7]:
#Split data into testing and training datasets
from sklearn.model_selection import train_test_split
X = scaled_df.drop('HeartDisease',axis=1)
y = scaled_df['HeartDisease']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=11, stratify=y)