In [29]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [30]:
# Input CSV, given as a relative path
DATA_FILE = 'data_cleaned.csv'
heart_data = pd.read_csv(DATA_FILE)

In [31]:
# Preview the first five rows of the loaded table
heart_data.head(n=5)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0.0,16.6,1.0,0.0,0.0,3.0,30.0,0.0,0.0,7.0,5.0,2.0,1.0,4.0,5.0,1.0,0.0,1.0
1,0.0,20.34,0.0,0.0,1.0,0.0,0.0,0.0,0.0,12.0,5.0,0.0,1.0,4.0,7.0,0.0,0.0,0.0
2,0.0,26.58,1.0,0.0,0.0,20.0,30.0,0.0,1.0,9.0,5.0,2.0,1.0,1.0,8.0,1.0,0.0,0.0
3,0.0,24.21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,5.0,0.0,0.0,2.0,6.0,0.0,0.0,1.0
4,0.0,23.71,0.0,0.0,0.0,28.0,0.0,1.0,0.0,4.0,5.0,0.0,1.0,4.0,8.0,0.0,0.0,0.0


Since the variables PhysicalHealth and MentalHealth follow bimodal distributions, we can transform them into categorical variables.

In [32]:
# Threshold separating the two bins
bar_value = 10

# Replace each column with a two-level categorical:
# values in (-1, bar_value] get label 0, values in (bar_value, 30] get label 1.
for health_col in ('PhysicalHealth', 'MentalHealth'):
    heart_data[health_col] = pd.cut(
        heart_data[health_col],
        bins=[-1, bar_value, 30],
        labels=[0, 1],
    )

heart_data.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0.0,16.6,1.0,0.0,0.0,0,1,0.0,0.0,7.0,5.0,2.0,1.0,4.0,5.0,1.0,0.0,1.0
1,0.0,20.34,0.0,0.0,1.0,0,0,0.0,0.0,12.0,5.0,0.0,1.0,4.0,7.0,0.0,0.0,0.0
2,0.0,26.58,1.0,0.0,0.0,1,1,0.0,1.0,9.0,5.0,2.0,1.0,1.0,8.0,1.0,0.0,0.0
3,0.0,24.21,0.0,0.0,0.0,0,0,0.0,0.0,11.0,5.0,0.0,0.0,2.0,6.0,0.0,0.0,1.0
4,0.0,23.71,0.0,0.0,0.0,1,0,1.0,0.0,4.0,5.0,0.0,1.0,4.0,8.0,0.0,0.0,0.0


Dummy encoding is a process of converting categorical variables into numerical data that can be used in machine learning algorithms. In this process, we create new columns for each category in the original column and assign a value of 1 or 0 to indicate whether a particular category is present or absent in each row. Because we pass `drop_first=True`, the first category of each variable gets no column of its own and acts as the reference level.

In [33]:
# Dummy-encode the categorical columns.
# The target 'HeartDisease' is part of this list, so it is encoded along with the predictors.
categorical_features = [
    'HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory',
    'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer',
    'PhysicalHealth', 'MentalHealth',
]

# Casting to str makes get_dummies treat every listed column as categorical;
# drop_first=True omits the first level of each variable.
# dtype is pinned to uint8 because the get_dummies default changed from uint8 to bool
# in pandas 2.0, which would turn the 0/1 indicators into True/False.
heart_data_encode = pd.get_dummies(
    heart_data[categorical_features].astype(str),
    prefix=categorical_features,
    drop_first=True,
    dtype='uint8',
)

In [34]:
# Copy the numerical variables over from the source frame as they are
for numeric_col in ['BMI', 'SleepTime']:
    heart_data_encode[numeric_col] = heart_data[numeric_col]

In [35]:
# Preview the first five rows of the encoded table
heart_data_encode.head(n=5)

Unnamed: 0,HeartDisease_1.0,Smoking_1.0,AlcoholDrinking_1.0,Stroke_1.0,DiffWalking_1.0,Sex_1.0,AgeCategory_1.0,AgeCategory_10.0,AgeCategory_11.0,AgeCategory_12.0,...,GenHealth_2.0,GenHealth_3.0,GenHealth_4.0,Asthma_1.0,KidneyDisease_1.0,SkinCancer_1.0,PhysicalHealth_1,MentalHealth_1,BMI,SleepTime
0,0,1,0,0,0,0,0,0,0,0,...,0,0,1,1,0,1,0,1,16.6,5.0
1,0,0,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,20.34,7.0
2,0,1,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,1,1,26.58,8.0
3,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,1,0,0,24.21,6.0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,23.71,8.0


In [49]:
# The encoded target column is labelled 'HeartDisease_1.0'; give it back its plain name.
heart_data_encode = heart_data_encode.rename(columns={'HeartDisease_1.0': 'HeartDisease'})

# rename() silently ignores labels that are absent, so check that the target column
# really exists (this also holds when the cell is re-run after the rename).
assert 'HeartDisease' in heart_data_encode.columns, \
    "target column missing: expected dummy column 'HeartDisease_1.0' from the encoding step"
heart_data_encode.head()

Unnamed: 0,HeartDisease,Smoking_1.0,AlcoholDrinking_1.0,Stroke_1.0,DiffWalking_1.0,Sex_1.0,AgeCategory_1.0,AgeCategory_10.0,AgeCategory_11.0,AgeCategory_12.0,...,GenHealth_2.0,GenHealth_3.0,GenHealth_4.0,Asthma_1.0,KidneyDisease_1.0,SkinCancer_1.0,PhysicalHealth_1,MentalHealth_1,BMI,SleepTime
0,0,1,0,0,0,0,0,0,0,0,...,0,0,1,1,0,1,0,1,16.6,5.0
1,0,0,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,20.34,7.0
2,0,1,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,1,1,26.58,8.0
3,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,1,0,0,24.21,6.0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,23.71,8.0


Next, we randomly split our data set into a training set (80%) and a testing set (20%). Since our HeartDisease target is highly imbalanced, we use SMOTE (Synthetic Minority Over-sampling Technique) to handle the imbalance; it is applied to the training set only, so the testing set keeps its original class distribution. SMOTE is an oversampling technique in which synthetic samples are generated for the minority class. This algorithm helps to overcome the overfitting problem posed by random oversampling. It works in feature space, generating new instances by interpolating between positive instances that lie close together.

In [56]:
# Tunable settings for the split and the oversampler
TARGET = 'HeartDisease'
TEST_FRACTION = 0.2
SPLIT_SEED = 503
SMOTE_SEED = 42

# Separate the label column from the predictors
y = heart_data_encode[TARGET]
X = heart_data_encode.drop(columns=TARGET)

# Hold out TEST_FRACTION of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

# Feature standardisation (StandardScaler fitted on train, applied to test) is currently disabled.

# Oversample with SMOTE on the training split only; the test split is left as drawn.
# NOTE(review): SMOTE interpolates between neighbouring rows, so the dummy columns of
# synthetic rows may not be valid indicator values -- consider SMOTENC; TODO confirm.
smote = SMOTE(random_state=SMOTE_SEED)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [65]:
# Re-attach the label column to the resampled training features
train_df = pd.concat([X_resampled, y_resampled], axis=1)

# Same for the test split, renumbering its rows 0..n-1 from the actual row count.
# The previous hard-coded range(63959) would raise a length-mismatch error as soon as
# the data size or the split ratio changed.
test_df = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)

# to_csv writes the row index as the first CSV column by default; the reload snippet
# further down strips it again with .iloc[:, 1:], so it is kept here.
train_df.to_csv('heart_data_train.csv')
test_df.to_csv('heart_data_test.csv')

In [67]:
#Run this before training the model (uncomment to reload the saved train/test splits)
#X_train = pd.read_csv('heart_data_train.csv').drop('HeartDisease',axis=1).iloc[: , 1:]
#y_train = pd.read_csv('heart_data_train.csv').HeartDisease
#X_test = pd.read_csv('heart_data_test.csv').drop('HeartDisease',axis=1).iloc[: , 1:]
#y_test = pd.read_csv('heart_data_test.csv').HeartDisease