In [10]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

In [11]:
# Load the cardiovascular dataset, which is published as CSV from Google Sheets.
# The URL is held in its own variable so the data source is easy to spot and swap.
sheet_csv_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSDchXr1EhgCSsxlxJ3lWPhh1kT5EJS3yv4DJ2YLeMIC3y4uq-Pp4EQknrs9zAiaI3ulne2Jyi6gR6G/pub?gid=602879552&single=true&output=csv"
df = pd.read_csv(sheet_csv_url)

# Preview the first five rows to sanity-check the load
df.head(5)

Unnamed: 0,general_health,checkup,exercise,heart_disease,skin_cancer,other_cancer,depression,diabetes,arthritis,sex,age_category,height_cm,weight_kg,bmi,smoking_history,alcohol_consumption,fruit_consumption,green_vegetables_consumption,friedpotato_consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150,32.66,14.54,Yes,0,30,16,12
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165,77.11,28.29,No,0,30,0,4
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163,88.45,33.47,No,4,12,3,16
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180,93.44,28.73,No,0,30,30,8
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191,88.45,24.37,Yes,0,8,4,0


# Data Preprocessing

In [12]:
# Feature columns grouped by the preprocessing they receive.
# 'heart_disease' appears in neither list: it is the prediction target.

# Text-valued features -> one-hot encoded
categorical_cols = [
    "general_health",
    "checkup",
    "exercise",
    "skin_cancer",
    "other_cancer",
    "depression",
    "diabetes",
    "arthritis",
    "sex",
    "age_category",
    "smoking_history",
]

# Numeric-valued features -> standardized
numeric_cols = [
    "height_cm",
    "weight_kg",
    "bmi",
    "alcohol_consumption",
    "fruit_consumption",
    "green_vegetables_consumption",
    "friedpotato_consumption",
]

In [13]:
# One-hot encode every categorical feature with get_dummies.
# - drop_first=True omits the first level of each feature, so the remaining
#   indicator columns are not redundant with one another.
# - dtype="uint8" pins the indicators to numeric 0/1. pandas < 2.0 produced
#   uint8 by default, but pandas >= 2.0 produces booleans, which turn the
#   feature matrix into an object array once mixed with the float columns.
# NOTE: this cell overwrites `df`; re-running it without reloading the data
# raises a KeyError because the original categorical columns are gone.
df = pd.get_dummies(
    df,
    columns=categorical_cols,
    drop_first=True,
    dtype="uint8",
)
df.head()

Unnamed: 0,heart_disease,height_cm,weight_kg,bmi,alcohol_consumption,fruit_consumption,green_vegetables_consumption,friedpotato_consumption,general_health_Fair,general_health_Good,...,age_category_40-44,age_category_45-49,age_category_50-54,age_category_55-59,age_category_60-64,age_category_65-69,age_category_70-74,age_category_75-79,age_category_80+,smoking_history_Yes
0,No,150,32.66,14.54,0,30,16,12,0,0,...,0,0,0,0,0,0,1,0,0,1
1,Yes,165,77.11,28.29,0,30,0,4,0,0,...,0,0,0,0,0,0,1,0,0,0
2,No,163,88.45,33.47,4,12,3,16,0,0,...,0,0,0,0,1,0,0,0,0,0
3,Yes,180,93.44,28.73,0,30,30,8,0,0,...,0,0,0,0,0,0,0,1,0,0
4,No,191,88.45,24.37,0,8,4,0,0,1,...,0,0,0,0,0,0,0,0,1,1


In [14]:
# Standardize the numeric feature columns with StandardScaler
# (each column is centred on its mean and divided by its standard deviation).
# NOTE(review): the scaler is fitted on the whole frame, before the
# train/test split performed later in the notebook, so test rows contribute
# to the fitted mean/std. Consider fitting on the training rows only.
scaler = StandardScaler()
scaler.fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

# Preview the scaled frame
df.head()

Unnamed: 0,heart_disease,height_cm,weight_kg,bmi,alcohol_consumption,fruit_consumption,green_vegetables_consumption,friedpotato_consumption,general_health_Fair,general_health_Good,...,age_category_40-44,age_category_45-49,age_category_50-54,age_category_55-59,age_category_60-64,age_category_65-69,age_category_70-74,age_category_75-79,age_category_80+,smoking_history_Yes
0,No,-1.945527,-2.388688,-2.162285,-0.624388,0.005697,0.0596,0.662925,0,0,...,0,0,0,0,0,0,1,0,0,1
1,Yes,-0.537041,-0.299715,-0.040788,-0.624388,0.005697,-1.012964,-0.268558,0,0,...,0,0,0,0,0,0,1,0,0,0
2,No,-0.724839,0.23322,0.758438,-0.137925,-0.717605,-0.811858,1.128667,0,0,...,0,0,0,0,1,0,0,0,0,0
3,Yes,0.871445,0.46773,0.0271,-0.624388,0.005697,0.998094,0.197184,0,0,...,0,0,0,0,0,0,0,1,0,0
4,No,1.904335,0.23322,-0.645607,-0.624388,-0.878338,-0.744823,-0.734299,0,1,...,0,0,0,0,0,0,0,0,1,1


In [15]:
# Separate predictors from the label.
# X: every column of the preprocessed frame except 'heart_disease'
X = df.drop('heart_disease', axis=1)

# y: the 'heart_disease' column. It was not one-hot encoded, so it still
# holds the original 'No'/'Yes' strings at this point.
# NOTE(review): presumably converted to 0/1 before model training; confirm.
y = df['heart_disease']

In [16]:
# Split the preprocessed data into a training and testing dataset.
# - test_size=0.25 is scikit-learn's default, stated explicitly for readers.
# - stratify=y keeps the proportion of each heart_disease class the same in
#   the training and testing sets, so evaluation is not skewed by a split
#   that happens to under- or over-sample the positive class.
# - random_state=78 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=78,
)