### Getting the Dataset

In [None]:
! pip install kaggle

In [None]:
from google.colab import files
## https://www.kaggle.com/datasets/kamilpytlak/personal-key-indicators-of-heart-disease
files.upload()

In [None]:
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download kamilpytlak/personal-key-indicators-of-heart-disease

In [None]:
! mkdir dataset
! unzip personal-key-indicators-of-heart-disease.zip -d dataset

### Training Model


In [6]:
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [7]:
dataCSV = pd.read_csv("dataset/heart_2020_cleaned.csv")
dataCSV.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [8]:
dataCSV['HeartDisease'] = np.where(dataCSV['HeartDisease']=='No', 0, 1)
validation = dataCSV.sample(frac=0.2, random_state=1337)
training = dataCSV.drop(validation.index)

In [9]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  df = dataframe.copy()
  labels = df.pop('HeartDisease')
  df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}
  ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [10]:
train_ds = df_to_dataset(dataCSV, batch_size = 128)
val_ds = df_to_dataset(validation, batch_size = 128)

  df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}


In [11]:
def get_normalization_layer(name, dataset):
  # Create a Normalization layer for the feature.
  normalizer = layers.Normalization(axis=None)

  # Prepare a Dataset that only yields the feature.
  feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the statistics of the data.
  normalizer.adapt(feature_ds)

  return normalizer

In [13]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  # Create a layer that turns strings into integer indices.
  if dtype == 'string':
    index = layers.StringLookup(max_tokens=max_tokens)
  # Otherwise, create a layer that turns integer values into integer indices.
  else:
    index = layers.IntegerLookup(max_tokens=max_tokens)

  # Prepare a `tf.data.Dataset` that only yields the feature.
  feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the set of possible values and assign them a fixed integer index.
  index.adapt(feature_ds)

  # Encode the integer indices.
  encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())

  # Apply multi-hot encoding to the indices. The lambda function captures the
  # layer, so you can use them, or include them in the Keras Functional model later.
  return lambda feature: encoder(index(feature))

In [14]:
all_inputs = []
encoded_features = []

# Numerical features.
for header in ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']:
  numeric_col = tf.keras.Input(shape=(1,), name=header)
  normalization_layer = get_normalization_layer(header, train_ds)
  encoded_numeric_col = normalization_layer(numeric_col)
  all_inputs.append(numeric_col)
  encoded_features.append(encoded_numeric_col)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [15]:
categorical_cols = ['Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 
                    'AgeCategory', 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma',
                    'KidneyDisease', 'SkinCancer']

for header in categorical_cols:
  categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
  encoding_layer = get_category_encoding_layer(name=header,
                                               dataset=train_ds,
                                               dtype='string',
                                               max_tokens=5)
  encoded_categorical_col = encoding_layer(categorical_col)
  all_inputs.append(categorical_col)
  encoded_features.append(encoded_categorical_col)

In [16]:
all_features = tf.keras.layers.concatenate(encoded_features)

hiddenlayers = tf.keras.Sequential([
  tf.keras.layers.Dense(32, activation='relu'),
  tf.keras.layers.Dropout(0.5),
  tf.keras.layers.Dense(16, activation='relu'),
  tf.keras.layers.Dropout(0.5),
  tf.keras.layers.Dense(1, activation='sigmoid')
])

output = hiddenlayers(all_features)

model = tf.keras.Model(all_inputs, output)

In [17]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=["accuracy"])

In [18]:
tf.keras.backend.clear_session()
model.fit(train_ds, epochs=2)

Epoch 1/2


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 2/2


<keras.callbacks.History at 0x26a037e9790>

In [19]:
loss, accuracy = model.evaluate(val_ds)
print("Accuracy", accuracy)

Accuracy 0.9158992767333984


In [20]:
sample = {
    'BMI': 23.73,
    'Smoking': 'No',
    'AlcoholDrinking': 'No',
    'Stroke': 'No',
    'PhysicalHealth': 0.0,
    'MentalHealth': 0.0,
    'DiffWalking': 'No',
    'Sex': 'Male',
    'AgeCategory': "75-59",
    'Race': 'White',
    'Diabetic': 'No',
    'PhysicalActivity': 'Yes',
    'GenHealth': "Good",
    'SleepTime': 6.0,
    'Asthma': 'No',
    'KidneyDisease': 'No',
    'SkinCancer': 'Yes'
}

input_dict = {name: tf.convert_to_tensor([value]) for name, value in sample.items()}
predictions = model.predict(input_dict)
prob = tf.nn.sigmoid(predictions[0])
print(prob)

tf.Tensor([0.5387992], shape=(1,), dtype=float32)


In [21]:
model.save('modeler')



INFO:tensorflow:Assets written to: modeler\assets


INFO:tensorflow:Assets written to: modeler\assets


In [22]:
reloaded_model = tf.keras.models.load_model('modeler')

In [24]:
predictions = reloaded_model.predict(input_dict)
prob = tf.nn.sigmoid(predictions[0])
print(prob)

tf.Tensor([0.5387992], shape=(1,), dtype=float32)


In [8]:
import json

In [9]:
dicColumns = {}
columnData = dataCSV.copy()
columnData = columnData.drop(columns = ['HeartDisease', 'BMI'])
for col in columnData.columns:
    dicColumns[col] = np.unique(columnData[col]).tolist()

In [52]:
print(dicColumns)
with open('columns.json', 'w') as file:
    json.dump(dicColumns, file)

{'Smoking': ['No', 'Yes'], 'AlcoholDrinking': ['No', 'Yes'], 'Stroke': ['No', 'Yes'], 'PhysicalHealth': [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0], 'MentalHealth': [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0], 'DiffWalking': ['No', 'Yes'], 'Sex': ['Female', 'Male'], 'AgeCategory': ['18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80 or older'], 'Race': ['American Indian/Alaskan Native', 'Asian', 'Black', 'Hispanic', 'Other', 'White'], 'Diabetic': ['No', 'No, borderline diabetes', 'Yes', 'Yes (during pregnancy)'], 'PhysicalActivity': ['No', 'Yes'], 'GenHealth': ['Excellent', 'Fair', 'Good', 'Poor', 'Very good'], 'SleepTime': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9

In [16]:
print(list(reversed(dicColumns['Smoking'])))

['Yes', 'No']
