In [88]:
import numpy as np
import pandas as pd
import tensorflow as tf
from imblearn.over_sampling import SMOTE

from tensorflow.keras import layers

In [89]:
tf.__version__

'2.20.0'

In [90]:
metsadult = pd.read_csv('/Users/saheed/Desktop/My_Rprog_Journey/metab_tensorflow.csv')
metsadult.drop(columns=['Unnamed: 0'], inplace=True)

In [91]:
# ACSVD risk factor encoding: Absence = 0, Presence = 1

metsadult['target'] = np.where(metsadult['metabolic_syndrome']== 'Presence', 1, 0)

print("Verifying your current encoding:")
sample_check = metsadult[['metabolic_syndrome', 'target']].head()
print(sample_check)

# Drop unused features.
dataframe = metsadult.drop(columns=['metabolic_syndrome'])

Verifying your current encoding:
  metabolic_syndrome  target
0            Absence       0
1            Absence       0
2            Absence       0
3           Presence       1
4           Presence       1


In [92]:

# Deselect non-predictive features.
X = dataframe.drop(columns=['target', 'hypercholesterolaemia', 'Gender', 'BMI'])  # Features
y = dataframe['target']

In [93]:
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)
df_smote = pd.concat([pd.DataFrame(X_res, columns=X.columns),
                      pd.Series(y_res, name='target')], axis=1)
df_smote['target'].value_counts()
print(df_smote.shape)
print(df_smote.head())
dataframe_smoted = df_smote

(1710, 22)
   LBDAPBSI  BMXBMI  BMXWAIST  Systolic_BP  Diastolic_BP  RIDAGEYR  RIAGENDR  \
0      1.29    30.8     107.9   140.000000     86.000000        53         1   
1      0.72    28.0      86.6   111.333333     72.666667        22         1   
2      0.92    24.1      90.1   110.666667     72.000000        45         1   
3      1.24    35.4     113.5   142.666667     62.666667        57         2   
4      1.29    25.3      79.5   107.333333     61.333333        24         2   

   DXXSATA  DXXSATM  DXXVFATA  ...  LBDGLUSI  LBDHDDSI  LBXHSCRP  LBDINSI  \
0   260.01  1253.60    200.60  ...      5.59      1.63       1.4   103.56   
1   264.63  1275.84     65.87  ...      5.27      1.24       1.3    68.34   
2   162.77   784.75     67.03  ...      4.68      1.29       0.3    17.16   
3   513.64  2476.40    209.81  ...     22.10      1.11       3.9    33.42   
4   300.20  1447.35     80.53  ...      5.27      1.06       1.7    79.38   

   LBDTCSI  LBDTRSI  LBDLDLSI  eLDL_Trig  Fas

In [94]:
## Split the data into training, validation, and test sets (80% train, 10% val, 10% test)
train, val, test = np.split(dataframe_smoted.sample(frac=1), [int(0.8*len(dataframe_smoted)), int(0.9*len(dataframe_smoted))])

  return bound(*args, **kwds)


In [95]:
# Verify the sizes of each set
print(len(train), 'training samples')
print(len(val), 'validation samples')
print(len(test), 'test samples')

1368 training samples
171 validation samples
171 test samples


In [96]:
# Create TensorFlow datasets; batch size of 32, shuffle the training data, and prefetch for performance
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  df = dataframe_smoted.copy()
  labels = df.pop('target')
  df = {key: value.to_numpy()[:,tf.newaxis] for key, value in dataframe_smoted.items()}
  ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [97]:
# Create the training dataset, 
batch_size = 5
train_ds = df_to_dataset(train, batch_size=batch_size)

In [98]:
# Inspect a batch of the training dataset
train_ds.take(1)

<_TakeDataset element_spec=({'LBDAPBSI': TensorSpec(shape=(None, 1), dtype=tf.float64, name=None), 'BMXBMI': TensorSpec(shape=(None, 1), dtype=tf.float64, name=None), 'BMXWAIST': TensorSpec(shape=(None, 1), dtype=tf.float64, name=None), 'Systolic_BP': TensorSpec(shape=(None, 1), dtype=tf.float64, name=None), 'Diastolic_BP': TensorSpec(shape=(None, 1), dtype=tf.float64, name=None), 'RIDAGEYR': TensorSpec(shape=(None, 1), dtype=tf.int64, name=None), 'RIAGENDR': TensorSpec(shape=(None, 1), dtype=tf.int64, name=None), 'DXXSATA': TensorSpec(shape=(None, 1), dtype=tf.float64, name=None), 'DXXSATM': TensorSpec(shape=(None, 1), dtype=tf.float64, name=None), 'DXXVFATA': TensorSpec(shape=(None, 1), dtype=tf.float64, name=None), 'DXXVFATM': TensorSpec(shape=(None, 1), dtype=tf.float64, name=None), 'LBXGH': TensorSpec(shape=(None, 1), dtype=tf.float64, name=None), 'LBDGLUSI': TensorSpec(shape=(None, 1), dtype=tf.float64, name=None), 'LBDHDDSI': TensorSpec(shape=(None, 1), dtype=tf.float64, name=No

In [99]:
# Display the features contained in the batch
[(train_features, label_batch)] = train_ds.take(1)
print('Every feature:', list(train_features.keys()))

Every feature: ['LBDAPBSI', 'BMXBMI', 'BMXWAIST', 'Systolic_BP', 'Diastolic_BP', 'RIDAGEYR', 'RIAGENDR', 'DXXSATA', 'DXXSATM', 'DXXVFATA', 'DXXVFATM', 'LBXGH', 'LBDGLUSI', 'LBDHDDSI', 'LBXHSCRP', 'LBDINSI', 'LBDTCSI', 'LBDTRSI', 'LBDLDLSI', 'eLDL_Trig', 'Fasting_hrs', 'target']


In [100]:
# Display the first feature and label batch
print('A batch of targets:', label_batch )

A batch of targets: tf.Tensor([1 0 0 0 0], shape=(5,), dtype=int64)


In [101]:
# Normalize numerical features such as Age, Systolic_BP, Diastolic_BP, Fasting_Blood_Sugar, Triglycerides, HDL_Cholesterol, Waist_Circumference. 
# Because these features have different ranges, normalizing them helps the model learn more effectively.
def get_normalization_layer(name, dataset):
    normalizer = layers.Normalization(axis=None)
    feature_ds = dataset.map(lambda x, y: x[name])
    normalizer.adapt(feature_ds)
    return normalizer

In [102]:
# List of numeric features to be normalized
numeric_features = ['LBDAPBSI', 'BMXBMI', 'BMXWAIST', 'Systolic_BP', 'Diastolic_BP', 
                   'RIDAGEYR', 'RIAGENDR', 'DXXSATA', 'DXXSATM', 'DXXVFATA', 
                   'DXXVFATM', 'LBXGH', 'LBDGLUSI', 'LBDHDDSI', 'LBXHSCRP', 
                   'LBDINSI', 'LBDTCSI', 'LBDTRSI', 'LBDLDLSI', 'eLDL_Trig', 
                   'Fasting_hrs']

# Create input layers and normalization layers for each numeric feature
all_inputs = {}
encoded_features = []

# then for each numeric feature, create an input layer and a normalization layer
for header in numeric_features:
    numeric_col = tf.keras.Input(shape=(1,), name=header)
    normalization_layer = get_normalization_layer(header, train_ds)
    encoded_numeric_col = normalization_layer(numeric_col)
    all_inputs[header] = numeric_col
    encoded_features.append(encoded_numeric_col)

In [103]:
## Example usage of the normalization layer
bmi_count_col = train_features['BMXBMI']
layer = get_normalization_layer('BMXBMI', train_ds)
layer(bmi_count_col)

<tf.Tensor: shape=(5, 1), dtype=float32, numpy=
array([[ 0.07841249],
       [-0.7357946 ],
       [ 0.26090738],
       [-0.66560435],
       [-0.83406115]], dtype=float32)>

In [104]:

# Create a category encoding layer for categorical features

def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  # Create a layer that turns strings into integer indices.
  if dtype == 'string':
    index = layers.StringLookup(max_tokens=max_tokens)
  # Otherwise, create a layer that turns integer values into integer indices.
  else:
    index = layers.IntegerLookup(max_tokens=max_tokens)

  # Prepare a `tf.data.Dataset` that only yields the feature.
  feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the set of possible values and assign them a fixed integer index.
  index.adapt(feature_ds)

  # Encode the integer indices.
  encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())

  # Apply multi-hot encoding to the indices. The lambda function captures the
  # layer, so you can use them, or include them in the Keras Functional model later.
  return lambda feature: encoder(index(feature))

In [105]:

# Prepare the validation and test datasets.
batch_size = 256

# Create the training dataset, shuffle the training data to ensure randomness during training and 
# ensure performance with prefetching and batching to the specified batch size

train_ds = df_to_dataset(train, batch_size=batch_size)

# shuffle to ensure randomness during training but not during validation and testing
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [106]:
## Create input layers and normalization layers for each numeric feature
all_inputs = {}

encoded_features = []

# then for each numeric feature, create an input layer and a normalization layer
for header in ['LBDAPBSI', 'BMXBMI', 'BMXWAIST', 'Systolic_BP', 'Diastolic_BP', 'RIDAGEYR', 'RIAGENDR', 'DXXSATA',
       'DXXSATM', 'DXXVFATA', 'DXXVFATM', 'LBXGH', 'LBDGLUSI', 'LBDHDDSI', 'LBXHSCRP', 'LBDINSI', 'LBDTCSI',
       'LBDTRSI', 'LBDLDLSI', 'eLDL_Trig', 'Fasting_hrs']:
  numeric_col = tf.keras.Input(shape=(1,), name=header)
  normalization_layer = get_normalization_layer(header, train_ds)
  encoded_numeric_col = normalization_layer(numeric_col)
  all_inputs[header] = numeric_col
  encoded_features.append(encoded_numeric_col)

In [107]:
## Build the model using the Functional API
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(32, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.5)(x)
output = tf.keras.layers.Dense(1)(x)

model = tf.keras.Model(all_inputs, output)

In [108]:
# Compile the model with Adam optimizer for binary classification, using binary cross-entropy loss  for logits,
# and tracking accuracy and AUC as metrics.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=["accuracy", tf.keras.metrics.AUC(name='auc')],
              run_eagerly=True)

In [109]:
# in this case, we'll train for 100 epochs and use early stopping to prevent overfitting.
# We'll monitor the validation loss and stop training if it doesn't improve for 10 consecutive epochs.
# the parameters below include model.fit parameters such as epochs, validation data, and callbacks for early stopping.
import pydot 
tf.keras.utils.plot_model(model, show_shapes=True, show_layer_names=True, rankdir="LR")

You must install pydot (`pip install pydot`) for `plot_model` to work.


In [110]:
# Train the model, using the training dataset and validating on the validation dataset.
model.fit(train_ds, epochs=10, validation_data=val_ds)

Epoch 1/10
[1m3/7[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m0s[0m 31ms/step - accuracy: 0.6233 - auc: 0.6007 - loss: 0.6987 

Expected: {'LBDAPBSI': 'LBDAPBSI', 'BMXBMI': 'BMXBMI', 'BMXWAIST': 'BMXWAIST', 'Systolic_BP': 'Systolic_BP', 'Diastolic_BP': 'Diastolic_BP', 'RIDAGEYR': 'RIDAGEYR', 'RIAGENDR': 'RIAGENDR', 'DXXSATA': 'DXXSATA', 'DXXSATM': 'DXXSATM', 'DXXVFATA': 'DXXVFATA', 'DXXVFATM': 'DXXVFATM', 'LBXGH': 'LBXGH', 'LBDGLUSI': 'LBDGLUSI', 'LBDHDDSI': 'LBDHDDSI', 'LBXHSCRP': 'LBXHSCRP', 'LBDINSI': 'LBDINSI', 'LBDTCSI': 'LBDTCSI', 'LBDTRSI': 'LBDTRSI', 'LBDLDLSI': 'LBDLDLSI', 'eLDL_Trig': 'eLDL_Trig', 'Fasting_hrs': 'Fasting_hrs'}
Received: inputs={'LBDAPBSI': 'Tensor(shape=(256, 1))', 'BMXBMI': 'Tensor(shape=(256, 1))', 'BMXWAIST': 'Tensor(shape=(256, 1))', 'Systolic_BP': 'Tensor(shape=(256, 1))', 'Diastolic_BP': 'Tensor(shape=(256, 1))', 'RIDAGEYR': 'Tensor(shape=(256, 1))', 'RIAGENDR': 'Tensor(shape=(256, 1))', 'DXXSATA': 'Tensor(shape=(256, 1))', 'DXXSATM': 'Tensor(shape=(256, 1))', 'DXXVFATA': 'Tensor(shape=(256, 1))', 'DXXVFATM': 'Tensor(shape=(256, 1))', 'LBXGH': 'Tensor(shape=(256, 1))', 'LBDGLUSI

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.5988 - auc: 0.6249 - loss: 0.7002 - val_accuracy: 0.6462 - val_auc: 0.7212 - val_loss: 0.5742
Epoch 2/10


Expected: {'LBDAPBSI': 'LBDAPBSI', 'BMXBMI': 'BMXBMI', 'BMXWAIST': 'BMXWAIST', 'Systolic_BP': 'Systolic_BP', 'Diastolic_BP': 'Diastolic_BP', 'RIDAGEYR': 'RIDAGEYR', 'RIAGENDR': 'RIAGENDR', 'DXXSATA': 'DXXSATA', 'DXXSATM': 'DXXSATM', 'DXXVFATA': 'DXXVFATA', 'DXXVFATM': 'DXXVFATM', 'LBXGH': 'LBXGH', 'LBDGLUSI': 'LBDGLUSI', 'LBDHDDSI': 'LBDHDDSI', 'LBXHSCRP': 'LBXHSCRP', 'LBDINSI': 'LBDINSI', 'LBDTCSI': 'LBDTCSI', 'LBDTRSI': 'LBDTRSI', 'LBDLDLSI': 'LBDLDLSI', 'eLDL_Trig': 'eLDL_Trig', 'Fasting_hrs': 'Fasting_hrs'}
Received: inputs={'LBDAPBSI': 'Tensor(shape=(174, 1))', 'BMXBMI': 'Tensor(shape=(174, 1))', 'BMXWAIST': 'Tensor(shape=(174, 1))', 'Systolic_BP': 'Tensor(shape=(174, 1))', 'Diastolic_BP': 'Tensor(shape=(174, 1))', 'RIDAGEYR': 'Tensor(shape=(174, 1))', 'RIAGENDR': 'Tensor(shape=(174, 1))', 'DXXSATA': 'Tensor(shape=(174, 1))', 'DXXSATM': 'Tensor(shape=(174, 1))', 'DXXVFATA': 'Tensor(shape=(174, 1))', 'DXXVFATM': 'Tensor(shape=(174, 1))', 'LBXGH': 'Tensor(shape=(174, 1))', 'LBDGLUSI

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.6275 - auc: 0.6713 - loss: 0.6423 - val_accuracy: 0.6842 - val_auc: 0.7688 - val_loss: 0.5334
Epoch 3/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.6696 - auc: 0.7276 - loss: 0.5857 - val_accuracy: 0.7158 - val_auc: 0.7962 - val_loss: 0.5043
Epoch 4/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.6982 - auc: 0.7480 - loss: 0.5525 - val_accuracy: 0.7368 - val_auc: 0.8062 - val_loss: 0.4830
Epoch 5/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.7099 - auc: 0.7536 - loss: 0.5500 - val_accuracy: 0.7579 - val_auc: 0.8184 - val_loss: 0.4661
Epoch 6/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.7339 - auc: 0.7747 - loss: 0.5360 - val_accuracy: 0.7725 - val_auc: 0.8268 - val_loss: 0.4518
Epoch 7/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

<keras.src.callbacks.history.History at 0x305bd8ed0>

In [111]:
result = model.evaluate(test_ds, return_dict=True)
print(result)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.8035 - auc: 0.8567 - loss: 0.4092
{'accuracy': 0.8035087585449219, 'auc': 0.8567060232162476, 'loss': 0.4092298746109009}


In [112]:
model.save('my_mets_classifier.keras')
reloaded_model = tf.keras.models.load_model('my_mets_classifier.keras')

In [113]:
sample = {
    'LBDAPBSI': 1.65,
    'BMXBMI': 25.0,
    'BMXWAIST': 103.6,
    'Systolic_BP': 120,
    'Diastolic_BP': 80,
    'RIDAGEYR': 20,
    'RIAGENDR': 1,  # Female (typically 2=female, 1=male in medical datasets)
    'DXXSATA': 159.73,
    'DXXSATM': 800.18,
    'DXXVFATA': 49.66,
    'DXXVFATM': 239.42,
    'LBXGH': 7.2,
    'LBDGLUSI': 5.0,
    'LBDHDDSI': 1.4,
    'LBXHSCRP': 1.1,
    'LBDINSI': 200.82,
    'LBDTCSI': 3.78,
    'LBDTRSI': 2.0,
    'LBDLDLSI': 2.069,
    'eLDL_Trig': 0.265,
    'Fasting_hrs': 11.75
}

input_dict = {name: tf.convert_to_tensor([value]) for name, value in sample.items()}
predictions = reloaded_model.predict(input_dict)
prob = tf.nn.sigmoid(predictions[0])

print(
    "This particular patient had a %.1f percent probability "
    "of having metabolic syndrome." % (100 * prob)
)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
This particular patient had a 92.4 percent probability of having metabolic syndrome.
