<a href="https://colab.research.google.com/github/ammad19/AI-course-exercises/blob/master/Demo_of_FeatureSpace.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A new utility to make structured data preprocessing easier: `FeatureSpace`

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers

## Get datasets

Each dataset yields a tuple `(sample, label)` where `sample` is a dict of scalar
values (could be strings, ints, floats...)

In [None]:
# Download the heart dataset CSV over HTTPS rather than plaintext HTTP so the
# file cannot be altered in transit.
file_url = "https://storage.googleapis.com/download.tensorflow.org/data/heart.csv"
dataframe = pd.read_csv(file_url)
# Last expression of the cell: rich preview of the first rows.
dataframe.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0


In [None]:
# Hold out a reproducible (fixed seed) 20% sample of rows for validation...
val_dataframe = dataframe.sample(random_state=1337, frac=0.2)
# ...and train on every row that was not sampled.
train_dataframe = dataframe.drop(index=val_dataframe.index)

def dataframe_to_dataset(dataframe, shuffle=True):
    """Convert a pandas DataFrame into a `tf.data.Dataset` of `(sample, label)` pairs.

    Args:
        dataframe: DataFrame containing a "target" column with the labels. The
            frame is copied first, so the caller's object is not modified.
        shuffle: Whether to shuffle the elements with a buffer spanning the
            whole frame. Defaults to True (the previous, unconditional
            behavior). Pass False to keep row order, e.g. when predictions
            must line up with the source rows.

    Returns:
        An unbatched dataset whose elements are `(sample, label)`, where
        `sample` is a dict keyed by the remaining column names.
    """
    dataframe = dataframe.copy()
    # `pop` removes the label column so it does not leak into the features.
    labels = dataframe.pop("target")
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    return ds


# Build the labeled datasets for both splits (train first, then validation).
train_ds, val_ds = (
    dataframe_to_dataset(split) for split in (train_dataframe, val_dataframe)
)

# Drop the labels: keep only the feature dict of each training element.
train_ds_no_labels = train_ds.map(lambda features, _label: features)

## Set up preprocessing / encoding

In [None]:
from keras.utils import FeatureSpace

# Declare how every column is encoded. Key order is kept identical to the
# column-by-column listing: integer categoricals, "thal", "age", the
# normalized floats, then "slope".
feature_space = FeatureSpace(
    features={
        # Categorical features encoded as integers
        **dict.fromkeys(
            ("sex", "cp", "fbs", "restecg", "exang", "ca"),
            "integer_categorical",
        ),
        # Categorical feature encoded as string
        "thal": "string_categorical",
        # Numerical features to discretize
        "age": "float_discretized",
        # Numerical features to normalize
        **dict.fromkeys(
            ("trestbps", "chol", "thalach", "oldpeak"),
            "float_normalized",
        ),
        # Numerical features to keep unchanged
        "slope": "float",
    },
    # Feature crosses, each hashed into `crossing_dim` buckets.
    crosses=[("sex", "age"), ("thal", "ca")],
    crossing_dim=1024,
    # Return all encoded features as one concatenated vector.
    output_mode="concat",
)

In [None]:
# Adapt the feature space on the label-free training samples. The log below
# shows one line per adapted feature; "slope" (a plain "float") is not listed.
feature_space.adapt(train_ds_no_labels)

...Adapt sex
...Adapt cp
...Adapt fbs
...Adapt restecg
...Adapt exang
...Adapt ca
...Adapt thal
...Adapt age
...Adapt trestbps
...Adapt chol
...Adapt thalach
...Adapt oldpeak


In [None]:
# Display the symbolic inputs: per the output, a dict with one KerasTensor per
# feature name.
feature_space.get_inputs()

{'sex': <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'sex')>,
 'cp': <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'cp')>,
 'fbs': <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'fbs')>,
 'restecg': <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'restecg')>,
 'exang': <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'exang')>,
 'ca': <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'ca')>,
 'thal': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'thal')>,
 'age': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'age')>,
 'trestbps': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'trestbps')>,
 'chol': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'chol')>,
 'thalach': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'thalach')>,
 'oldpeak': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'oldpeak')>,
 'slope': <KerasTensor: sha

In [None]:
# Display the encoded output: with output_mode="concat" this is a single
# KerasTensor (see the output below).
feature_space.get_encoded_features()

<KerasTensor: shape=(None, 2115) dtype=float32 (created by layer 'concatenate_24')>

In [None]:
# BATCHED CALL
# Pull a single batch of 32 samples and encode it in one call.
x = next(iter(train_ds_no_labels.batch(32).take(1)))
y = feature_space(x)
print(y.shape)

# UNBATCHED CALL
# Pull a single sample (a dict of scalars) and encode it directly.
x = next(iter(train_ds_no_labels.take(1)))
y = feature_space(x)
print(y.shape)

(32, 2115)
(2115,)


## Build and train model (sync preprocessing)

In [None]:
# Symbolic inputs and encoded outputs of the adapted feature space.
inputs = feature_space.get_inputs()
encoded_features = feature_space.get_encoded_features()

# Single sigmoid unit on top of the encoded features.
outputs = layers.Dense(1, activation="sigmoid")(encoded_features)

# The model maps the raw feature dict to the output, so the preprocessing
# is part of the model itself ("sync" preprocessing).
model = keras.Model(inputs, outputs)

model.compile(optimizer="adam", loss="binary_crossentropy")
# `fit` returns a training History, not predictions, so it gets its own name
# instead of being stored in `preds` and immediately overwritten.
history = model.fit(train_ds.batch(32), epochs=10)

# NOTE: val_ds is shuffled when it is built, so the order of `preds` does not
# follow the row order of `val_dataframe`.
preds = model.predict(val_ds.batch(32))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Build and train model (async preprocessing in tf.data)

In [None]:
inputs = feature_space.get_inputs()
encoded_features = feature_space.get_encoded_features()

outputs = layers.Dense(1, activation="sigmoid")(encoded_features)

# Training model: starts from the already-encoded features, because the
# encoding is applied in the tf.data pipeline below.
training_model = keras.Model(encoded_features, outputs)
training_model.compile(optimizer="adam", loss="binary_crossentropy")
# Inference model: same output tensor, but fed with the raw feature dict.
inference_model = keras.Model(inputs, outputs)

# Encode samples inside the input pipeline, with parallel map calls so the
# preprocessing is not a sequential bottleneck.
preprocessed_train_ds = train_ds.map(
    lambda x, y: (feature_space(x), y),
    num_parallel_calls=tf.data.AUTOTUNE,
)
# Prefetch batches so preprocessing overlaps with the training steps; without
# this, the pipeline is consumed in lockstep with training.
training_model.fit(
    preprocessed_train_ds.batch(32).prefetch(tf.data.AUTOTUNE),
    epochs=2,
)

preds = inference_model.predict(val_ds.batch(32))

<MapDataset element_spec=(TensorSpec(shape=(131,), dtype=tf.float32, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>
Epoch 1/2
Epoch 2/2


## Demo of `output_mode=None` (return dict of encoded features)

In [None]:
# Same feature declarations as before, but the encoded features are returned
# as a dict keyed by feature name instead of one concatenated vector.
feature_space = FeatureSpace(
    features={
        # Categorical features encoded as integers
        "sex": "integer_categorical",
        "cp": "integer_categorical",
        "fbs": "integer_categorical",
        "restecg": "integer_categorical",
        "exang": "integer_categorical",
        "ca": "integer_categorical",

        # Categorical feature encoded as string
        "thal": "string_categorical",

        # Numerical features to discretize
        "age": "float_discretized",

        # Numerical features to normalize
        "trestbps": "float_normalized",
        "chol": "float_normalized",
        "thalach": "float_normalized",
        "oldpeak": "float_normalized",

        # Numerical features to keep unchanged
        "slope": "float",
    },
    # Crosses are only allowed between categorical (integer-encoded) features.
    crosses=[("sex", "age"), ("thal", "ca")],
    crossing_dim=1024,
    # "dict" is the documented value for returning a dict of encoded features;
    # this cell previously passed None, which the documented API does not list.
    output_mode="dict",
)

In [None]:
# Adapt the newly created feature space on the label-free training samples
# (the log below shows one line per adapted feature).
feature_space.adapt(train_ds_no_labels)

...Adapt sex
...Adapt cp
...Adapt fbs
...Adapt restecg
...Adapt exang
...Adapt ca
...Adapt thal
...Adapt age
...Adapt trestbps
...Adapt chol
...Adapt thalach
...Adapt oldpeak


In [None]:
# Display the encoded output: per the output below, in this mode it is a dict
# of per-feature KerasTensors rather than a single tensor.
feature_space.get_encoded_features()

{'sex': <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'sex_preprocessor')>,
 'cp': <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'cp_preprocessor')>,
 'fbs': <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'fbs_preprocessor')>,
 'restecg': <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'restecg_preprocessor')>,
 'exang': <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'exang_preprocessor')>,
 'ca': <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'ca_preprocessor')>,
 'thal': <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'thal_preprocessor')>,
 'age': <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'age_preprocessor')>,
 'trestbps': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'trestbps_preprocessor')>,
 'chol': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'chol_preprocessor')>,
 'thalach': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 

In [None]:
# BATCHED CALL
for x in train_ds_no_labels.batch(32).take(1):
  pass
y = feature_space(x)
print(y["trestbps"].shape)

# UNBATCHED CALL
for x in train_ds_no_labels.take(1):
  pass
y = feature_space(x)
print(y["trestbps"].shape)

(32, 1)
(1,)


## Demo of lower-level usage mode: configure each feature


A lower-level variant is also available in case you need to further configure each encoding step:

In [None]:
# Lower-level declaration: every feature gets its own configuration object,
# so individual encoders can be tuned (e.g. `max_tokens`, `num_bins`).
feature_space = FeatureSpace(
    features={
        # Categorical features encoded as integers (a separate config object
        # is created for each feature name)
        **{
            name: FeatureSpace.integer_categorical()
            for name in ("sex", "cp", "fbs", "restecg", "exang", "ca")
        },
        # Categorical feature encoded as string
        "thal": FeatureSpace.string_categorical(max_tokens=10),
        # Numerical features to discretize
        "age": FeatureSpace.float_discretized(num_bins=32),
        # Numerical features to normalize
        **{
            name: FeatureSpace.float_normalized()
            for name in ("trestbps", "chol", "thalach", "oldpeak")
        },
        # Numerical features to keep unchanged
        "slope": FeatureSpace.float(),
    },
    # Each cross is configured individually with its own hashing dimension.
    crosses=[
        FeatureSpace.cross(pair, crossing_dim=32)
        for pair in (("sex", "age"), ("thal", "ca"))
    ],
    # NOTE(review): the original comment listed `one_hot_concat` and `None` as
    # alternative modes; confirm against the installed Keras version.
    output_mode="concat",
)

In [None]:
# Adapt the individually configured feature space on the label-free training
# samples (the log below shows one line per adapted feature).
feature_space.adapt(train_ds_no_labels)

...Adapt sex
...Adapt cp
...Adapt fbs
...Adapt restecg
...Adapt exang
...Adapt ca
...Adapt thal
...Adapt age
...Adapt trestbps
...Adapt chol
...Adapt thalach
...Adapt oldpeak


In [None]:
# Display the symbolic inputs: per the output, a dict with one KerasTensor per
# feature name.
feature_space.get_inputs()

{'sex': <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'sex')>,
 'cp': <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'cp')>,
 'fbs': <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'fbs')>,
 'restecg': <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'restecg')>,
 'exang': <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'exang')>,
 'ca': <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'ca')>,
 'thal': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'thal')>,
 'age': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'age')>,
 'trestbps': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'trestbps')>,
 'chol': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'chol')>,
 'thalach': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'thalach')>,
 'oldpeak': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'oldpeak')>,
 'slope': <KerasTensor: sha

In [None]:
# Display the encoded output: a single concatenated KerasTensor; per the
# output below it is 131 wide with these settings.
feature_space.get_encoded_features()

<KerasTensor: shape=(None, 131) dtype=float32 (created by layer 'concatenate_25')>