<a href="https://colab.research.google.com/github/amitrawat158/StructuredDataClassificationWithFeatureSpace/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow



In [None]:
import tensorflow as tf
import pandas as pd
from tensorflow import keras

In [None]:
# Heart-disease CSV published by the TensorFlow team. It is fetched over HTTPS
# rather than plain HTTP so the transfer is encrypted and the host is verified.
file_url = "https://storage.googleapis.com/download.tensorflow.org/data/heart.csv"
dataframe = pd.read_csv(file_url)

In [None]:
# Show the (rows, columns) shape of the loaded table.
print(dataframe.shape)

(303, 14)


In [None]:
# Preview the first rows of the table (head() returns five rows by default).
dataframe.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0


In [None]:
# Hold out a reproducible 20% random sample for validation; every remaining
# row is used for training.
val_dataframe = dataframe.sample(frac=0.2, random_state=1337)
train_dataframe = dataframe.drop(val_dataframe.index)

# drop() removes *every* row that carries a sampled index label, so the split
# only partitions the table when index labels are unique. Fail loudly otherwise
# instead of silently training on fewer rows.
assert len(train_dataframe) + len(val_dataframe) == len(dataframe), (
    "train/validation split does not partition the dataframe; "
    "check for duplicate index labels"
)

print(
    f"Using {len(train_dataframe)} samples for training "
    f"and {len(val_dataframe)} for validation"
)


Using 242 samples for training and 61 for validation


In [None]:
def dataframe_to_dataset(dataframe, label_column="target", shuffle=True, seed=None):
    """Convert a pandas DataFrame into a `tf.data.Dataset` of (features, label) pairs.

    Args:
        dataframe: Table holding the feature columns plus the label column.
            It is copied first, so the caller's frame is left untouched.
        label_column: Name of the column to split off as the label.
        shuffle: Whether to shuffle the examples, using a buffer that spans
            the whole table.
        seed: Optional seed forwarded to `Dataset.shuffle` for a reproducible
            shuffle order.

    Returns:
        An unbatched dataset whose elements are
        `({column_name: value, ...}, label)` tuples.

    Raises:
        KeyError: If `label_column` is not a column of `dataframe`.
    """
    features = dataframe.copy()
    labels = features.pop(label_column)
    ds = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    if shuffle:
        # The shuffle buffer size must be positive, even for a zero-row frame.
        ds = ds.shuffle(buffer_size=max(len(features), 1), seed=seed)
    return ds


train_ds = dataframe_to_dataset(train_dataframe)
# Evaluation metrics do not depend on example order, so skip the shuffle here.
val_ds = dataframe_to_dataset(val_dataframe, shuffle=False)

In [None]:
# Peek at a single element of the (still unbatched) training dataset: the
# feature dict and its label.
for x, y in train_ds.take(1):
    print("Input:", x)
    print("Target:", y)

Input: {'age': <tf.Tensor: shape=(), dtype=int64, numpy=29>, 'sex': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'cp': <tf.Tensor: shape=(), dtype=int64, numpy=2>, 'trestbps': <tf.Tensor: shape=(), dtype=int64, numpy=130>, 'chol': <tf.Tensor: shape=(), dtype=int64, numpy=204>, 'fbs': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'restecg': <tf.Tensor: shape=(), dtype=int64, numpy=2>, 'thalach': <tf.Tensor: shape=(), dtype=int64, numpy=202>, 'exang': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'oldpeak': <tf.Tensor: shape=(), dtype=float64, numpy=0.0>, 'slope': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'ca': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'thal': <tf.Tensor: shape=(), dtype=string, numpy=b'normal'>}
Target: tf.Tensor(0, shape=(), dtype=int64)


In [None]:
# Group the examples into batches of 32.
# NOTE: both names are rebound to their batched versions, so running this cell
# a second time would batch the already-batched datasets again.
train_ds = train_ds.batch(32)
val_ds = val_ds.batch(32)

In [None]:
from keras.utils import FeatureSpace

# Declarative description of how each raw column is preprocessed, using the
# string shorthand for every feature type.
# NOTE: the next cell rebinds `feature_space` to a more finely configured
# instance, so this one is replaced before it is ever adapted.
feature_space = FeatureSpace(
    features={
        # Categorical features encoded as integers
        "sex": "integer_categorical",
        "cp": "integer_categorical",
        "fbs": "integer_categorical",
        "restecg": "integer_categorical",
        "exang": "integer_categorical",
        "ca": "integer_categorical",
        # Categorical feature encoded as string
        "thal": "string_categorical",
        # Numerical feature to discretize
        "age": "float_discretized",
        # Numerical features to normalize
        "trestbps": "float_normalized",
        "chol": "float_normalized",
        "thalach": "float_normalized",
        "oldpeak": "float_normalized",
        "slope": "float_normalized",
    },
    # We create additional features by hashing
    # value co-occurrences for the
    # following groups of categorical features.
    crosses=[("sex", "age"), ("thal", "ca")],
    # The hashing space for these co-occurrences
    # will be 32-dimensional.
    crossing_dim=32,
    # Our utility will one-hot encode all categorical
    # features and concat all features into a single
    # vector (one vector per sample).
    output_mode="concat",
)


In [None]:
# Same feature layout as the shorthand version, but every feature is built
# through the FeatureSpace factory methods so its options can be set
# explicitly. This is the instance that gets adapted and used from here on.
feature_space = FeatureSpace(
    features={
        # Categorical features encoded as integers
        # (num_oov_indices=0: no out-of-vocabulary index is reserved)
        "sex": FeatureSpace.integer_categorical(num_oov_indices=0),
        "cp": FeatureSpace.integer_categorical(num_oov_indices=0),
        "fbs": FeatureSpace.integer_categorical(num_oov_indices=0),
        "restecg": FeatureSpace.integer_categorical(num_oov_indices=0),
        "exang": FeatureSpace.integer_categorical(num_oov_indices=0),
        "ca": FeatureSpace.integer_categorical(num_oov_indices=0),
        # Categorical feature encoded as string
        "thal": FeatureSpace.string_categorical(num_oov_indices=0),
        # Numerical feature to discretize, here into 30 bins
        "age": FeatureSpace.float_discretized(num_bins=30),
        # Numerical features to normalize
        "trestbps": FeatureSpace.float_normalized(),
        "chol": FeatureSpace.float_normalized(),
        "thalach": FeatureSpace.float_normalized(),
        "oldpeak": FeatureSpace.float_normalized(),
        "slope": FeatureSpace.float_normalized(),
    },
    # Specify feature crosses, each with its own crossing dim
    # (64 for sex x age, 16 for thal x ca).
    crosses=[
        FeatureSpace.cross(feature_names=("sex", "age"), crossing_dim=64),
        FeatureSpace.cross(
            feature_names=("thal", "ca"),
            crossing_dim=16,
        ),
    ],
    # Concatenate all encoded features into a single vector per sample.
    output_mode="concat",
)

In [None]:
# Keep only the feature dicts: the labels are dropped before adapting.
train_ds_with_no_labels = train_ds.map(lambda x, _: x)
# Fit the feature space's preprocessing state on the training features only
# (the validation data is not used here).
feature_space.adapt(train_ds_with_no_labels)

In [None]:
# Sanity check: run one training batch through the adapted feature space and
# report the shape and dtype of the encoded result.
for x, _ in train_ds.take(1):
    preprocessed_x = feature_space(x)
    print("preprocessed_x.shape:", preprocessed_x.shape)
    print("preprocessed_x.dtype:", preprocessed_x.dtype)

preprocessed_x.shape: (32, 138)
preprocessed_x.dtype: <dtype: 'float32'>


In [None]:
def _encode_batch(features, label):
    """Run a batch of raw features through `feature_space`; pass the label through."""
    return feature_space(features), label


# Apply the preprocessing inside the tf.data pipeline: map in parallel, then
# prefetch so encoded batches are prepared ahead of the consumer.
preprocessed_train_ds = train_ds.map(
    _encode_batch, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)

preprocessed_val_ds = val_ds.map(
    _encode_batch, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)

In [None]:
# Symbolic Keras inputs for the raw features, as exposed by the feature space.
dict_inputs = feature_space.get_inputs()
# Symbolic result of the feature space's preprocessing applied to those inputs.
encoded_features = feature_space.get_encoded_features()

# Small classifier head: one hidden layer, dropout, and a single sigmoid unit
# for the binary target.
x = keras.layers.Dense(32, activation="relu")(encoded_features)
x = keras.layers.Dropout(0.5)(x)
predictions = keras.layers.Dense(1, activation="sigmoid")(x)

# Training model: starts at the encoded features, so it must be fed data that
# has already been passed through feature_space.
training_model = keras.Model(inputs=encoded_features, outputs=predictions)
training_model.compile(
    optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]
)

# Inference model: starts at the raw feature inputs and ends at the same
# `predictions` tensor, so it uses the very layers trained above.
inference_model = keras.Model(inputs=dict_inputs, outputs=predictions)

In [None]:
# Train on the already-encoded datasets. Binding the returned History keeps the
# per-epoch metrics available to later cells and stops its repr from being
# echoed as the cell output.
history = training_model.fit(
    preprocessed_train_ds, epochs=20, validation_data=preprocessed_val_ds, verbose=2
)

Epoch 1/20
8/8 - 1s - loss: 0.6604 - accuracy: 0.6405 - val_loss: 0.5970 - val_accuracy: 0.7705 - 1s/epoch - 138ms/step
Epoch 2/20
8/8 - 0s - loss: 0.5908 - accuracy: 0.7231 - val_loss: 0.5529 - val_accuracy: 0.7705 - 307ms/epoch - 38ms/step
Epoch 3/20
8/8 - 0s - loss: 0.5535 - accuracy: 0.7645 - val_loss: 0.5161 - val_accuracy: 0.7869 - 297ms/epoch - 37ms/step
Epoch 4/20
8/8 - 0s - loss: 0.5345 - accuracy: 0.7810 - val_loss: 0.4870 - val_accuracy: 0.8033 - 227ms/epoch - 28ms/step
Epoch 5/20
8/8 - 0s - loss: 0.4978 - accuracy: 0.7727 - val_loss: 0.4632 - val_accuracy: 0.8033 - 193ms/epoch - 24ms/step
Epoch 6/20
8/8 - 0s - loss: 0.4653 - accuracy: 0.7851 - val_loss: 0.4437 - val_accuracy: 0.8033 - 202ms/epoch - 25ms/step
Epoch 7/20
8/8 - 0s - loss: 0.4406 - accuracy: 0.8140 - val_loss: 0.4267 - val_accuracy: 0.8033 - 191ms/epoch - 24ms/step
Epoch 8/20
8/8 - 0s - loss: 0.4220 - accuracy: 0.8430 - val_loss: 0.4137 - val_accuracy: 0.8033 - 204ms/epoch - 26ms/step
Epoch 9/20
8/8 - 0s - loss

<keras.src.callbacks.History at 0x7d9fc4326a70>

In [None]:
# One hand-written patient record, keyed by the feature names declared in the
# feature space.
sample = {
    "age": 60,
    "sex": 1,
    "cp": 1,
    "trestbps": 145,
    "chol": 233,
    "fbs": 1,
    "restecg": 2,
    "thalach": 150,
    "exang": 0,
    "oldpeak": 2.3,
    "slope": 3,
    "ca": 0,
    "thal": "fixed",
}

# Wrap every value in a one-element list so each feature becomes a batch of one.
input_dict = {name: tf.convert_to_tensor([value]) for name, value in sample.items()}
# NOTE: this rebinds `predictions`, which until now named the model's symbolic
# output tensor, to the array returned by predict().
predictions = inference_model.predict(input_dict)

# predictions[0][0] is the model's single sigmoid output for the one sample.
print(
    f"This particular patient had a {100 * predictions[0][0]:.2f}% probability "
    "of having a heart disease, as evaluated by our model."
)

This particular patient had a 54.70% probability of having a heart disease, as evaluated by our model.


In [None]:
# Example records for batch inference. Patient 1 repeats the single-sample
# example (age 60); patients 2 and 3 carry the values of rows 1 and 2 of the
# heart-disease table.
# `exang` is a 0/1 integer category and must stay an int: one stray float would
# turn the whole `exang` batch into a float tensor when the lists are converted.
patients = [
    {
        "age": 60, "sex": 1, "cp": 1, "trestbps": 145, "chol": 233,
        "fbs": 1, "restecg": 2, "thalach": 150, "exang": 0,
        "oldpeak": 2.3, "slope": 3, "ca": 0, "thal": "fixed",
    },
    {
        "age": 67, "sex": 1, "cp": 4, "trestbps": 160, "chol": 286,
        "fbs": 0, "restecg": 2, "thalach": 108, "exang": 1,
        "oldpeak": 1.5, "slope": 2, "ca": 3, "thal": "normal",
    },
    {
        "age": 67, "sex": 1, "cp": 4, "trestbps": 120, "chol": 229,
        "fbs": 0, "restecg": 2, "thalach": 129, "exang": 1,
        "oldpeak": 2.6, "slope": 2, "ca": 2, "thal": "reversible",
    },
    # Add more patient feature dictionaries as needed
]


In [None]:
# Every record must expose exactly the features of the first one. Checking up
# front reports a malformed record by its position instead of failing with a
# bare IndexError/KeyError, and stops extra keys from being silently dropped.
if not patients:
    raise ValueError("`patients` must contain at least one feature dictionary.")

feature_names = list(patients[0])
for position, patient in enumerate(patients, start=1):
    if set(patient) != set(feature_names):
        raise ValueError(
            f"Patient {position} has features {sorted(patient)}, "
            f"expected {sorted(feature_names)}."
        )

# Create a dictionary where each key corresponds to a feature name
# and the values are lists of feature values for all patients
input_dict = {
    name: [patient[name] for patient in patients] for name in feature_names
}

# Convert the feature values to TensorFlow tensors
input_tensors = {name: tf.convert_to_tensor(values) for name, values in input_dict.items()}

# Make predictions for all patients
predictions = inference_model.predict(input_tensors)

# Print the probabilities for each patient; each row of `predictions` holds
# that patient's single model output.
for patient_number, prediction in enumerate(predictions, start=1):
    probability = 100 * prediction[0]
    print(f"Patient {patient_number}: Probability of having heart disease: {probability:.2f}%")


Patient 1: Probability of having heart disease: 54.70%
Patient 2: Probability of having heart disease: 73.42%
Patient 3: Probability of having heart disease: 89.78%
