# Classifying Structured Data using Keras Preprocessing Layers

Learning Objectives:
1. Load a CSV file using pandas
2. Build an input pipeline to batch and shuffle the rows using tf.data.
3. Map from columns in the CSV to features used to train the model using Keras Preprocessing layers.
4. Build, train, and evaluate a model using Keras.


In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

In [2]:
tf.__version__

'2.12.0'

In [3]:
import pathlib

dataset_url = 'http://storage.googleapis.com/download.tensorflow.org/data/petfinder-mini.zip'

# get_file is called with cache_dir='.' and extract=True, so the archive is
# downloaded to and unpacked under ./datasets (get_file's default
# cache_subdir). The CSV therefore lives inside that tree rather than in the
# current working directory.
csv_file = 'datasets/petfinder-mini/petfinder-mini.csv'

tf.keras.utils.get_file('petfinder-mini.zip', dataset_url, extract=True, cache_dir='.')

# Load the extracted CSV into a pandas DataFrame.
dataframe = pd.read_csv(csv_file)

In [4]:
dataframe.head()

Unnamed: 0,Type,Age,Breed1,Gender,Color1,Color2,MaturitySize,FurLength,Vaccinated,Sterilized,Health,Fee,Description,PhotoAmt,AdoptionSpeed
0,Cat,3,Tabby,Male,Black,White,Small,Short,No,No,Healthy,100,Nibble is a 3+ month old ball of cuteness. He ...,1,2
1,Cat,1,Domestic Medium Hair,Male,Black,Brown,Medium,Medium,Not Sure,Not Sure,Healthy,0,I just found it alone yesterday near my apartm...,2,0
2,Dog,1,Mixed Breed,Male,Brown,White,Medium,Medium,Yes,No,Healthy,0,Their pregnant mother was dumped by her irresp...,7,3
3,Dog,4,Mixed Breed,Female,Black,Brown,Medium,Short,Yes,No,Healthy,150,"Good guard dog, very alert, active, obedience ...",8,2
4,Dog,1,Mixed Breed,Male,Black,No Color,Medium,Short,No,No,Healthy,0,This handsome yet cute boy is up for adoption....,3,2


In [5]:
dataframe.describe()

Unnamed: 0,Age,Fee,PhotoAmt,AdoptionSpeed
count,11537.0,11537.0,11537.0,11537.0
mean,11.743434,23.957268,3.610211,2.486522
std,19.324221,80.024226,3.145872,1.173275
min,0.0,0.0,0.0,0.0
25%,2.0,0.0,2.0,2.0
50%,4.0,0.0,3.0,2.0
75%,12.0,0.0,5.0,4.0
max,255.0,2000.0,30.0,4.0


In [6]:
# Binary label: 0 where AdoptionSpeed is 4, 1 for every other value
# (presumably 4 marks pets that were not adopted -- confirm against the
# dataset description).
not_adopted = dataframe['AdoptionSpeed'] == 4
dataframe['target'] = np.where(not_adopted, 0, 1)

# The raw speed column is now encoded in 'target', and the free-text
# description is not used as a feature, so drop both.
dataframe = dataframe.drop(columns=['AdoptionSpeed', 'Description'])

In [7]:
# Hold out 20% of the rows for testing, then 20% of what remains for
# validation.
train, test = train_test_split(dataframe, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)

# Report the size of each split.
for subset, label in (
    (train, 'train examples'),
    (val, 'validation examples'),
    (test, 'test examples'),
):
    print(len(subset), label)


7383 train examples
1846 validation examples
2308 test examples


### Create an input pipeline using tf.data

In this case the data fits in memory, so there is no need to stream it from disk.

In [8]:
def df_to_dataset(dataframe, shuffle=True, batch_size=3):
    """Convert a DataFrame with a 'target' column into a batched tf.data.Dataset.

    Args:
        dataframe: pandas DataFrame holding the feature columns plus a
            'target' column. It is copied first, so the caller's frame is
            left untouched.
        shuffle: when true, shuffle the rows using a buffer as large as the
            whole frame.
        batch_size: number of rows per batch; also used as the prefetch
            buffer size.

    Returns:
        A dataset yielding (features, labels) pairs, where features is a
        dict keyed by column name.
    """
    features = dataframe.copy()
    targets = features.pop('target')

    dataset = tf.data.Dataset.from_tensor_slices((dict(features), targets))
    if shuffle:
        # A buffer covering every row mixes the whole dataset.
        dataset = dataset.shuffle(buffer_size=len(features))

    return dataset.batch(batch_size).prefetch(batch_size)

In [9]:
# Small batch size, presumably so the example batch printed in the next cell
# stays short.
batch_size = 5

# Shuffled training dataset (df_to_dataset shuffles by default).
train_ds = df_to_dataset(train, batch_size=batch_size)

In [10]:
# Pull a single (features, labels) batch out of the training dataset so it
# can be inspected.
train_features, label_batch = next(iter(train_ds.take(1)))

print('Every feature:', list(train_features.keys()))
print('A batch of ages:', train_features['Age'])
print('A batch of targets:', label_batch)

Every feature: ['Type', 'Age', 'Breed1', 'Gender', 'Color1', 'Color2', 'MaturitySize', 'FurLength', 'Vaccinated', 'Sterilized', 'Health', 'Fee', 'PhotoAmt']
A batch of ages: tf.Tensor([ 1 12 60  3  3], shape=(5,), dtype=int64)
A batch of targets: tf.Tensor([0 0 0 0 1], shape=(5,), dtype=int32)


### Demonstrate the use of preprocessing layers

We will use four preprocessing layers:
- Normalization - Feature-wise normalization of the data
- CategoryEncoding - Category encoding layer
- StringLookup - Maps strings from vocabulary to integer indices
- IntegerLookup - Maps integers from a vocabulary to integer indices

#### Numeric columns

For numeric columns we use normalization to make sure that the mean of the data is 0 and the standard deviation is 1.

In [11]:
def get_normalization_layer(name, dataset):
    """Build a Normalization layer adapted to one feature of `dataset`.

    Args:
        name: key of the feature inside each element's feature dict.
        dataset: tf.data.Dataset yielding (features, label) pairs.

    Returns:
        A Normalization layer (axis=None) whose statistics were learned from
        the values of the named feature.
    """
    # Keep only the requested feature; the labels are not needed here.
    column_values = dataset.map(lambda features, _label: features[name])

    layer = preprocessing.Normalization(axis=None)
    layer.adapt(column_values)
    return layer

In [12]:
# Normalize the 'PhotoAmt' values of the sample batch with a layer adapted on
# the training dataset.
photo_count_col = train_features['PhotoAmt']
layer = get_normalization_layer('PhotoAmt', train_ds)

layer(photo_count_col)

<tf.Tensor: shape=(5,), dtype=float32, numpy=
array([-0.8271376 ,  0.11410153, -0.51339126,  0.11410153, -0.51339126],
      dtype=float32)>

#### Categorical columns

Represent categorical values (strings or integers) as one-hot vectors.

In [13]:
def get_categorical_encoding_layer(name, dataset, dtype, max_tokens=None):
    """Build a callable that category-encodes one feature of `dataset`.

    Args:
        name: key of the feature inside each element's feature dict.
        dataset: tf.data.Dataset yielding (features, label) pairs.
        dtype: 'string' selects a StringLookup; any other value selects an
            IntegerLookup.
        max_tokens: forwarded to the lookup layer to cap its vocabulary.

    Returns:
        A function that maps a feature tensor to lookup indices and then
        passes those indices through a CategoryEncoding layer.
    """
    lookup_cls = (
        preprocessing.StringLookup if dtype == 'string'
        else preprocessing.IntegerLookup
    )
    lookup = lookup_cls(max_tokens=max_tokens)

    # Learn the vocabulary from the requested feature only.
    lookup.adapt(dataset.map(lambda features, _label: features[name]))

    # Size the encoder to the vocabulary that was actually learned.
    one_hot = preprocessing.CategoryEncoding(num_tokens=lookup.vocabulary_size())

    def encode(feature):
        return one_hot(lookup(feature))

    return encode

In [14]:
# Encode the 'Type' strings of the sample batch with a StringLookup-based
# encoder adapted on the training dataset.
# NOTE(review): the batch is rank-1, and the printed result is one vector for
# the whole batch rather than one row per example -- confirm this is intended.
type_col = train_features['Type']
layer = get_categorical_encoding_layer('Type', train_ds, 'string')

layer(type_col)

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([0., 1., 1.], dtype=float32)>

In [15]:
# Encode the integer 'Age' values of the sample batch; the lookup is built
# with max_tokens=5.
# NOTE(review): `type_col` is reused here although it now holds 'Age' values.
type_col = train_features['Age']
category_encoding_layer = get_categorical_encoding_layer('Age', train_ds, 'int64', 5)

category_encoding_layer(type_col)

<tf.Tensor: shape=(5,), dtype=float32, numpy=array([1., 0., 1., 1., 0.], dtype=float32)>

#### Choose which columns to use

In [16]:
# Rebuild the datasets with a larger batch size for training. Only the
# training split is shuffled; validation and test keep their row order.
batch_size = 256
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [17]:
# Parallel lists: the symbolic model inputs and their preprocessed outputs.
all_inputs, encoded_features = [], []

# Numeric features are fed through a Normalization layer adapted on train_ds.
for header in ('PhotoAmt', 'Fee'):
    numerical_col = tf.keras.Input(shape=(1,), name=header)
    all_inputs.append(numerical_col)

    normalization_layer = get_normalization_layer(header, train_ds)
    encoded_numeric_col = normalization_layer(numerical_col)
    encoded_features.append(encoded_numeric_col)

In [18]:
# 'Age' is an integer column treated as categorical: its values go through an
# IntegerLookup (vocabulary capped with max_tokens=5) followed by category
# encoding, and the result joins the other encoded features.
age_col = tf.keras.Input(shape=(1,), name='Age', dtype='int64')
encoding_layer = get_categorical_encoding_layer('Age',train_ds, dtype='int64', max_tokens=5)
encoded_age_col = encoding_layer(age_col)
all_inputs.append(age_col)
encoded_features.append(encoded_age_col)

In [19]:
# String-valued columns that are category-encoded before reaching the model.
categorical_cols = [
    'Type', 'Color1', 'Color2', 'Gender', 'MaturitySize',
    'FurLength', 'Vaccinated', 'Sterilized', 'Health', 'Breed1',
]

for header in categorical_cols:
    categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
    all_inputs.append(categorical_col)

    # Each column gets its own lookup, limited to 5 vocabulary entries.
    encoding_layer = get_categorical_encoding_layer(
        header, train_ds, dtype='string', max_tokens=5
    )
    encoded_categorical_col = encoding_layer(categorical_col)
    encoded_features.append(encoded_categorical_col)

### Create, compile, and train the model

In [20]:
# Join every encoded feature into one vector, then apply a small dense
# network with dropout and a single-unit output.
all_features = layers.concatenate(encoded_features)
x = layers.Dense(32, activation='relu')(all_features)
x = layers.Dropout(0.5)(x)
output = layers.Dense(1)(x)
model = tf.keras.Model(inputs=all_inputs, outputs=output)

# The output layer has no activation, so the loss is told to expect logits.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [23]:
# Draw the model graph with tensor shapes, laid out left to right. Needs
# pydot and graphviz installed (see the message printed below).
tf.keras.utils.plot_model(model, show_shapes=True, rankdir='LR')

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


### Train model

In [24]:
# Train for 10 epochs on the training dataset, passing val_ds as validation data.
model.fit(train_ds, epochs=10, validation_data=val_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x233750e62b0>

In [25]:
# Evaluate on the held-out test dataset; the model was compiled with the
# 'accuracy' metric, so evaluate returns the loss followed by accuracy.
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.7430675625801086


### Inference on new data

In [26]:
# Save the trained model (preprocessing layers included) under
# 'my_pet_classifier', then load it back to check the saved copy is usable.
model.save('my_pet_classifier')
reloaded_model = tf.keras.models.load_model('my_pet_classifier')



INFO:tensorflow:Assets written to: my_pet_classifier\assets


INFO:tensorflow:Assets written to: my_pet_classifier\assets


To get a prediction for a new sample, you simply call model.predict. There are just two things you need to do:
1. Wrap scalars into a list so as to have a batch dimension (models only process batches of data, not single samples)
2. Call convert_to_tensor on each feature

In [29]:
# One pet, described with the same feature names the model was built on.
sample = {
    'Type':'Cat',
    'Age': 3,
    'Breed1':'Tabby',
    'Gender':'Male',
    'Color1':'Black',
    'Color2':'White',
    'MaturitySize':'Small',
    'FurLength':'Short',
    'Vaccinated':'No',
    'Sterilized':'No',
    'Health':'Healthy',
    'Fee':100,
    'PhotoAmt':2,
}

# Wrap every scalar in a one-element list so each feature has a batch dimension.
input_dict = {name:tf.convert_to_tensor([value]) for name, value in sample.items()}
predictions = reloaded_model.predict(input_dict)

# The model outputs logits (it was compiled with from_logits=True), so apply
# a sigmoid to the first example's output to get a probability.
prob = tf.nn.sigmoid(predictions[0])

# prob has shape (1,). Take its single element explicitly instead of relying
# on implicit conversion of a one-element tensor to a scalar, which newer
# NumPy versions deprecate.
print('This particular pet has a %.1f percent probability of getting adopted'%(100*float(prob[0])))

This particular pet has a 77.4 percent probability of getting adopted
