In [52]:
import pandas as pd
import pprint
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras import utils
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization


In [53]:
csv_file = tf.keras.utils.get_file('hyad.csv', 'file:///tf/notebooks/hyad.csv')

In [54]:
df = pd.read_csv(csv_file)

In [55]:
df.head()

Unnamed: 0,Item,Category,Subcategory
0,"Store Brand Chunky Cinnamon Bread, 1 each",Bakery,Bread
1,"Store Brand French Bread, 16 oz",Bakery,Bread
2,"Store Brand Rye Bread, 1 each",Bakery,Bread
3,"Store Brand Vienna Bread, 16 oz",Bakery,Bread
4,"Store Brand Wheat Bread, 16 oz",Bakery,Bread


In [56]:
# we use a CsvDataset to create the dataset without creating this all on the file system
# as the example shows
tf_csv = tf.data.experimental.CsvDataset(
    ['file:///tf/notebooks/hyad.csv'], 
    record_defaults=["", "", ""], 
    compression_type=None, 
    buffer_size=None, 
    header=True, 
    field_delim=',',
    use_quote_delim=True
)

In [57]:
# TODO: I need to assign numbers for each unique category, then store it for safe keeping

# get all unique categories
ALL_CATEGORIES_SUB_CATEGORIES = set([])
for element in tf_csv.as_numpy_iterator():
    ALL_CATEGORIES_SUB_CATEGORIES.add(f"{element[1].decode('utf-8')}|{element[2].decode('utf-8')}")

# now create a dict of String to Int
ALL_CAT_SUB_CAT_TO_INT = { cat: i for (i, cat) in enumerate(ALL_CATEGORIES_SUB_CATEGORIES)}

# pprint.pprint(ALL_CAT_SUB_CAT_TO_INT)

In [58]:
# convert tf_csv to a numpy 
numpy_slices = list(tf_csv.as_numpy_iterator())

def fix(x):
    key = "|".join([x[1].decode('utf-8'), x[2].decode('utf-8')])
    return ( x[0], ALL_CAT_SUB_CAT_TO_INT[key] )
# map this into a new numpy_slices with the numeric code for the category
new_numpy_slices = list(map(fix , numpy_slices))

text_list = list(map(lambda x: x[0], new_numpy_slices))
label_list = list(map(lambda x: x[1], new_numpy_slices))

label_max_val = max(label_list)
print(f"Max label value: {label_max_val}")

text_ds = tf.data.Dataset.from_tensor_slices(text_list)
label_ds = tf.data.Dataset.from_tensor_slices(label_list)
zipped_ds = tf.data.Dataset.zip((text_ds, label_ds))

Max label value: 221


In [59]:
# first we want to shuffle the dataset so that we have a good batch of different items
# we'll shuffle using a seed of 3, and every time repeat() is run we'll get another shuffle
# tf_csv = tf_csv.shuffle(3, reshuffle_each_iteration=True)

# create our train, validate, and test buckets
train_ds = zipped_ds.batch(500).take(320)
validate_ds = zipped_ds.batch(500).skip(320).take(80)
test_ds = zipped_ds.batch(500).skip(400).take(100)

In [60]:
for item_batch, cat_id_batch in train_ds.take(1):
    for i in range(10):
        print("Item: ", item_batch.numpy()[i])
        print("Category ID: ", cat_id_batch.numpy()[i])


Item:  b'Store Brand Chunky Cinnamon Bread, 1 each'
Category ID:  15
Item:  b'Store Brand French Bread, 16 oz'
Category ID:  15
Item:  b'Store Brand Rye Bread, 1 each'
Category ID:  15
Item:  b'Store Brand Vienna Bread, 16 oz'
Category ID:  15
Item:  b'Store Brand Wheat Bread, 16 oz'
Category ID:  15
Item:  b'Store Brand White Bread, 1 each'
Category ID:  15
Item:  b'Store Brand Shells 4 cnt Dessert, 3.25 oz'
Category ID:  212
Item:  b'Store Brand Angel Food Cake, 13 oz'
Category ID:  212
Item:  b'Store Brand Brownies, 1 each'
Category ID:  212
Item:  b'Store Brand Cookies Chocolate Chip 12 cnt, 1 each'
Category ID:  212


In [61]:
# take 1 - use a bag-of-words model
VOCAB_SIZE = 10000

# we create a binary layer based on the vocab
binary_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='binary')

# we create another layer based on the sequence length
MAX_SEQUENCE_LENGTH = 250
int_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH)

In [62]:
# train the layers for just category
train_text = train_ds.take(1).map(lambda item, cat_id: item)

binary_vectorize_layer.adapt(train_text)
int_vectorize_layer.adapt(train_text)

In [63]:
def binary_vectorize_text(text, cat):
  text = tf.expand_dims(text, -1)
  return binary_vectorize_layer(text), cat

def int_vectorize_text(text, cat):
  text = tf.expand_dims(text, -1)
  return int_vectorize_layer(text), cat

In [64]:
# Retrieve a batch (of 32 reviews and labels) from the dataset
text_batch, cat_batch = next(iter(train_ds))
first_item, first_cat = text_batch[0], cat_batch[0]
print("Item", first_item)
print("Category", first_cat)

Item tf.Tensor(b'Store Brand Chunky Cinnamon Bread, 1 each', shape=(), dtype=string)
Category tf.Tensor(15, shape=(), dtype=int32)


In [65]:
print("'binary' vectorization label: ", binary_vectorize_text(first_item, first_cat)[0])
print("'int' vectorization label: ", int_vectorize_text(first_item, first_cat)[0])

'binary' vectorization label:  tf.Tensor([[0. 0. 0. ... 0. 0. 0.]], shape=(1, 10000), dtype=float32)
'int' vectorization label:  tf.Tensor(
[[  5   6 630  46  12  27  37   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0 

In [66]:
print("9 ---> ", int_vectorize_layer.get_vocabulary()[9])
print("3 ---> ", int_vectorize_layer.get_vocabulary()[3])
print("Vocabulary size: {}".format(len(int_vectorize_layer.get_vocabulary())))

9 --->  juice
3 --->  12
Vocabulary size: 727


In [67]:
# Apply layers to the three buckets of data
binary_train_ds = train_ds.map(binary_vectorize_text)
binary_val_ds = validate_ds.map(binary_vectorize_text)
binary_test_ds = test_ds.map(binary_vectorize_text)

int_train_ds = train_ds.map(int_vectorize_text)
int_val_ds = validate_ds.map(int_vectorize_text)
int_test_ds = test_ds.map(int_vectorize_text)

In [68]:
# performance tuning
AUTOTUNE = tf.data.AUTOTUNE

def configure_dataset(dataset):
  return dataset.cache().prefetch(buffer_size=AUTOTUNE)

binary_train_ds = configure_dataset(binary_train_ds)
binary_val_ds = configure_dataset(binary_val_ds)
binary_test_ds = configure_dataset(binary_test_ds)

int_train_ds = configure_dataset(int_train_ds)
int_val_ds = configure_dataset(int_val_ds)
int_test_ds = configure_dataset(int_test_ds)

In [70]:
# Train a simple bag-of-words linear model
binary_model = tf.keras.Sequential([layers.Dense(label_max_val + 1)])
binary_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'])
history = binary_model.fit(
    binary_train_ds, validation_data=binary_val_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
