In [50]:
import pandas as pd

import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras import utils
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization


In [51]:
csv_file = tf.keras.utils.get_file('hyad.csv', 'file:///tf/notebooks/hyad.csv')

In [52]:
df = pd.read_csv(csv_file)

In [53]:
df.head()

Unnamed: 0,Item,Category,Subcategory
0,"Store Brand Chunky Cinnamon Bread, 1 each",Bakery,Bread
1,"Store Brand French Bread, 16 oz",Bakery,Bread
2,"Store Brand Rye Bread, 1 each",Bakery,Bread
3,"Store Brand Vienna Bread, 16 oz",Bakery,Bread
4,"Store Brand Wheat Bread, 16 oz",Bakery,Bread


In [54]:
# we use a CsvDataset to create the dataset without creating this all on the file system
# as the example shows
tf_csv = tf.data.experimental.CsvDataset(
    ['file:///tf/notebooks/hyad.csv'], 
    record_defaults=["", "", ""], 
    compression_type=None, 
    buffer_size=None, 
    header=True, 
    field_delim=',',
    use_quote_delim=True
)

In [72]:
# TODO: I need to assign numbers for each unique category, then store it for safe keeping

# get all unique categories
ALL_CATEGORIES_SUB_CATEGORIES = set([])
for element in tf_csv.as_numpy_iterator():
    ALL_CATEGORIES_SUB_CATEGORIES.add(f"{element[1]}|{element[2]}")

# now create a dict of String to Int
ALL_CAT_SUB_CAT_TO_INT = { cat: i for (i, cat) in enumerate(ALL_CATEGORIES_SUB_CATEGORIES)}


In [79]:
# now apply the mapping to create a new numeric dataset
@tf.function
def convert_to_item_number(item_tensor, cat_tensor, sub_cat_tensor):
    
    print(item_tensor)
    print(cat_tensor)
    print(sub_cat_tensor)
    
    item = item_tensor.numpy()
    cat = cat_tensor.numpy()
    sub_cat = sub_cat_tensor.numpy()
    
    print(item)
    print(cat)
    print(sub_cat)
    ##use x to derive additional columns u want. Set the shape as well
    y = {}
    key = f"{cat}|{sub_cat}"
    print(key)
    y['item'] = item
    y['label'] = ALL_CAT_SUB_CAT_TO_INT[key]
    return y

new_dataset = tf_csv.map(convert_to_item_number)

Tensor("item_tensor:0", shape=(), dtype=string)
Tensor("cat_tensor:0", shape=(), dtype=string)
Tensor("sub_cat_tensor:0", shape=(), dtype=string)
()
()
()
()|()


KeyError: in user code:

    <ipython-input-79-4091b80b6e76>:21 convert_to_item_number  *
        y['label'] = ALL_CAT_SUB_CAT_TO_INT[key]

    KeyError: '()|()'


In [55]:
# first we want to shuffle the dataset so that we have a good batch of different items
# we'll shuffle using a seed of 3, and every time repeat() is run we'll get another shuffle
tf_csv = tf_csv.shuffle(3, reshuffle_each_iteration=True)

# create our train, validate, and test buckets
train_ds = tf_csv.batch(500).take(320)
validate_ds = tf_csv.batch(500).skip(320).take(80)
test_ds = tf_csv.batch(500).skip(400).take(100)

In [56]:
for item_batch, cat_batch, sub_cat_batch in train_ds.take(1):
    for i in range(10):
        print("Item: ", item_batch.numpy()[i])
        print("Category: ", cat_batch.numpy()[i])
        print("Subcategory: ", sub_cat_batch.numpy()[i])

Item:  b'Store Brand Chunky Cinnamon Bread, 1 each'
Category:  b'Bakery'
Subcategory:  b'Bread'
Item:  b'Store Brand Rye Bread, 1 each'
Category:  b'Bakery'
Subcategory:  b'Bread'
Item:  b'Store Brand French Bread, 16 oz'
Category:  b'Bakery'
Subcategory:  b'Bread'
Item:  b'Store Brand Vienna Bread, 16 oz'
Category:  b'Bakery'
Subcategory:  b'Bread'
Item:  b'Store Brand Wheat Bread, 16 oz'
Category:  b'Bakery'
Subcategory:  b'Bread'
Item:  b'Store Brand Shells 4 cnt Dessert, 3.25 oz'
Category:  b'Bakery'
Subcategory:  b'Cookies, Cakes & Brownies'
Item:  b'Store Brand White Bread, 1 each'
Category:  b'Bakery'
Subcategory:  b'Bread'
Item:  b'Store Brand Brownies, 1 each'
Category:  b'Bakery'
Subcategory:  b'Cookies, Cakes & Brownies'
Item:  b'Store Brand Angel Food Cake, 13 oz'
Category:  b'Bakery'
Subcategory:  b'Cookies, Cakes & Brownies'
Item:  b'Store Brand M&M Cookies, 12 each'
Category:  b'Bakery'
Subcategory:  b'Cookies, Cakes & Brownies'


In [57]:
# take 1 - use a bag-of-words model
VOCAB_SIZE = 10000

# we create a binary layer based on the vocab
binary_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='binary')

# we create another layer based on the sequence length
MAX_SEQUENCE_LENGTH = 250
int_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH)

In [58]:
# train the layers for just category
train_text = train_ds.take(1).map(lambda cat, sub_cat, category: cat)
binary_vectorize_layer.adapt(train_text)
int_vectorize_layer.adapt(train_text)

In [59]:
def binary_vectorize_text(text, cat, sub_cat):
  text = tf.expand_dims(text, -1)
  return binary_vectorize_layer(text), cat

def int_vectorize_text(text, cat, sub_cat):
  text = tf.expand_dims(text, -1)
  return int_vectorize_layer(text), cat

In [60]:
# Retrieve a batch (of 32 reviews and labels) from the dataset
text_batch, cat_batch, sub_cat_batch = next(iter(train_ds))
first_item, first_cat, first_sub_cat = text_batch[0], cat_batch[0], sub_cat_batch[0]
print("Item", first_item)
print("Category", first_cat)

Item tf.Tensor(b'Store Brand French Bread, 16 oz', shape=(), dtype=string)
Category tf.Tensor(b'Bakery', shape=(), dtype=string)


In [61]:
print("'binary' vectorization label: ", binary_vectorize_text(first_item, first_cat, first_sub_cat)[0])
print("'int' vectorization label: ", int_vectorize_text(first_item, first_cat, first_sub_cat)[0])

'binary' vectorization label:  tf.Tensor([[0. 1. 0. ... 0. 0. 0.]], shape=(1, 10000), dtype=float32)
'int' vectorization label:  tf.Tensor(
[[  5   6 273  12  13   2   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0 

In [62]:
print("9 ---> ", int_vectorize_layer.get_vocabulary()[9])
print("3 ---> ", int_vectorize_layer.get_vocabulary()[3])
print("Vocabulary size: {}".format(len(int_vectorize_layer.get_vocabulary())))

9 --->  juice
3 --->  12
Vocabulary size: 730


In [63]:
# Apply layers to the three buckets of data
binary_train_ds = train_ds.map(binary_vectorize_text)
binary_val_ds = validate_ds.map(binary_vectorize_text)
binary_test_ds = test_ds.map(binary_vectorize_text)

int_train_ds = train_ds.map(int_vectorize_text)
int_val_ds = validate_ds.map(int_vectorize_text)
int_test_ds = test_ds.map(int_vectorize_text)

In [64]:
# performance tuning
AUTOTUNE = tf.data.AUTOTUNE

def configure_dataset(dataset):
  return dataset.cache().prefetch(buffer_size=AUTOTUNE)

binary_train_ds = configure_dataset(binary_train_ds)
binary_val_ds = configure_dataset(binary_val_ds)
binary_test_ds = configure_dataset(binary_test_ds)

int_train_ds = configure_dataset(int_train_ds)
int_val_ds = configure_dataset(int_val_ds)
int_test_ds = configure_dataset(int_test_ds)

In [65]:
# Train a simple bag-of-words linear model
binary_model = tf.keras.Sequential([layers.Dense(4)])
binary_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'])
history = binary_model.fit(
    binary_train_ds, validation_data=binary_val_ds, epochs=10)

Epoch 1/10


UnimplementedError:  Cast string to float is not supported
	 [[node sparse_categorical_crossentropy/Cast (defined at <ipython-input-65-0f89d3313c30>:8) ]] [Op:__inference_train_function_3501]

Function call stack:
train_function
