## Setup

In [None]:
!pip install -q wandb

In [None]:
!gdown --id 1CvkRnGC8b_-n1NcbwcwxcIq7SusmDMb5 -O train_data.txt
!gdown --id 1h1evGF5NVi2p8RoWxl8xhpOod0ZN_-ky -O test_data_solution.txt 

In [None]:
from tensorflow import keras
import tensorflow as tf

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random
import tqdm
import time
import wandb

## Constants

In [None]:
# Let tf.data pick parallelism and prefetch buffer sizes on its own.
AUTO = tf.data.AUTOTUNE

# Training configuration.
NUM_TRIALS = 5  # number of repeated training runs per experiment
EPOCHS = 60  # epochs passed to `fit` for every run
BATCH_SIZE = 128  # examples per batch in every dataset
LR_DECAY_PATIENCE = 5  # `patience` handed to ReduceLROnPlateau

# Train/validation split configuration.
SPLIT_SEED = 42  # `random_state` used when splitting the data
VAL_SPLIT = 0.1  # `test_size` handed to train_test_split

## Data loading

In [None]:
# Both files are parsed identically: fields are delimited by " ::: " and the
# column names are supplied explicitly. The multi-character separator is why
# the (slower) python parsing engine is requested.
column_names = ["id", "movie", "genre", "summary"]

train_df, test_df = (
    pd.read_csv(path, sep=" ::: ", names=column_names, engine="python")
    for path in ("train_data.txt", "test_data_solution.txt")
)

In [None]:
# Preview the first few rows of the training data
train_df.head(n=5)

## Data splitting

In [None]:
# Shuffle the training data, then carve out a validation partition with
# train_test_split from sklearn.
# The shuffle is seeded with SPLIT_SEED as well: an unseeded `sample` would
# reorder the rows differently on every run, so the seeded split below would
# still select different examples each time.
train_shuffled = train_df.sample(frac=1, random_state=SPLIT_SEED)
train_df_new, val_df = train_test_split(
    train_shuffled, test_size=VAL_SPLIT, random_state=SPLIT_SEED
)

print(f"Number of training samples: {len(train_df_new)}.")
print(f"Number of validation samples: {len(val_df)}.")
print(f"Number of test examples: {len(test_df)}.")

## Data preprocessing

In [None]:
# Learn the token vocabulary from the training summaries only. No output
# length is set here, so each summary keeps its own token count.
training_summaries = train_df_new["summary"]
text_vectorizer = keras.layers.TextVectorization()
text_vectorizer.adapt(training_summaries)

In [None]:
# Vectorize the first ten training summaries and print the resulting shapes
# to show that the sequence length varies from one summary to the next.
for summary_text in train_df_new["summary"].iloc[:10]:
    print(text_vectorizer(summary_text).shape)

In [None]:
# Count whitespace-separated words per summary, keep the counts as a column,
# and record the largest count as the maximum sequence length.
word_counts = train_df_new["summary"].str.split().str.len()
train_df_new["total_words"] = word_counts
max_seqlen = int(word_counts.max())
max_seqlen

In [None]:
# Build the genre -> integer lookup from the genres seen in the training split.
genre_vocabulary = train_df_new["genre"].unique()
label_encoder = keras.layers.StringLookup(vocabulary=genre_vocabulary)
label_encoder.get_vocabulary()

In [None]:
def preprocess_single_row(summary, label):
    """Convert one raw (summary, genre) pair into model-ready tensors.

    Args:
        summary: Summary text, passed through the module-level
            `text_vectorizer`.
        label: Genre string, passed through the module-level `label_encoder`.

    Returns:
        A `(vectorized_summary, encoded_label)` tuple.
    """
    return text_vectorizer(summary), label_encoder(label)


def prepare_dataset(dataframe):
    """Build a padded-batch `tf.data` pipeline from a dataframe.

    Args:
        dataframe: Frame with "summary" and "genre" columns.

    Returns:
        A prefetched dataset of `(summaries, labels)` batches of up to
        BATCH_SIZE examples. `padded_batch` is used, so the sequences in a
        batch are padded to a common length per batch.
    """
    summaries = dataframe["summary"].values
    genres = dataframe["genre"].values
    return (
        tf.data.Dataset.from_tensor_slices((summaries, genres))
        .map(preprocess_single_row, num_parallel_calls=AUTO)
        .padded_batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

In [None]:
# Build the three padded-batch pipelines in train / validation / test order.
training_dataset, validation_dataset, test_dataset = (
    prepare_dataset(frame) for frame in (train_df_new, val_df, test_df)
)

# Inspect the first ten training batches: print the shape of the summaries
# tensor followed by the shape of the labels tensor.
for summaries, labels in training_dataset.take(10):
    print(summaries.shape)
    print(labels.shape)

## Model utilities

In [None]:
def make_model(use_gru=False, *, embedding_dim=16, gru_units=8, mask_padding=False):
    """Build an (uncompiled) genre classifier over integer token sequences.

    The network embeds the token ids, reduces the sequence to a single vector
    (mean pooling or a bidirectional GRU), and classifies it with two ReLU
    dense layers followed by a softmax over the label vocabulary.

    Args:
        use_gru: If True, summarize the sequence with a bidirectional GRU;
            otherwise average the token embeddings over the sequence axis.
        embedding_dim: Size of each token embedding vector.
        gru_units: Number of units of the GRU in each direction (only used
            when `use_gru` is True).
        mask_padding: If True, the embedding layer is created with
            `mask_zero=True` so that downstream layers that support masking
            ignore positions holding token id 0. This assumes id 0 is the
            padding value produced by the input pipeline. Defaults to False,
            which keeps padded positions in the computation exactly as before.

    Returns:
        A `keras.Model` mapping a `(batch, sequence_length)` int64 tensor to
        per-class probabilities. The sequence length is left unspecified so
        batches of different padded lengths are accepted.
    """
    inputs = keras.Input(shape=(None,), dtype="int64")
    x = keras.layers.Embedding(
        input_dim=text_vectorizer.vocabulary_size(),
        output_dim=embedding_dim,
        mask_zero=mask_padding,
    )(inputs)

    if use_gru:
        x = keras.layers.Bidirectional(keras.layers.GRU(gru_units))(x)
    else:
        x = keras.layers.GlobalAveragePooling1D()(x)
    x = keras.layers.Dense(512, activation="relu")(x)
    x = keras.layers.Dense(256, activation="relu")(x)
    # One output unit per entry of the label vocabulary.
    outputs = keras.layers.Dense(label_encoder.vocabulary_size(), activation="softmax")(
        x
    )
    return keras.Model(inputs, outputs)

In [None]:
# Build a throwaway model with the default arguments to inspect its layers.
preview_model = make_model()
preview_model.summary()

## Training and evaluation with smart batching

In [None]:
# Repeat the experiment NUM_TRIALS times. Every trial trains a freshly built
# model and is tracked as its own Weights & Biases run.
for i in range(NUM_TRIALS):
    wandb.init(
        project="smart-batching-simpler-models",
        entity="carted",
        name=f"smart-batching-run-{i}",
    )
    shallow_mlp_model = make_model()
    shallow_mlp_model.compile(
        loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
    )
    callbacks = [
        keras.callbacks.ReduceLROnPlateau(patience=LR_DECAY_PATIENCE),
        wandb.keras.WandbCallback(),
    ]

    # Time the training with time.perf_counter(): it is monotonic, so the
    # measured duration cannot be distorted by system clock adjustments the
    # way a time.time() difference can.
    start_time = time.perf_counter()
    history = shallow_mlp_model.fit(
        training_dataset,
        validation_data=validation_dataset,
        epochs=EPOCHS,
        callbacks=callbacks,
    )
    end_time = time.perf_counter()
    training_time = end_time - start_time
    print(f"Model took {training_time:.2f} seconds to train.")
    wandb.log({"model_training_time_secs": training_time})

    # `evaluate` returns the loss followed by the compiled metrics; only the
    # accuracy is reported.
    _, accuracy = shallow_mlp_model.evaluate(test_dataset)
    print(f"Top-1 accuracy on the test set: {(accuracy * 100):0.2f}%.")
    wandb.log({"top_1_accuracy_test_set": accuracy})

    wandb.finish()

### With GRU

In [None]:
# Repeat the GRU experiment NUM_TRIALS times. Every trial trains a freshly
# built model and is tracked as its own Weights & Biases run.
for i in range(NUM_TRIALS):
    wandb.init(
        project="smart-batching-simpler-models",
        entity="carted",
        name=f"smart-batching-with-gru-run-{i}",
    )
    shallow_mlp_model = make_model(use_gru=True)
    shallow_mlp_model.compile(
        loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
    )
    callbacks = [
        keras.callbacks.ReduceLROnPlateau(patience=LR_DECAY_PATIENCE),
        wandb.keras.WandbCallback(),
    ]

    # Time the training with time.perf_counter(): it is monotonic, so the
    # measured duration cannot be distorted by system clock adjustments the
    # way a time.time() difference can.
    start_time = time.perf_counter()
    history = shallow_mlp_model.fit(
        training_dataset,
        validation_data=validation_dataset,
        epochs=EPOCHS,
        callbacks=callbacks,
    )
    end_time = time.perf_counter()
    training_time = end_time - start_time
    print(f"Model took {training_time:.2f} seconds to train.")
    wandb.log({"model_training_time_secs": training_time})

    # `evaluate` returns the loss followed by the compiled metrics; only the
    # accuracy is reported.
    _, accuracy = shallow_mlp_model.evaluate(test_dataset)
    print(f"Top-1 accuracy on the test set: {(accuracy * 100):0.2f}%.")
    wandb.log({"top_1_accuracy_test_set": accuracy})

    wandb.finish()

## Training and evaluation with fixed-length padding

In [None]:
# Rebuild the vectorizer so that every summary is emitted with exactly
# `max_seqlen` tokens. NOTE: this rebinds the module-level `text_vectorizer`
# name, so functions that look it up afterwards see the fixed-length layer.
fixed_length_corpus = train_df_new["summary"]
text_vectorizer = keras.layers.TextVectorization(output_sequence_length=max_seqlen)
text_vectorizer.adapt(fixed_length_corpus)


def preprocess_fixed_length(summary, label):
    """Convert one raw (summary, genre) pair into model-ready tensors.

    Args:
        summary: Summary text, passed through the module-level
            `text_vectorizer`.
        label: Genre string, passed through the module-level `label_encoder`.

    Returns:
        A `(vectorized_summary, encoded_label)` tuple.
    """
    return text_vectorizer(summary), label_encoder(label)


def prepare_dataset_fixed_length(dataframe):
    """Build a plain-batch `tf.data` pipeline from a dataframe.

    Args:
        dataframe: Frame with "summary" and "genre" columns.

    Returns:
        A prefetched dataset of `(summaries, labels)` batches of up to
        BATCH_SIZE examples. Plain `batch` is used (no per-batch padding), so
        the mapped summaries must already share a common length.
    """
    summaries = dataframe["summary"].values
    genres = dataframe["genre"].values
    return (
        tf.data.Dataset.from_tensor_slices((summaries, genres))
        .map(preprocess_fixed_length, num_parallel_calls=AUTO)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

In [None]:
# Rebuild the three pipelines (train / validation / test order) with
# fixed-length inputs; this rebinds the dataset names used by the training
# loops.
training_dataset, validation_dataset, test_dataset = (
    prepare_dataset_fixed_length(frame) for frame in (train_df_new, val_df, test_df)
)

# Inspect the first ten training batches: print the shape of the summaries
# tensor followed by the shape of the labels tensor.
for summaries, labels in training_dataset.take(10):
    print(summaries.shape)
    print(labels.shape)

In [None]:
# Repeat the fixed-length-padding experiment NUM_TRIALS times. Every trial
# trains a freshly built model and is tracked as its own Weights & Biases run.
for i in range(NUM_TRIALS):
    wandb.init(
        project="smart-batching-simpler-models",
        entity="carted",
        name=f"fixed-length-padding-run-{i}",
    )
    shallow_mlp_model = make_model()
    shallow_mlp_model.compile(
        loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
    )
    callbacks = [
        keras.callbacks.ReduceLROnPlateau(patience=LR_DECAY_PATIENCE),
        wandb.keras.WandbCallback(),
    ]

    # Time the training with time.perf_counter(): it is monotonic, so the
    # measured duration cannot be distorted by system clock adjustments the
    # way a time.time() difference can.
    start_time = time.perf_counter()
    history = shallow_mlp_model.fit(
        training_dataset,
        validation_data=validation_dataset,
        epochs=EPOCHS,
        callbacks=callbacks,
    )
    end_time = time.perf_counter()
    training_time = end_time - start_time
    print(f"Model took {training_time:.2f} seconds to train.")
    wandb.log({"model_training_time_secs": training_time})

    # `evaluate` returns the loss followed by the compiled metrics; only the
    # accuracy is reported.
    _, accuracy = shallow_mlp_model.evaluate(test_dataset)
    print(f"Top-1 accuracy on the test set: {(accuracy * 100):0.2f}%.")
    wandb.log({"top_1_accuracy_test_set": accuracy})

    wandb.finish()

### With GRU

In [None]:
# Repeat the fixed-length-padding GRU experiment NUM_TRIALS times. Every trial
# trains a freshly built model and is tracked as its own Weights & Biases run.
for i in range(NUM_TRIALS):
    wandb.init(
        project="smart-batching-simpler-models",
        entity="carted",
        name=f"fixed-length-padding-with-gru-run-{i}",
    )
    shallow_mlp_model = make_model(use_gru=True)
    shallow_mlp_model.compile(
        loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
    )
    callbacks = [
        keras.callbacks.ReduceLROnPlateau(patience=LR_DECAY_PATIENCE),
        wandb.keras.WandbCallback(),
    ]

    # Time the training with time.perf_counter(): it is monotonic, so the
    # measured duration cannot be distorted by system clock adjustments the
    # way a time.time() difference can.
    start_time = time.perf_counter()
    history = shallow_mlp_model.fit(
        training_dataset,
        validation_data=validation_dataset,
        epochs=EPOCHS,
        callbacks=callbacks,
    )
    end_time = time.perf_counter()
    training_time = end_time - start_time
    print(f"Model took {training_time:.2f} seconds to train.")
    wandb.log({"model_training_time_secs": training_time})

    # `evaluate` returns the loss followed by the compiled metrics; only the
    # accuracy is reported.
    _, accuracy = shallow_mlp_model.evaluate(test_dataset)
    print(f"Top-1 accuracy on the test set: {(accuracy * 100):0.2f}%.")
    wandb.log({"top_1_accuracy_test_set": accuracy})

    wandb.finish()