In [1]:
from tensorflow import keras
import tensorflow as tf

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random
import tqdm
import time

SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [2]:
train_df = pd.read_csv(
    "./data/train_data.txt",
    engine="python",
    sep=" ::: ",
    names=["id", "movie", "genre", "summary"],
)

test_df = pd.read_csv(
    "./data/test_data_solution.txt",
    engine="python",
    sep=" ::: ",
    names=["id", "movie", "genre", "summary"],
)

In [3]:
# Viewing training data
train_df.head()

Unnamed: 0,id,movie,genre,summary
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


In [4]:
# Split the data using train_test_split from sklearn
train_shuffled = train_df.sample(frac=1)
train_df_new, val_df = train_test_split(train_shuffled, test_size=0.1)

print(f"Number of training samples: {len(train_df_new)}.")
print(f"Number of validation samples: {len(val_df)}.")
print(f"Number of test examples: {len(test_df)}.")

Number of training samples: 48792.
Number of validation samples: 5422.
Number of test examples: 54200.


In [5]:
text_vectorizer = keras.layers.TextVectorization()
text_vectorizer.adapt(train_df_new["summary"])

2021-12-20 11:02:20.802559: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
for i in range(10):
    print(text_vectorizer(train_df_new["summary"][i]).shape)

(92,)
(32,)
(113,)
(191,)
(106,)
(258,)
(63,)
(59,)
(82,)
(66,)


In [7]:
train_df_new["total_words"] = train_df_new["summary"].str.split().str.len()
vocabulary_size = train_df_new["total_words"].max()
vocabulary_size

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_new["total_words"] = train_df_new["summary"].str.split().str.len()


1829

In [8]:
label_encoder = keras.layers.StringLookup(vocabulary=train_df_new["genre"].unique())
label_encoder.get_vocabulary()

['[UNK]',
 'short',
 'sci-fi',
 'documentary',
 'drama',
 'thriller',
 'comedy',
 'adult',
 'romance',
 'adventure',
 'western',
 'family',
 'talk-show',
 'news',
 'horror',
 'history',
 'music',
 'sport',
 'war',
 'animation',
 'game-show',
 'action',
 'crime',
 'reality-tv',
 'mystery',
 'musical',
 'fantasy',
 'biography']

In [9]:
batch_size = 32

def preprocess_single_row(summary, label):
    summary = text_vectorizer(summary)
    label = label_encoder(label)
    return summary, label


def prepare_dataset(dataframe):
    dataset = tf.data.Dataset.from_tensor_slices(
        (dataframe["summary"].values, dataframe["genre"].values)
    )
    dataset = dataset.map(preprocess_single_row, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.bucket_by_sequence_length(
        element_length_func=lambda sequence, label: tf.shape(sequence)[0],
        bucket_boundaries=[vocabulary_size],
        bucket_batch_sizes=[batch_size, batch_size],
    )
    return dataset.prefetch(tf.data.AUTOTUNE)

In [10]:
training_dataset = prepare_dataset(train_df_new)
validation_dataset = prepare_dataset(val_df)
test_dataset = prepare_dataset(test_df)


for sample_batch in training_dataset.take(10):
    print(sample_batch[0].shape)
    print(sample_batch[1].shape)

(32, 322)
(32,)
(32, 273)
(32,)
(32, 408)
(32,)
(32, 250)
(32,)
(32, 461)
(32,)
(32, 335)
(32,)
(32, 336)
(32,)
(32, 331)
(32,)
(32, 292)
(32,)
(32, 276)
(32,)


In [11]:
def make_model():
    inputs = keras.Input(shape=(None,), dtype="int64")
    x = keras.layers.Embedding(
        input_dim=text_vectorizer.vocabulary_size(), output_dim=16
    )(inputs)
    x = keras.layers.GlobalAveragePooling1D()(x)
    x = keras.layers.Dense(512, activation="relu")(x)
    x = keras.layers.Dense(256, activation="relu")(x)
    outputs = keras.layers.Dense(label_encoder.vocabulary_size(), activation="softmax")(
        x
    )
    shallow_mlp_model = keras.Model(inputs, outputs)
    return shallow_mlp_model

In [12]:
make_model().summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 16)          2270112   
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 512)               8704      
                                                                 
 dense_1 (Dense)             (None, 256)               131328    
                                                                 
 dense_2 (Dense)             (None, 28)                7196      
                                                             

In [13]:
epochs = 5

shallow_mlp_model = make_model()
shallow_mlp_model.compile(
    loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)

start_time = time.time()
history = shallow_mlp_model.fit(
    training_dataset, validation_data=validation_dataset, epochs=epochs
)
end_time = time.time()
print(f"Model took {(end_time - start_time):.2f} seconds to train.")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model took 128.98 seconds to train.


In [14]:
_, accuracy = shallow_mlp_model.evaluate(test_dataset)
print(f"Top-1 accuracy on the test set: {round(accuracy * 100, 2)}%.")

Top-1 accuracy on the test set: 49.63%.


In [16]:
text_vectorizer = keras.layers.TextVectorization(output_sequence_length=1829)
text_vectorizer.adapt(train_df_new["summary"])


def preprocess_fixed_length(summary, label):
    summary = text_vectorizer(summary)
    label = label_encoder(label)
    return summary, label


def prepare_dataset_fixed_length(dataframe):
    dataset = tf.data.Dataset.from_tensor_slices(
        (dataframe["summary"].values, dataframe["genre"].values)
    )
    dataset = dataset.map(preprocess_fixed_length, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.batch(batch_size)
    return dataset.prefetch(tf.data.AUTOTUNE)

If we set `output_sequence_length=vocabulary_size` then it leads to:

> ValueError: `output_sequence_length` must be either None or an integer when `output_mode` is 'int'. Received: output_sequence_length=1829

In [17]:
training_dataset = prepare_dataset_fixed_length(train_df_new)
validation_dataset = prepare_dataset_fixed_length(val_df)
test_dataset = prepare_dataset_fixed_length(test_df)


for sample_batch in training_dataset.take(10):
    print(sample_batch[0].shape)
    print(sample_batch[1].shape)

(32, 1829)
(32,)
(32, 1829)
(32,)
(32, 1829)
(32,)
(32, 1829)
(32,)
(32, 1829)
(32,)
(32, 1829)
(32,)
(32, 1829)
(32,)
(32, 1829)
(32,)
(32, 1829)
(32,)
(32, 1829)
(32,)


In [18]:
shallow_mlp_model = make_model()
shallow_mlp_model.compile(
    loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)

start_time = time.time()
history = shallow_mlp_model.fit(
    training_dataset, validation_data=validation_dataset, epochs=epochs
)
end_time = time.time()
print(f"Model took {(end_time - start_time):.2f} seconds to train.")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model took 141.83 seconds to train.


In [19]:
_, accuracy = shallow_mlp_model.evaluate(test_dataset)
print(f"Top-1 accuracy on the test set: {round(accuracy * 100, 2)}%.")

Top-1 accuracy on the test set: 44.91%.
