## Setup

In [1]:
from tensorflow import keras
import tensorflow as tf

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random
import tqdm

SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

## Data loading

Data comes from here: https://www.kaggle.com/hijest/genre-classification-dataset-imdb.

In [2]:
train_df = pd.read_csv(
    "./data/train_data.txt",
    engine="python",
    sep=" ::: ",
    names=["id", "movie", "genre", "summary"],
)

test_df = pd.read_csv(
    "./data/test_data_solution.txt",
    engine="python",
    sep=" ::: ",
    names=["id", "movie", "genre", "summary"],
)

In [3]:
# Viewing training data
train_df.head()

Unnamed: 0,id,movie,genre,summary
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


## Data splitting

In [4]:
# Split the data using train_test_split from sklearn
train_shuffled = train_df.sample(frac=1)
train_df_new, val_df = train_test_split(train_shuffled, test_size=0.1)

print(f"Number of training samples: {len(train_df_new)}.")
print(f"Number of validation samples: {len(val_df)}.")
print(f"Number of test examples: {len(test_df)}.")

Number of training samples: 48792.
Number of validation samples: 5422.
Number of test examples: 54200.


## Using [`bucket_by_sequence_length()`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#bucket_by_sequence_length)

In [5]:
passage = train_df_new["summary"][0]
word_splits = tf.strings.split(passage, sep=" ")
tf.shape(word_splits)[0]

2021-12-19 15:55:28.592850: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


<tf.Tensor: shape=(), dtype=int32, numpy=92>

In [6]:
dataset = tf.data.Dataset.from_tensor_slices(train_df_new["summary"])

for sample in dataset.take(1):
    print(sample)

tf.Tensor(b"On a lonely stretch of a highway, Ronit - pulls up in to a desolate pump to fill petrol. The attendant informs him that his car's fan belt is broken and a new one will only arrive in the morning. Stuck in the middle of nowhere, Ronit prepares to stake the night out in his car. When another car pulls in. The driver is a dignified, well-spoken man who lives a few miles away. He offers to house Ronit for the night, promising to drop him back in the morning. Ronit agrees, believing there is a god. But then there is also the devil.", shape=(), dtype=string)


In [7]:
label_encoder = keras.layers.StringLookup(vocabulary=train_df_new["genre"].unique())
label_encoder.get_vocabulary()

['[UNK]',
 'short',
 'sci-fi',
 'documentary',
 'drama',
 'thriller',
 'comedy',
 'adult',
 'romance',
 'adventure',
 'western',
 'family',
 'talk-show',
 'news',
 'horror',
 'history',
 'music',
 'sport',
 'war',
 'animation',
 'game-show',
 'action',
 'crime',
 'reality-tv',
 'mystery',
 'musical',
 'fantasy',
 'biography']

In [8]:
train_df_new["total_words"] = train_df_new["summary"].str.split().str.len()
vocabulary_size = train_df_new["total_words"].max()
vocabulary_size

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_new["total_words"] = train_df_new["summary"].str.split().str.len()


1829

In [9]:
text_vectorizer = keras.layers.TextVectorization(ngrams=2, output_mode="tf_idf")

# `TextVectorization` layer needs to be adapted as per the vocabulary from our
# training set.
dataset_ = tf.data.Dataset.from_tensor_slices(
        (train_df_new["summary"].values, train_df_new["genre"].values)
    )
with tf.device("/CPU:0"):
    text_vectorizer.adapt(dataset_.map(lambda text, label: text))

In [10]:
def preprocess_batch(summary, label):
    summary = text_vectorizer(summary)
    label = label_encoder(label)
    return summary, label


def prepare_dataset(dataframe):
    dataset = tf.data.Dataset.from_tensor_slices(
        (dataframe["summary"].values, dataframe["genre"].values)
    )
    dataset = dataset.bucket_by_sequence_length(
        element_length_func=lambda elem, label: tf.shape(tf.strings.split(elem, sep=" "))[0],
        bucket_boundaries=[512],
        bucket_batch_sizes=[32, 32],
    )
    dataset = dataset.map(preprocess_batch, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.prefetch(tf.data.AUTOTUNE)

In [11]:
training_dataset = prepare_dataset(train_df_new)

In [12]:
for sample_batch in training_dataset.take(10):
    print(sample_batch[0].shape)
    print(sample_batch[1].shape)

(32, 1702942)
(32,)
(32, 1702942)
(32,)
(32, 1702942)
(32,)
(32, 1702942)
(32,)
(32, 1702942)
(32,)
(32, 1702942)
(32,)
(32, 1702942)
(32,)
(32, 1702942)
(32,)
(32, 1702942)
(32,)
(32, 1702942)
(32,)
