## References

* https://www.kaggle.com/rohitganji13/film-genre-classification-using-nlp
* Internal (Carted) TFRecord utilities contributed by [Nilabhra Roy Chowdhury](https://www.linkedin.com/in/nilabhraroychowdhury/)

## Setup

In [1]:
import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from typing import Callable, Tuple
import pandas as pd
import numpy as np
import random
import tqdm

SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

## Data loading

Data comes from here: https://www.kaggle.com/hijest/genre-classification-dataset-imdb.

In [2]:
train_df = pd.read_csv(
    "./data/train_data.txt",
    engine="python",
    sep=" ::: ",
    names=["id", "movie", "genre", "summary"],
)

test_df = pd.read_csv(
    "./data/test_data_solution.txt",
    engine="python",
    sep=" ::: ",
    names=["id", "movie", "genre", "summary"],
)

In [3]:
# Viewing training data
train_df.head()

Unnamed: 0,id,movie,genre,summary
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


## Data splitting

In [4]:
# Split the data using train_test_split from sklearn
train_shuffled = train_df.sample(frac=1)
train_df_new, val_df = train_test_split(train_shuffled, test_size=0.1)

print(f"Number of training samples: {len(train_df_new)}.")
print(f"Number of validation samples: {len(val_df)}.")
print(f"Number of test examples: {len(test_df)}.")

Number of training samples: 48792.
Number of validation samples: 5422.
Number of test examples: 54200.


In [5]:
le = LabelEncoder()
le.fit(train_df_new["genre"].values) 

train_df_new["genre"] = le.transform(train_df_new["genre"].values)
val_df["genre"] = le.transform(val_df["genre"].values)
test_df["genre"] = le.transform(test_df["genre"].values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_new["genre"] = le.transform(train_df_new["genre"].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df["genre"] = le.transform(val_df["genre"].values)


## Data preprocessing utilities

In [6]:
def set_tokenizer(preprocessor_path: str) -> Callable:
    """ Decorator to set the desired tokenizer for a tokenizing
        function from a TensorFlow Hub URL.
        
    Arguments:
        preprocessor_path {str} -- URL of the TF-Hub preprocessor.
    
    Returns:
        Callable -- A function with the `tokenizer` attribute set.
    """

    def decoration(func: Callable):
        # Loading the preprocessor from TF-Hub
        preprocessor = hub.load(preprocessor_path)

        # Setting an attribute called `tokenizer` to
        # the passed function
        func.tokenizer = preprocessor.tokenize
        return func

    return decoration

In [7]:
def _bytes_feature(bytes_input: bytes) -> tf.train.Feature:
    """Encodes given data as a byte feature."""
    bytes_list = tf.train.BytesList(value=[bytes_input])
    return tf.train.Feature(bytes_list=bytes_list)


def _int_feature(int_input: int) -> tf.train.Feature:
    """Encoded given data as an integer feature."""
    int64_list = tf.train.Int64List(value=[int_input])
    return tf.train.Feature(int64_list=int64_list)

To know more about these utilities refer to the official guide [here](https://www.tensorflow.org/tutorials/load_data/tfrecord).

In [8]:
@set_tokenizer(
    preprocessor_path="https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
)
def _tokenize_text(text: str) -> Tuple[tf.RaggedTensor, int]:
    """Tokenizes text and returns text token and their length."""
    toks = _tokenize_text.tokenizer(tf.constant([text]))
    num_tokens = toks.flat_values.shape[-1]
    return toks, num_tokens


def serialize_composite(rt):
    """Serializes as a Ragged feature."""
    components = tf.nest.flatten(rt, expand_composites=True)
    return tf.io.serialize_tensor(
        tf.stack([tf.io.serialize_tensor(t) for t in components])
    ).numpy()

2021-12-15 14:40:01.367206: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-12-15 14:40:02.975438: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [9]:
def create_example(row):
    """Creates one TFRecord example."""
    summary = row["summary"]
    label = row["genre"]

    description = bytes(summary, encoding="utf-8")
    description_tokens, description_len = _tokenize_text(summary)

    feature = {
        "summary": _bytes_feature(description),
        "summary_tokens": _bytes_feature(serialize_composite(description_tokens)),
        "summary_tokens_len": _int_feature(description_len),
        "label": _int_feature(label),
    }
    feature = tf.train.Features(feature=feature)
    example = tf.train.Example(features=feature)
    return example


def write_tfrecords(file_name, data):
    """Serializes the data as string."""
    with tf.io.TFRecordWriter(file_name) as writer:
        for i, row in data.iterrows():
            example = create_example(row)
            writer.write(example.SerializeToString())

## Write to TFRecords

In [10]:
TFRECORDS_DIR = "tfrecords"
tf.io.gfile.makedirs(TFRECORDS_DIR)

In [11]:
def write_data(data, chunk_size, files_prefix):
    """Serializes data as TFRecord shards."""
    example_counter = 0
    chunk_count = 1
    for i in tqdm.tqdm(range(0, data.shape[0], chunk_size)):
        chunk = data.iloc[i : i + chunk_size, :]
        file_name = f"{TFRECORDS_DIR}/{files_prefix}-{chunk_count:02d}.tfrecord"
        write_tfrecords(file_name, chunk)
        example_counter += chunk.shape[0]
        chunk_count += 1
    return example_counter

In [12]:
CHUNK_SIZE = 100

In [13]:
train_example_count = write_data(train_df_new, CHUNK_SIZE, "train")
train_example_count

100%|█████████████████████████████████████████| 488/488 [00:54<00:00,  9.01it/s]


48792

In [14]:
val_example_count = write_data(val_df, CHUNK_SIZE, "val")
val_example_count

100%|███████████████████████████████████████████| 55/55 [00:05<00:00,  9.77it/s]


5422

In [15]:
test_example_count = write_data(test_df, CHUNK_SIZE, "test")
test_example_count

100%|█████████████████████████████████████████| 542/542 [00:57<00:00,  9.40it/s]


54200