## References

* https://www.kaggle.com/rohitganji13/film-genre-classification-using-nlp
* Internal (Carted) TFRecord utilities contributed by [Nilabhra Roy Chowdhury](https://www.linkedin.com/in/nilabhraroychowdhury/)

## Setup

In [None]:
!pip install -U sentence-splitter tensorflow-hub tensorflow_text -q

In [None]:
!gdown --id 1CvkRnGC8b_-n1NcbwcwxcIq7SusmDMb5 -O train_data.txt
!gdown --id 1h1evGF5NVi2p8RoWxl8xhpOod0ZN_-ky -O test_data_solution.txt 

In [None]:
import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sentence_splitter import split_text_into_sentences
from typing import List, Callable, Tuple, Dict
import pandas as pd
import numpy as np
import random
import tqdm

# Seed every random number generator used in this notebook (TensorFlow,
# NumPy and Python's built-in `random`) with the same value.
SEED = 42
for _seed_fn in (tf.random.set_seed, np.random.seed, random.seed):
    _seed_fn(SEED)

## Data loading

Data comes from here: https://www.kaggle.com/hijest/genre-classification-dataset-imdb.

In [None]:
# The train and test files share one layout: fields are delimited by " ::: "
# and the column names are supplied explicitly. The multi-character separator
# is why the Python parsing engine is requested.
_read_options = dict(
    engine="python",
    sep=" ::: ",
    names=["id", "movie", "genre", "summary"],
)

train_df = pd.read_csv("train_data.txt", **_read_options)
test_df = pd.read_csv("test_data_solution.txt", **_read_options)

In [None]:
# Preview the first few rows of the training data
train_df.head()

## Data splitting

In [None]:
# Shuffle the rows, then hold out 10% of them as the validation split using
# train_test_split from sklearn. SEED is passed explicitly to both calls so
# the resulting split does not depend on the state of the global NumPy RNG
# (i.e. on which cells happened to run before this one).
train_shuffled = train_df.sample(frac=1.0, random_state=SEED)
train_df, val_df = train_test_split(
    train_shuffled, test_size=0.1, random_state=SEED
)

print(f"Number of training samples: {len(train_df)}.")
print(f"Number of validation samples: {len(val_df)}.")
print(f"Number of test examples: {len(test_df)}.")

In [None]:
# Learn the genre -> integer id mapping from the training split only, then
# replace the genre column of every split with the encoded ids.
le = LabelEncoder().fit(train_df["genre"].values)

for _split_df in (train_df, val_df, test_df):
    _split_df["genre"] = le.transform(_split_df["genre"].values)

## Data preprocessing utilities

In [None]:
def set_tokenizer(preprocessor_path: str) -> Callable:
    """Build a decorator that attaches a TF-Hub tokenizer to a function.

    The preprocessor is loaded when the decorator is applied to a function.
    Its `tokenize` member is stored on that function as the `tokenizer`
    attribute, and the function itself is returned unwrapped.

    Arguments:
        preprocessor_path {str} -- URL of the TF-Hub preprocessor.

    Returns:
        Callable -- A decorator that sets the `tokenizer` attribute on the
            function it receives and returns that function.
    """

    def attach_tokenizer(target: Callable):
        # Load the preprocessor from TF-Hub and keep only its tokenizer
        tokenize_fn = hub.load(preprocessor_path).tokenize

        # Expose the tokenizer as an attribute of the decorated function
        setattr(target, "tokenizer", tokenize_fn)
        return target

    return attach_tokenizer

In [None]:
def _bytes_feature(bytes_input: bytes) -> tf.train.Feature:
    """Wraps one bytes value in a feature holding a single-element bytes list."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[bytes_input]))


def _ints_feature(int_input: List[int]) -> tf.train.Feature:
    """Encodes the given integers as an int64 feature.

    Args:
        int_input (List[int]): The integers to store. The value is passed
            as-is to `tf.train.Int64List(value=...)`, so it must be a
            collection of integers rather than a bare scalar; wrap a single
            integer in a list (e.g. `[label]`). Callers in this file also
            pass integer tensors (flat values and row splits of a ragged
            tensor).
    Returns:
        tf.train.Feature: A feature whose `int64_list` holds the integers.
    """
    int64_list = tf.train.Int64List(value=int_input)
    return tf.train.Feature(int64_list=int64_list)


def _ragged_feature(
    ragged_input: tf.RaggedTensor, name: str
) -> Dict[str, tf.train.Feature]:
    """Flattens a single ragged tensor into a dictionary of int64 features.

    The flat values are stored under `{name}_values`. The row splits of each
    ragged dimension are stored under `{name}_splits_{i}`, where `i` is the
    position of that dimension in `nested_row_splits`.
    """
    components = {f"{name}_values": _ints_feature(ragged_input.flat_values)}

    # One entry per ragged dimension, holding that dimension's boundaries
    components.update(
        {
            f"{name}_splits_{level}": _ints_feature(row_splits)
            for level, row_splits in enumerate(ragged_input.nested_row_splits)
        }
    )
    return components

To learn more about these utilities, refer to the official TFRecord guide [here](https://www.tensorflow.org/tutorials/load_data/tfrecord).

In [None]:
@set_tokenizer(
    preprocessor_path="https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
)
def _tokenize_text(text: List[str]) -> Tuple[tf.RaggedTensor, List[int]]:
    """Runs the attached tokenizer over a list of sentences.

    Args:
        text (List[str]): The sentences to tokenize.
    Returns:
        Tuple[tf.RaggedTensor, List[int]]: The ragged tensor produced by the
        tokenizer for all sentences, and a list with the number of tokens in
        each sentence.
    """
    sentences = tf.constant(text)
    ragged_tokens = _tokenize_text.tokenizer(sentences)

    # Iterating the ragged tensor yields one entry per sentence; the size of
    # its flat values is that sentence's token count.
    lengths = []
    for sentence_tokens in ragged_tokens:
        lengths.append(sentence_tokens.flat_values.shape[-1])
    return ragged_tokens, lengths


def get_serialized_text_features(features):
    """Converts the tokens and sentence lengths of a summary to int64 features.

    Args:
        features: Mapping with the tokenized sentences under "tokens" and the
            per-sentence token counts under "lens".
    Returns:
        A tuple of two feature dictionaries: the first for the tokens (keys
        prefixed with "summary_sentences"), the second for the lengths (keys
        prefixed with "summary_sentence_lens").
    """
    token_features = _ragged_feature(features["tokens"], "summary_sentences")

    # The lengths are wrapped in an outer list so that they form a ragged
    # tensor with a single row before being serialized.
    lens_ragged = tf.ragged.constant([features["lens"]])
    lens_features = _ragged_feature(lens_ragged, "summary_sentence_lens")

    return token_features, lens_features

In [None]:
def create_example(row):
    """Builds a `tf.train.Example` from one row of the dataset.

    Args:
        row: A row providing the summary text under "summary" and the label
            under "genre".
    Returns:
        A `tf.train.Example` holding the UTF-8 encoded summary, the number of
        sentences, the label, and the serialized tokens and sentence lengths.
    """
    summary_text = row["summary"]
    genre_label = row["genre"]

    summary_bytes = bytes(summary_text, encoding="utf-8")

    # Split the summary into sentences and tokenize each sentence
    sentences = split_text_into_sentences(summary_text, language="en")
    sentence_tokens, sentence_lens = _tokenize_text(sentences)

    token_features, lens_features = get_serialized_text_features(
        {"tokens": sentence_tokens, "lens": sentence_lens}
    )

    feature_map = {
        "summary": _bytes_feature(summary_bytes),
        "summary_num_sentences": _ints_feature([len(sentence_lens)]),
        "label": _ints_feature([genre_label]),
        **token_features,
        **lens_features,
    }
    return tf.train.Example(features=tf.train.Features(feature=feature_map))


def write_tfrecords(file_name, data):
    """Writes every row of `data` to one TFRecord file.

    Each row is converted with `create_example` and written to `file_name`
    in its serialized string form.
    """
    with tf.io.TFRecordWriter(file_name) as writer:
        for _, row in data.iterrows():
            writer.write(create_example(row).SerializeToString())

## Write to TFRecords

In [None]:
# Directory that the TFRecord shards are written to (see `write_data`)
TFRECORDS_DIR = "tfrecords-sentence-splitter"
tf.io.gfile.makedirs(TFRECORDS_DIR)

In [None]:
def write_data(data, chunk_size, files_prefix):
    """Writes `data` to TFRecord shards of at most `chunk_size` rows each.

    Shards are stored in `TFRECORDS_DIR` and named
    `{files_prefix}-{NN}.tfrecord`, numbered from 01 in row order.

    Returns:
        The total number of examples written across all shards.
    """
    written = 0
    row_starts = range(0, data.shape[0], chunk_size)
    for shard_number, start in enumerate(tqdm.tqdm(row_starts), start=1):
        shard = data.iloc[start : start + chunk_size, :]
        shard_path = f"{TFRECORDS_DIR}/{files_prefix}-{shard_number:02d}.tfrecord"
        write_tfrecords(shard_path, shard)
        written += shard.shape[0]
    return written

In [None]:
# Maximum number of examples stored in a single TFRecord shard
CHUNK_SIZE = 100

In [None]:
# Serialize the training split and display how many examples were written
train_example_count = write_data(
    data=train_df, chunk_size=CHUNK_SIZE, files_prefix="train"
)
train_example_count

In [None]:
# Serialize the validation split and display how many examples were written
val_example_count = write_data(
    data=val_df, chunk_size=CHUNK_SIZE, files_prefix="val"
)
val_example_count

In [None]:
# Serialize the test split and display how many examples were written
test_example_count = write_data(
    data=test_df, chunk_size=CHUNK_SIZE, files_prefix="test"
)
test_example_count