In [1]:
from tensorflow import keras
import tensorflow as tf

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random
import tqdm

SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [2]:
train_df = pd.read_csv(
    "./data/train_data.txt",
    engine="python",
    sep=" ::: ",
    names=["id", "movie", "genre", "summary"],
)

test_df = pd.read_csv(
    "./data/test_data_solution.txt",
    engine="python",
    sep=" ::: ",
    names=["id", "movie", "genre", "summary"],
)

In [3]:
# Split the data using train_test_split from sklearn
train_shuffled = train_df.sample(frac=1)
train_df_new, val_df = train_test_split(train_shuffled, test_size=0.1)

print(f"Number of training samples: {len(train_df_new)}.")
print(f"Number of validation samples: {len(val_df)}.")
print(f"Number of test examples: {len(test_df)}.")

Number of training samples: 48792.
Number of validation samples: 5422.
Number of test examples: 54200.


In [4]:
train_df_new["total_words"] = train_df_new["summary"].str.split().str.len()
vocabulary_size = int(train_df_new["total_words"].max())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_new["total_words"] = train_df_new["summary"].str.split().str.len()


In [5]:
text_vectorizer = keras.layers.TextVectorization(
    max_tokens=vocabulary_size, ngrams=2, output_mode="tf_idf"
)
text_vectorizer.adapt(train_df_new["summary"])

label_encoder = keras.layers.StringLookup(vocabulary=train_df_new["genre"].unique())
label_encoder.get_vocabulary()

2021-12-20 14:08:42.829672: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-12-20 14:08:42.879975: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


['[UNK]',
 'short',
 'sci-fi',
 'documentary',
 'drama',
 'thriller',
 'comedy',
 'adult',
 'romance',
 'adventure',
 'western',
 'family',
 'talk-show',
 'news',
 'horror',
 'history',
 'music',
 'sport',
 'war',
 'animation',
 'game-show',
 'action',
 'crime',
 'reality-tv',
 'mystery',
 'musical',
 'fantasy',
 'biography']

In [10]:
batch_size = 32
auto = tf.data.AUTOTUNE


def prepare_dataset(dataframe):
    dataset = tf.data.Dataset.from_tensor_slices(
        (dataframe["summary"], dataframe["genre"])
    )
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(
        lambda summaries, genres: (text_vectorizer(summaries), label_encoder(genres)),
        num_parallel_calls=auto,
    )
    return dataset.prefetch(auto)

In [11]:
training_dataset = prepare_dataset(train_df_new)
validation_dataset = prepare_dataset(val_df)
test_dataset = prepare_dataset(test_df)


for sequences, labels in training_dataset.take(1):
    print(sequences.shape[0], labels.shape[0])

32 32


In [12]:
def make_model():
    shallow_mlp_model = keras.Sequential(
        [
            keras.layers.Dense(512, activation="relu"),
            keras.layers.Dense(256, activation="relu"),
            keras.layers.Dense(label_encoder.vocabulary_size(), activation="softmax"),
        ]
    )
    return shallow_mlp_model

In [14]:
epochs = 20

shallow_mlp_model = make_model()
shallow_mlp_model.compile(
    loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)

history = shallow_mlp_model.fit(
    training_dataset, validation_data=validation_dataset, epochs=epochs
)

_, accuracy = shallow_mlp_model.evaluate(test_dataset)
print(f"Top-1 accuracy on the test set: {round(accuracy * 100, 2)}%.")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Top-1 accuracy on the test set: 48.92%.
