In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Training a Shallow Text Classifier with TF-IDF Preprocessing

<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/carted/handling-variable-length-text-tf/blob/main/bigram-tfidf-shallow-mlp.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/carted/handling-variable-length-text-tf/blob/main/bigram-tfidf-shallow-mlp.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>

## Setup

In [1]:
!gdown --id 1CvkRnGC8b_-n1NcbwcwxcIq7SusmDMb5 -O train_data.txt
!gdown --id 1h1evGF5NVi2p8RoWxl8xhpOod0ZN_-ky -O test_data_solution.txt 

Downloading...
From: https://drive.google.com/uc?id=1CvkRnGC8b_-n1NcbwcwxcIq7SusmDMb5
To: /content/train_data.txt
100% 35.4M/35.4M [00:00<00:00, 165MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1h1evGF5NVi2p8RoWxl8xhpOod0ZN_-ky
To: /content/test_data_solution.txt
100% 35.4M/35.4M [00:00<00:00, 165MB/s] 


In [1]:
from tensorflow import keras
import tensorflow as tf

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random
import tqdm

# One seed shared by every random number generator this notebook touches.
SEED = 42

random.seed(SEED)  # Python's built-in `random` module
np.random.seed(SEED)  # NumPy's global generator
tf.random.set_seed(SEED)  # TensorFlow's global seed

## Data loading

In [2]:
# Both files share one layout: four fields separated by the literal " ::: "
# and no header row, so the column names are supplied explicitly. A
# multi-character separator requires pandas' Python parsing engine.
read_options = dict(
    engine="python",
    sep=" ::: ",
    names=["id", "movie", "genre", "summary"],
)

train_df = pd.read_csv("train_data.txt", **read_options)
test_df = pd.read_csv("test_data_solution.txt", **read_options)

## Data splitting

In [3]:
# Shuffle the training frame, then hold out 10% of it for validation.
# The seed is passed explicitly to both calls so that re-running this cell
# (or running it out of order) reproduces the same partition instead of
# depending on the current state of NumPy's global random generator.
train_shuffled = train_df.sample(frac=1, random_state=SEED)
train_df_new, val_df = train_test_split(
    train_shuffled, test_size=0.1, random_state=SEED
)

print(f"Number of training samples: {len(train_df_new)}.")
print(f"Number of validation samples: {len(val_df)}.")
print(f"Number of test examples: {len(test_df)}.")

Number of training samples: 48792.
Number of validation samples: 5422.
Number of test examples: 54200.


## Data preprocessing

In [4]:
# Whitespace-delimited word count of every training summary; the largest
# count is kept as `max_seqlen`.
summary_word_counts = train_df_new["summary"].apply(lambda text: len(text.split()))
train_df_new["total_words"] = summary_word_counts
max_seqlen = summary_word_counts.max()
max_seqlen

1829

In [5]:
# TF-IDF featurizer over n-grams (ngrams=2) whose vocabulary is capped at
# `max_seqlen` entries.
# NOTE(review): `max_tokens` bounds the vocabulary size, yet it is fed the
# longest summary's word count -- confirm this coupling is intentional.
text_vectorizer = keras.layers.TextVectorization(
    max_tokens=max_seqlen,
    ngrams=2,
    output_mode="tf_idf",
)

# Fit the vectorizer on the training summaries only, with the adapt step
# pinned to the CPU.
with tf.device("/CPU:0"):
    text_vectorizer.adapt(train_df_new["summary"])

# Genre string -> integer class id, built from the genres seen in training.
genre_names = train_df_new["genre"].unique()
label_encoder = keras.layers.StringLookup(vocabulary=genre_names)
label_encoder.get_vocabulary()

['[UNK]',
 'short',
 'sci-fi',
 'documentary',
 'drama',
 'thriller',
 'comedy',
 'adult',
 'romance',
 'adventure',
 'western',
 'family',
 'talk-show',
 'news',
 'horror',
 'history',
 'music',
 'sport',
 'war',
 'animation',
 'game-show',
 'action',
 'crime',
 'reality-tv',
 'mystery',
 'musical',
 'fantasy',
 'biography']

In [6]:
# Number of examples grouped into each batch by `prepare_dataset`.
batch_size = 64
# tf.data autotuning sentinel; `prepare_dataset` passes it as the map
# parallelism and as the prefetch buffer size.
auto = tf.data.AUTOTUNE


def prepare_dataset(dataframe):
    """Turn a dataframe into a batched, vectorized `tf.data.Dataset`.

    Args:
        dataframe: Frame providing a "summary" column (raw text) and a
            "genre" column (label string).

    Returns:
        A dataset of `(text_vectorizer(summaries), label_encoder(genres))`
        batches of up to `batch_size` examples, cached after the mapping
        step and prefetched.
    """

    def encode_batch(summaries, genres):
        # Vectorize a whole batch at once: text -> features, genre -> id.
        return text_vectorizer(summaries), label_encoder(genres)

    raw_pairs = tf.data.Dataset.from_tensor_slices(
        (dataframe["summary"], dataframe["genre"])
    )
    batched = raw_pairs.batch(batch_size)
    encoded = batched.map(encode_batch, num_parallel_calls=auto)
    return encoded.cache().prefetch(auto)

In [7]:
# Build one batched, vectorized pipeline per split.
training_dataset, validation_dataset, test_dataset = map(
    prepare_dataset, (train_df_new, val_df, test_df)
)

# Sanity check: print the feature and label shapes of one training batch.
for sequences, labels in training_dataset.take(1):
    print(sequences.shape, labels.shape)

(64, 1829) (64,)


## Model utilities

In [8]:
def make_model():
    """Assemble the shallow MLP classifier (not compiled).

    Returns:
        A `keras.Sequential` model with two ReLU hidden layers (512 and 256
        units) followed by a softmax layer holding one unit per entry of
        `label_encoder`'s vocabulary.
    """
    num_classes = label_encoder.vocabulary_size()
    hidden_layers = [
        keras.layers.Dense(512, activation="relu"),
        keras.layers.Dense(256, activation="relu"),
    ]
    output_layer = keras.layers.Dense(num_classes, activation="softmax")
    return keras.Sequential(hidden_layers + [output_layer])

## Training and evaluation

In [9]:
epochs = 60

# Fresh model, configured for integer-encoded class labels.
shallow_mlp_model = make_model()
shallow_mlp_model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Train on the training split, reporting validation metrics every epoch.
history = shallow_mlp_model.fit(
    training_dataset,
    epochs=epochs,
    validation_data=validation_dataset,
)

# evaluate() yields the loss followed by the single compiled metric.
_, accuracy = shallow_mlp_model.evaluate(test_dataset)
print(f"Top-1 accuracy on the test set: {round(accuracy * 100, 2)}%.")

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60
Top-1 accuracy on the test set: 47.1%.
