In [None]:
!pip install transformers datasets evaluate

In [None]:
from datasets import load_dataset

# Load the dataset registered under the name "imdb". Later cells index it
# with the "train" and "test" split names and read a "text" field from each
# record.
imdb_dataset = load_dataset("imdb")

# Show the dataset object to inspect its splits and columns.
print(imdb_dataset)

In [None]:
from pprint import pprint

# Pretty-print the record at index 10 of the "train" split as a sanity check
# of what a single example looks like.
pprint(imdb_dataset["train"][10])

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the same checkpoint name that is later passed to
# TFAutoModelForSequenceClassification.from_pretrained, so the tokenizer and
# the model are taken from one checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [None]:
def preprocess(reviews):
  """Tokenize the "text" field of `reviews` with the notebook's tokenizer.

  Args:
    reviews: Mapping with a "text" entry. It is passed to `Dataset.map`
      with `batched=True`, so "text" is presumably a list of strings
      (verify against the `datasets` documentation).

  Returns:
    Whatever `tokenizer(...)` returns for those texts, with truncation
    enabled (`truncation=True`).
  """
  review_texts = reviews["text"]
  encoded = tokenizer(review_texts, truncation=True)
  return encoded

# Apply `preprocess` to the whole dataset object. `batched=True` hands
# `preprocess` groups of records per call rather than one record at a time.
# The result is stored under a new name, leaving `imdb_dataset` untouched.
tokenized_imdb_dataset = imdb_dataset.map(preprocess, batched=True)

In [None]:
from transformers import DataCollatorWithPadding

# Collator used as `collate_fn` when building the tf.data pipelines below.
# `return_tensors="tf"` requests TensorFlow tensors. Padding is presumably
# applied per batch (to the longest sequence in that batch) - confirm in the
# DataCollatorWithPadding documentation.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [None]:
import evaluate

# Load the metric registered as "accuracy"; `compute_metrics` calls its
# `.compute(predictions=..., references=...)` method.
accuracy = evaluate.load("accuracy")

In [None]:
import numpy as np

def compute_metrics(eval_pred):
  """Score model outputs with the notebook-level `accuracy` metric.

  Args:
    eval_pred: Two-item sequence unpacked as `(predictions, labels)`.
      `predictions` is reduced with argmax over axis 1, so it is assumed
      to be 2-D with one row per example (verify against the caller).

  Returns:
    The result of `accuracy.compute` for the argmax class indices versus
    `labels`.
  """
  scores, references = eval_pred
  predicted_classes = np.argmax(scores, axis=1)
  return accuracy.compute(predictions=predicted_classes, references=references)

In [None]:
# Class-index -> label-name mapping handed to the model config, plus its
# inverse. The inverse is derived from the forward mapping so the two cannot
# drift apart.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {name: index for index, name in id2label.items()}

In [None]:
from transformers import create_optimizer
import tensorflow as tf

# Training hyperparameters. `subset_size` is the number of examples selected
# from each split when the tf.data sets are built further down.
batch_size = 16
num_epochs = 5
subset_size = 1024

# Number of optimizer steps across the whole run; used as the length of the
# learning-rate schedule. Both factors are ints, so the product is an int.
steps_per_epoch = subset_size // batch_size
total_train_steps = steps_per_epoch * num_epochs

# Optimizer and its learning-rate schedule: initial LR 2e-5, no warmup steps.
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

In [None]:
from transformers import TFAutoModelForSequenceClassification

# Load the checkpoint as a two-class sequence classifier and attach the
# NEGATIVE/POSITIVE label mappings to its config, so predictions can be
# reported by name via `model.config.id2label`.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Shuffle each split once with a fixed seed so that the `subset_size`
# examples selected below are a reproducible sample rather than simply the
# first rows of the split.
shuffled_train_dataset = tokenized_imdb_dataset["train"].shuffle(seed=42)
shuffled_validation_dataset = tokenized_imdb_dataset["test"].shuffle(seed=42)

# Training pipeline.
# - `batch_size` comes from the hyperparameter cell instead of a second
#   hard-coded 16, so the batches stay in sync with `steps_per_epoch` and the
#   learning-rate schedule derived from it.
# - `shuffle=True` reorders the examples while training instead of replaying
#   one fixed batch order in every epoch. Per the `prepare_tf_dataset` docs,
#   `drop_remainder` defaults to the `shuffle` setting, which matches the
#   floor division used for `steps_per_epoch`.
tf_train_set = model.prepare_tf_dataset(
    shuffled_train_dataset.select(range(subset_size)),
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

# Validation pipeline: order does not matter for evaluation, so it stays
# unshuffled; it uses the same shared `batch_size`.
tf_validation_set = model.prepare_tf_dataset(
    shuffled_validation_dataset.select(range(subset_size)),
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [None]:
# Configure the model for training with the optimizer created earlier.
# No `loss` argument is passed here; per the Hugging Face documentation,
# Transformers TF models fall back to a built-in task-appropriate loss in
# that case - confirm for the installed version.
model.compile(optimizer=optimizer)

In [None]:
# Print the Keras summary of the compiled model for inspection.
model.summary()

In [None]:
from transformers.keras_callbacks import KerasMetricCallback

# Callback that evaluates `compute_metrics` on `tf_validation_set` during
# `model.fit` (presumably once per epoch - confirm in the KerasMetricCallback
# documentation).
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

In [None]:
# Fine-tune for `num_epochs` epochs on the training subset, validating on the
# validation subset and reporting accuracy through `metric_callback`.
model.fit(
    x=tf_train_set,
    validation_data=tf_validation_set,
    epochs=num_epochs,
    callbacks=[metric_callback],
)

In [None]:
# Single hand-written review used to check the fine-tuned model by eye.
text = (
    "Saw an early screening of this film at the Tilton Square Theatre in New Jersey, "
    "and I was completely blown away. From the opening scene all the way until the "
    "credits I never felt bored, which is impressive for a 2 hour and 45 minute film."
)

# Tokenize to TensorFlow tensors and run one forward pass.
inputs = tokenizer(text, return_tensors="tf")
outputs = model(**inputs)
logits = outputs.logits

# Take the highest-scoring class of the first (only) example and translate
# its index to a label name through the model config.
predicted_class_idx = tf.math.argmax(logits, axis=-1)[0]
predicted_label = model.config.id2label[predicted_class_idx.numpy()]
print("Predicted class:", predicted_label)