# Overview

This notebook fine-tunes a pre-trained BERT language model (transfer learning) to perform sentiment analysis on IMDB movie reviews.

# Imports

In [1]:
import random
import numpy as np

import torch

from datasets import load_dataset

from transformers import BertTokenizer, BertForSequenceClassification

In [2]:
# Seed every random number generator in use so that results can be reproduced
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also switches cuDNN to deterministic mode and turns off its benchmark
    mode.

    Args:
        seed: Integer seed applied to every generator. Defaults to 42.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)               # PyTorch CPU
    # Seed every visible GPU, not only the current device. Safe to call on a
    # machine without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed()

In [3]:
# Directory used to save/load the model. The "Test Model" section loads both
# the model and its tokenizer from this path (relative to the working directory).
MODEL_DIR = "./model"

In [4]:
# Name of the pre-trained checkpoint. The tokenizer and the model are both
# loaded from this single name so that they stay compatible.
MODEL_NAME = "bert-base-uncased"

# Data

In [5]:
# Load the tokenizer belonging to the pre-trained checkpoint named by MODEL_NAME.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [6]:
dataset = load_dataset("IMDB")

def tokenize(batch):
    """Tokenize the 'text' column of a batch of reviews to a fixed length.

    Args:
        batch: Mapping with a 'text' entry holding the review strings.

    Returns:
        The tokenizer's encoding of the batch.
    """
    # padding="max_length" pads every entry to the tokenizer's maximum length.
    # (padding=True would only pad to the longest entry of *this* batch, so
    # different map batches could end up with different lengths.)
    # Truncation shortens entries which are too long.
    # Doing both of these ensures all entries have the same size.
    return tokenizer(batch['text'], padding="max_length", truncation=True)

# The encoded dataset is a tokenized version of the dataset which
# will be used for training.
dataset_encoded = dataset.map(tokenize, batched=True)

# Model

In [7]:
# Human-readable class names. Both directions of the mapping are passed so the
# model config's id2label and label2id stay consistent with each other.
ID2LABEL = {0: "NEGATIVE", 1: "POSITIVE"}
LABEL2ID = {name: idx for idx, name in ID2LABEL.items()}

# Pre-trained BERT encoder with a sequence-classification head sized to the label set.
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(ID2LABEL),
    id2label=ID2LABEL,
    label2id=LABEL2ID,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train

In [8]:
# Training hyperparameters (consumed by TrainingArguments below)
# Per-device batch size, used for both training and evaluation.
batch_size = 16
# Number of training epochs.
epochs = 3
# Learning rate handed to TrainingArguments.
learning_rate = 2e-5

In [9]:
from transformers import TrainingArguments

# Training configuration built from the hyperparameters defined above.
training_args = TrainingArguments(
    # Be explicit about where the Trainer writes its checkpoints: older
    # transformers releases require this argument, newer ones fall back to an
    # implicit default directory.
    output_dir=f"{MODEL_DIR}/checkpoints",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=learning_rate,
)

In [10]:
from transformers import Trainer

from sklearn.metrics import accuracy_score

# Metric callback handed to the Trainer so that evaluation reports accuracy.
def compute_metrics(eval_pred):
    """Compute the accuracy for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks as (predictions, labels). The predictions
            are reduced with an argmax over axis 1 (presumably per-class
            scores of shape (n_samples, n_classes); confirm against the Trainer).

    Returns:
        A dict with the single key "accuracy".
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return {"accuracy": accuracy_score(labels, predicted_classes)}

# Train on the "train" split and evaluate on the "test" split of the encoded dataset.
train_split = dataset_encoded["train"]
eval_split = dataset_encoded["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    compute_metrics=compute_metrics,
)

In [11]:
# Fine-tune the model.
train_output = trainer.train()

# Persist the result so the "Test Model" section can load it from MODEL_DIR
# without re-running training. The Trainer was not given the tokenizer, so the
# tokenizer is saved explicitly next to the model.
trainer.save_model(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

# Keep the training summary as the cell's displayed output.
train_output

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss
500,0.4184
1000,0.2554
1500,0.224
2000,0.1624
2500,0.1457
3000,0.1384
3500,0.0899
4000,0.0808
4500,0.0811


TrainOutput(global_step=4689, training_loss=0.1726846092925394, metrics={'train_runtime': 4158.2047, 'train_samples_per_second': 18.037, 'train_steps_per_second': 1.128, 'total_flos': 1.9733329152e+16, 'train_loss': 0.1726846092925394, 'epoch': 3.0})

In [12]:
# Run evaluation on the Trainer's eval dataset and display the resulting metrics.
eval_metrics = trainer.evaluate()
eval_metrics

{'eval_loss': 0.250809907913208,
 'eval_accuracy': 0.94276,
 'eval_runtime': 444.4,
 'eval_samples_per_second': 56.256,
 'eval_steps_per_second': 3.517,
 'epoch': 3.0}

# Test Model

This section tests a trained model. It loads the model and tokenizer from `MODEL_DIR`, so the training steps above do not need to be re-run if a model has previously been saved there.

In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Reload the saved model and its tokenizer from MODEL_DIR, independently of
# the objects created during training.
test_model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
test_tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)

In [14]:
# Wrap the reloaded model and tokenizer in a text-classification pipeline.
# device=-1 runs it on the CPU.
pipe = pipeline(
    task="text-classification",
    model=test_model,
    tokenizer=test_tokenizer,
    device=-1,
)

Device set to use cpu


In [15]:
# Two hand-written reviews, one clearly positive and one clearly negative,
# used as a quick sanity check of the pipeline.
tests = ["I loved this movie!", "This movie was bad!"]

for review in tests:
    prediction = pipe(review)
    # One input line, one output line, then a blank separator line.
    print(f"Input : {review}", f"Output: {prediction}", "", sep="\n")

Input : I loved this movie!
Output: [{'label': 'POSITIVE', 'score': 0.9971131086349487}]

Input : This movie was bad!
Output: [{'label': 'NEGATIVE', 'score': 0.9984671473503113}]

