<a href="https://colab.research.google.com/github/adamelkholyy/imdb-sentiment-analysis/blob/main/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT for Sentiment Analysis on IMDB Movie Review Dataset
<strong>Adam El Kholy</strong> \
<strong>University of Bath</strong> \
Last Updated: <strong>02/12/2023</strong>

Free to use under the Apache 2.0 license \
For use in Google Colab with the [IMDB reviews](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews) dataset available on Kaggle 

The following notebook allows you to train and evaluate the BERT cased and uncased models for the task of sentiment analysis using on the IMDB movie dataset

See ```main.ipynb``` for further model evaluation on the IMDB dataset

In [None]:
import os
import torch
import numpy as np

# Loading the Dataset

In [None]:
def read_corpus(directory):
    files = [f for f in os.listdir(directory) if f.endswith('.txt')]
    corpus = []
    for file in files:
        with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
            document = f.read()
            corpus.append(document)
    return corpus

In [2]:
# unpack data from drive to google Colab due 
from google.colab import drive

drive.mount('/content/drive')
positive_corpus = read_corpus("drive/MyDrive/nlp/data/pos/")
negative_corpus = read_corpus("drive/MyDrive/nlp/data/neg/")
corpus = positive_corpus + negative_corpus
positive_labels = len(positive_corpus)
negative_labels = len(negative_corpus)
corpus_length = len(corpus)

# sanity check
print(positive_corpus[0][:202])
print(negative_corpus[1][:202])

I've long wanted to see this film, being a fan of both Peter Cushing and David McCallum. I agree that the romantic sub-plot was a waste of time, but the talent of McCallum shines through this juvie role
While this film certainly does possess the stench of a bad film, it's surprisingly watchable on several levels. First, for old movie fans, it's interesting to see the leading role played by Dean Jagger 


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split

r_seed = 563
np.random.seed(r_seed)

def get_test_train_dev_split(X):
    y = np.concatenate([np.ones(positive_labels), np.zeros(negative_labels)])

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80,
                                                        shuffle=True, random_state=r_seed)

    X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train,
                                                        test_size=0.15,
                                                       shuffle=True, random_state=r_seed)

    # 68% train, 12% validation, 20% test
    return X_train, y_train, X_test, y_test, X_dev, y_dev

# Training BERT

<strong>References</strong> <br>
[1] https://huggingface.co/transformers/v3.2.0/custom_datasets.html <br>
[2] https://huggingface.co/docs/transformers/tasks/sequence_classification <br>

In [None]:
! pip install -U accelerate
! pip install -U transformers

In [None]:
X_train, y_train, X_test, y_test, X_dev, y_dev = get_test_train_dev_split(corpus)

In [None]:
import accelerate
import transformers
from transformers import DistilBertTokenizerFast

"""
# uncased tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased') 
"""

# tokenise and encode the dataset
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
train_encodings = tokenizer(X_train, truncation=True, padding=True)
val_encodings = tokenizer(X_dev, truncation=True, padding=True)
test_encodings = tokenizer(X_test, truncation=True, padding=True)

In [None]:
# loads the set into the class wrapper for use by BERT
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).long()
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IMDbDataset(train_encodings, y_train)
val_dataset = IMDbDataset(val_encodings, y_dev)
test_dataset = IMDbDataset(test_encodings, y_test)

In [None]:
train_dataset[1]

{'input_ids': tensor([  101,  7595,   146,  1189,   170,  6223,  1105,   146,  3004,   128,
         11854,  1116,  1120,  1103,  2523,  5184,  1106,  2824,  1142,  8327,
          2764,  2008,  2523,   119,  1422,  1827,   132,   133,  9304,   120,
           135,   133,  9304,   120,   135,  2352,  1110,  1359,  1113,   123,
          1614,   132,   133,  9304,   120,   135,   133,  9304,   120,   135,
           122,   114, 27673,  1348,  1553,  1104, 10344,   131,  1249,  1122,
          5940,  1113,  1211,  1104,  1103,  1237,  9141,   117,  1103,  2432,
          6191,  2111,  1112,  1126,  6640,  1170,  3776,   123,  1137,   124,
          1614,  1164,  1103,  3141,  2754,   119,  1252, 16679,  1122,  1110,
          1136,  1536,   119, 14881,   180, 10038,  8124,  1105,   123,  2666,
          1104,  1168, 11785,  2144,   112,   189,  1294,   170,  1825,  2437,
           170,  2754,   119,  1370,  1859,   188, 22300,  1110,  1103,  2951,
          1104,  4044,  1297,  1107,  3

In [None]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [7]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=3,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,   
    warmup_steps=500,                
    weight_decay=0.01,              
    logging_dir='./logs',            
    logging_steps=10,
)

"""
# uncased model
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") 
"""
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-cased")

trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=train_dataset,        
    eval_dataset=val_dataset             
)

trainer.train()

TrainOutput(
global_step=510, 
training_loss=0.3883705898827198, 
metrics={
    'train_runtime': 374.1151, 
    'train_samples_per_second': 21.811, 
    'train_steps_per_second': 1.363, 
    'total_flos': 1080933973032960.0, 
    'train_loss': 0.3883705898827198, 
    'epoch': 3.0
})


# Evaluating BERT

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

def evaluate_model( test_labels, predictions):
    print(f"accuracy: {accuracy_score(test_labels, predictions)}")
    print(f"precision: {precision_score(test_labels, predictions)}")
    print(f"recall: {recall_score(test_labels, predictions)}")
    print(f"f1 score: {f1_score(test_labels, predictions)}")
    print()

In [8]:
# development set evaluation
predictions = trainer.predict(val_dataset)
predicted_classes = predictions.predictions.argmax(axis=1)
evaluate_model(y_dev, predicted_classes)

accuracy: 0.84375
precision: 0.8032786885245902
recall: 0.8789237668161435
f1 score: 0.8394004282655245


In [9]:
# test set evaluation
predictions = trainer.predict(test_dataset)
predicted_classes = predictions.predictions.argmax(axis=1)
evaluate_model(y_test, predicted_classes)

accuracy: 0.8825
precision: 0.8657407407407407
recall: 0.9121951219512195
f1 score: 0.8883610451306413
