This notebook implements two-category sentiment classification on the Stanford Sentiment Treebank dataset (SST-2). We fine-tune BERT with a linear classifier on the output of the [CLS] input token.

In [None]:
!pip install transformers

In [2]:
from nltk.tree import Tree
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
from torch.utils.data import Dataset, DataLoader

# Run on the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

Now we download the Stanford Sentiment Treebank data set.

In [3]:
import requests, zipfile, io
# Download the SST tree archive and unpack it into the working directory;
# the cells below read the extracted files from the trees/ folder.
SST_URL = 'https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip'

# The timeout keeps the cell from hanging indefinitely on a stalled connection.
r = requests.get(SST_URL, timeout=60)
# Fail loudly on an HTTP error instead of handing an error page to ZipFile,
# which would otherwise surface as a confusing BadZipFile exception.
r.raise_for_status()

# The context manager closes the in-memory archive once extraction is done.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall()

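The data is in a tree format with 5 categories of sentiment (labels 0–4). The following function reads in the data and flattens each tree into a string. It also reduces the 5 categories to just 2: labels 0 and 1 become negative (0), labels 3 and 4 become positive (1), and trees with the neutral label 2 are dropped.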

In [4]:
def sentiment_treebank_reader(filename):
    """Read a treebank file and return flattened sentences with binary labels.

    Each line of ``filename`` is parsed as one tree. The tree's root label is
    read as an integer and its leaves are joined with single spaces to form
    the sentence string.

    Root labels 0 and 1 are mapped to class 0, labels 3 and 4 to class 1.
    Trees with any other root label (e.g. 2) are skipped.

    Returns:
        (X, y): two parallel lists, the sentence strings and their 0/1 labels,
        in file order.
    """
    # Root label -> binary class; labels missing from the table are dropped.
    binary_class = {0: 0, 1: 0, 3: 1, 4: 1}

    X, y = [], []
    with open(filename, encoding='utf8') as f:
        for line in f:
            tree = Tree.fromstring(line)
            target = binary_class.get(int(tree.label()))
            if target is None:
                continue
            X.append(" ".join(tree.leaves()))
            y.append(target)
    return X, y

In [5]:
# Read the three splits extracted from the archive: sentence strings plus 0/1 labels.
X_str_train, y_train = sentiment_treebank_reader('trees/train.txt')
X_str_dev, y_dev = sentiment_treebank_reader('trees/dev.txt')
X_str_test, y_test = sentiment_treebank_reader('trees/test.txt')

We now define a dataset class in which the data consists of BERT-tokenized versions of the string data we just read in.

In [6]:
class SentimentDataset(Dataset):
    """Dataset that tokenizes sentences for BERT on access.

    Each item is a tuple ``(input_ids, attention_mask, label)`` of tensors
    already moved to the module-level ``device``.

    Args:
        strings: sequence of raw sentence strings.
        labels: sequence of integer class labels, parallel to ``strings``.
        max_length: length every example is padded/truncated to. ``None``
            (the default) uses the tokenizer's own maximum input length.

    Raises:
        ValueError: if ``strings`` and ``labels`` differ in length.
    """

    def __init__(self, strings, labels, max_length=None):
        # Catch misaligned inputs here rather than as an IndexError mid-epoch.
        if len(strings) != len(labels):
            raise ValueError(
                f'strings and labels must have the same length, '
                f'got {len(strings)} and {len(labels)}'
            )
        self.strings = strings
        self.labels = labels
        self.max_length = max_length
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

    def __getitem__(self, index):
        """Return the tokenized example at ``index`` as three tensors."""
        string = self.strings[index]
        label = self.labels[index]

        # Calling the tokenizer directly replaces the deprecated encode_plus.
        # padding='max_length' gives every example the same length so the
        # default DataLoader collation can stack them; truncation=True
        # guarantees no example exceeds that length.
        encoding = self.tokenizer(
            string,
            add_special_tokens=True,
            return_attention_mask=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
        )
        return (
            torch.tensor(encoding['input_ids']).to(device),
            torch.tensor(encoding['attention_mask']).to(device),
            torch.tensor(label, dtype=torch.long).to(device)
        )

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.strings)

In [None]:
# Number of examples per mini-batch, shared by all three loaders.
BATCH_SIZE = 16

train_set = SentimentDataset(X_str_train, y_train)
dev_set = SentimentDataset(X_str_dev, y_dev)
test_set = SentimentDataset(X_str_test, y_test)

# Shuffle only the training data so every epoch visits the examples in a new
# order; the evaluation metrics are sums over all examples, so order is
# irrelevant for the dev and test loaders.
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_set, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE)

The model is BERT with a linear classifier on the pooled [CLS] output. We will then fine-tune the entire model on the training data.

In [8]:
class SentimentClassifier(nn.Module):
    """Pretrained BERT encoder followed by a linear classification layer.

    Args:
        model_name: name of the pretrained BERT checkpoint to load. It should
            match the tokenizer used to produce the input ids.
        num_classes: number of output logits.
    """

    def __init__(self, model_name='bert-base-cased', num_classes=2):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        # Read the encoder width from its config instead of hard-coding 768,
        # so the head stays correct if a different checkpoint is loaded.
        self.classifier_layer = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, indices, mask):
        """Return the classification logits for a batch.

        Args:
            indices: token ids, passed to BERT as its first argument.
            mask: attention mask, passed to BERT as ``attention_mask``.
        """
        pooled = self.bert(indices, attention_mask=mask)['pooler_output']
        return self.classifier_layer(pooled)

In [None]:
# Build the classifier and place its parameters on the selected device.
model = SentimentClassifier().to(device)

# Cross-entropy loss over the classifier's output logits.
criterion = nn.CrossEntropyLoss().to(device)

# AdamW updates every parameter of the model (BERT encoder and linear head alike).
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)

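This helper function evaluates the loss and accuracy on a given dataset. Note that the returned loss is the sum of the per-batch losses, so it grows with the number of batches and is not directly comparable between datasets of different sizes.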

In [10]:
def evaluate_model(model, data_loader, criterion, length):
    """Compute the summed loss and the accuracy of ``model`` over a data loader.

    The model is switched to evaluation mode and left in that mode; gradients
    are disabled while iterating.

    Args:
        model: module called as ``model(inputs, masks)`` and returning logits.
        data_loader: iterable yielding ``(inputs, masks, labels)`` batches.
        criterion: loss function called as ``criterion(outputs, labels)``.
        length: number of examples used as the accuracy denominator.

    Returns:
        (total_loss, accuracy): the sum of the per-batch loss values, and the
        number of correct predictions divided by ``length``.

    Raises:
        ZeroDivisionError: if ``length`` is 0.
    """
    model.eval()
    total_loss = 0.0
    correct_predictions = 0
    with torch.no_grad():
        for inputs, masks, labels in data_loader:
            outputs = model(inputs, masks)
            _, preds = torch.max(outputs, dim=1)
            total_loss += criterion(outputs, labels).item()
            # Converting per batch keeps the counter a plain int, so an empty
            # loader returns cleanly instead of failing on int.item().
            correct_predictions += torch.sum(preds == labels).item()
    return total_loss, correct_predictions / length

Here is the training loop.

In [11]:
for epoch in range(2):
    print(f'Epoch {epoch + 1}')

    # evaluate_model leaves the model in eval mode, so training mode is
    # re-enabled at the start of every epoch.
    model.train()

    # One full pass over the training data, updating the weights after each batch.
    for inputs, masks, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs, masks)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Report summed loss and accuracy on both splits after the epoch's updates.
    train_loss, train_acc = evaluate_model(model, train_loader, criterion, len(train_set))
    dev_loss, dev_acc = evaluate_model(model, dev_loader, criterion, len(dev_set))
    print(f'Train loss {train_loss}   accuracy {train_acc}')
    print(f'Dev   loss {dev_loss}   accuracy {dev_acc}')

Epoch 1
Train loss 212.257440879941   accuracy 0.8419075144508671
Dev   loss 27.262029767036438   accuracy 0.8245412844036697
Epoch 2
Train loss 84.08612885326147   accuracy 0.9209537572254335
Dev   loss 17.401812944561243   accuracy 0.8658256880733946


In [12]:
# Final score on the held-out test split; the summed loss is not needed here and is discarded.
_, test_acc = evaluate_model(
    model=model,
    data_loader=test_loader,
    criterion=criterion,
    length=len(test_set),
)
print(f'Test accuracy {test_acc}')

Test accuracy 0.870950027457441


You can see how this compares with the state of the art (SOTA) over time [here](https://paperswithcode.com/sota/sentiment-analysis-on-sst-2-binary).