## Sentiment Analysis with BERT

In [2]:
import pandas as pd
import numpy as np

In [1]:
import torch
import torch.nn as nn

from tqdm import tqdm

from torch.optim.lr_scheduler import StepLR
from torch.utils.data import Dataset, DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

## Load data

In [2]:
class CDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return {'text': self.texts[idx], 'label': self.labels[idx]}

In [11]:
subset = (90000, 10000)

In [3]:
def read_file(file_path, size=None):
    with open(file_path, 'r') as file:
        if size is None:
            return file.readlines()
        else:
            return file.readlines()[:size]

train_data = read_file("dataset/train.ft.txt", size=subset[0])
test_data = read_file("dataset/test.ft.txt", size=subset[1])

In [12]:
def row_split(text):
    return text[11:], int(text[9])-1

seed = 42

train_text, train_y = [None]*len(train_data), [None]*len(train_data)

for i, row in enumerate(train_data):
    train_text[i], train_y[i] = row_split(row)

test_text, test_y = [None]*len(test_data), [None]*len(test_data)

for i, row in enumerate(test_data):
    test_text[i], test_y[i] = row_split(row)

train_text, val_text, train_y, val_y = train_test_split(train_text, train_y, test_size=subset[1], random_state=seed, stratify=train_y)

# tokenizer

In [31]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
def create_dataset(texts, y):
    tokenized_texts = []

    for text in tqdm(texts):
        tokenized_texts.append(tokenizer(text, truncation=True, padding='max_length', max_length=256, return_tensors='pt'))

    input_ids = torch.cat([t['input_ids'] for t in tokenized_texts], dim=0)
    attention_mask = torch.cat([t['attention_mask'] for t in tokenized_texts], dim=0)
    labels = torch.tensor(y)

    dataset = TensorDataset(input_ids, attention_mask, labels)
    
    return dataset


In [16]:
train_dataset = create_dataset(train_text, train_y)
val_dataset = create_dataset(val_text, val_y)

100%|██████████| 80000/80000 [02:18<00:00, 579.53it/s]
100%|██████████| 10000/10000 [00:17<00:00, 578.50it/s]


In [23]:
batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [34]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [28]:
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_y), y=train_y)

weights= torch.tensor(class_weights, dtype=torch.float)

weights = weights.to(device)

In [None]:
def train(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0.0
    for batch in tqdm(dataloader, desc="Train"):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    return total_loss / len(dataloader)

In [None]:
def evaluate(model, dataloader, device):
    model.eval()

    all_labels = []
    all_preds = []

    total_loss = 0.0
    cor_preds, total = 0, 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validation"):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            preds = torch.argmax(outputs.logits, dim=1)

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

            cor_preds += torch.sum(preds == labels).item()
            total += labels.size(0)

    print(classification_report(preds.cpu().numpy(), labels.cpu().numpy()))
    return total_loss / len(dataloader), cor_preds / total

In [25]:
lr = 1e-4

optimizer = AdamW(model.parameters(), lr=lr)

scheduler = StepLR(optimizer, step_size=1, gamma=0.1)



In [25]:
epochs = 5
for epoch in range(epochs):
    train_loss = train(model, train_loader, optimizer, device)
    val_loss, val_accuracy = evaluate(model, val_loader, device)
    scheduler.step(val_accuracy)

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.2f}, Val Loss: {val_loss:.2f}, Val Accuracy: {val_accuracy:.2%}")
    model.save_pretrained(f"./fine_tuned_bert_epoch_{epoch+1}")

100%|██████████| 1667/1667 [11:12:14<00:00, 24.20s/it]  
100%|██████████| 209/209 [40:29<00:00, 11.63s/it]


              precision    recall  f1-score   support

           0       1.00      0.90      0.95        10
           1       0.86      1.00      0.92         6

    accuracy                           0.94        16
   macro avg       0.93      0.95      0.94        16
weighted avg       0.95      0.94      0.94        16

Epoch 1/2 - Train Loss: 0.0535 - Val Loss: 0.1473 - Val Accuracy: 95.03%


100%|██████████| 1667/1667 [11:10:05<00:00, 24.12s/it]  
100%|██████████| 209/209 [38:46<00:00, 11.13s/it]


              precision    recall  f1-score   support

           0       1.00      0.90      0.95        10
           1       0.86      1.00      0.92         6

    accuracy                           0.94        16
   macro avg       0.93      0.95      0.94        16
weighted avg       0.95      0.94      0.94        16

Epoch 2/2 - Train Loss: 0.0685 - Val Loss: 0.1473 - Val Accuracy: 95.03%


In [None]:
## TEST

In [36]:
loaded_model = BertForSequenceClassification.from_pretrained(f"./fine_tuned_bert")
loaded_model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [37]:
loaded_model.eval()

all_preds = []

with torch.no_grad():
  for text in tqdm(test_text):
      tokenized_sentence = tokenizer(text, truncation=True, padding='max_length', max_length=256, return_tensors='pt')
      input_ids = tokenized_sentence["input_ids"].to(device)
      attention_mask = tokenized_sentence["attention_mask"].to(device)

      outputs = loaded_model(input_ids, attention_mask=attention_mask)
      predictions = torch.argmax(outputs.logits, dim=1)
      all_preds.extend(predictions.cpu().numpy())

100%|██████████| 10000/10000 [09:45<00:00, 17.08it/s]


In [38]:
from sklearn.metrics import f1_score

f1_score(all_preds, test_y)

0.9528347758436253

In [None]:
#0.9528347758436253