## Sentiment Analysis with Transformer model & Fine Tuning

In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification
)
from datasets import load_dataset
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- config ---

MODEL_NAME = 'distilbert-base-uncased'
BATCH_SIZE = 8
MAX_LEN = 256
LR = 5e-5
EPOCHS = 1
DEVICE  = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# --- load dataset ---

dataset = load_dataset('imdb')


In [4]:
# --- tokenizer ---

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [5]:
# --- Tokenization Function ---

def tokenize_fn(batch):
    return tokenizer(
        batch['text'], 
        padding = 'max_length', 
        truncation = True, 
        max_length = MAX_LEN
    )

In [6]:
# --- Tokenize dataset ---

tokenized_dataset = dataset.map(
    tokenize_fn, batched = True, 
    remove_columns = ['text']
)

In [7]:
# --- Torch Format ---

tokenized_dataset.set_format(
    type = 'torch', 
    columns = ['input_ids', 'attention_mask', 'label']
)

In [8]:
# --- DataLoaders ---

train_loader = DataLoader(
    tokenized_dataset['train'], 
    batch_size = BATCH_SIZE, 
    shuffle = True
)

test_loader = DataLoader(
    tokenized_dataset['test'], 
    batch_size = BATCH_SIZE
)

In [9]:
# --- Model ---

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
)

model.to(DEVICE)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [10]:
# --- Optimizer ---

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LR
)


In [11]:
# --- Training Loop ---

model.train()

for epoch in range(EPOCHS):
    loop = tqdm(train_loader, desc = f"Epoch {epoch+1}")

    for batch in loop:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['label'].to(DEVICE)


        outputs = model(
            input_ids = input_ids, 
            attention_mask = attention_mask, 
            labels = labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        loop.set_postfix(loss = loss.item())

Epoch 1: 100%|██████████| 3125/3125 [45:41<00:00,  1.14it/s, loss=0.0129] 


In [None]:
# --- Evaluation ---

model.eval()

correct = 0
total = 0

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["label"].to(DEVICE)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        preds = torch.argmax(outputs.logits, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

In [None]:
# --- Try our own sentence ---

model.eval()

texts = [
    "This movie was absolutely amazing",
    "I regret watching this film"
]

inputs = tokenizer(
    texts,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=MAX_LEN
).to(DEVICE)

with torch.no_grad():
    outputs = model(**inputs)

preds = torch.argmax(outputs.logits, dim=1)
print(preds.tolist())
