<a href="https://colab.research.google.com/github/alturkim/nlp-notebooks/blob/main/Sentiment_Analysis_with_Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModel, AdamW, get_scheduler
from transformers import TrainingArguments, Trainer
from datasets import load_dataset, arrow_dataset
from datasets.arrow_dataset import Dataset

import evaluate
import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader

import numpy as np

from tqdm.auto import tqdm

from typing import Tuple, Dict

In [None]:
class Classifier(nn.Module):
    """Feed-forward classification head: Linear -> ReLU -> Dropout -> Linear.

    Args:
        input_dim: size of the last dimension of the incoming features.
        hidden_dim: width of the hidden layer.
        num_labels: number of output classes (size of the logits' last dimension).
        dropout: dropout probability applied after the activation.

    The defaults (768, 768, 2, 0.2) reproduce the originally hard-coded head.
    """

    def __init__(self, input_dim: int = 768, hidden_dim: int = 768,
                 num_labels: int = 2, dropout: float = 0.2):
        super().__init__()
        self.linear = nn.Linear(input_dim, hidden_dim)
        # Registered once instead of instantiating a new nn.ReLU on every forward pass.
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_dim, num_labels)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, features: torch.Tensor, labels: "torch.Tensor | None" = None):
        """Compute logits and, when labels are supplied, the cross-entropy loss.

        Args:
            features: float tensor whose last dimension equals ``input_dim``.
            labels: optional class-index tensor passed to ``nn.CrossEntropyLoss``.
                When omitted, no loss is computed so the model can be used for
                pure inference.

        Returns:
            ``(loss, logits)`` where ``loss`` is ``None`` if ``labels`` is ``None``.
        """
        hidden = self.activation(self.linear(features))
        hidden = self.dropout(hidden)
        logits = self.classifier(hidden)

        loss = None if labels is None else self.criterion(logits, labels)
        return loss, logits


In [None]:
# Metric objects are loaded lazily and reused, so repeated evaluations do not
# reload every metric script on each call.
_METRIC_CACHE = {}


def _as_numpy(values):
    """Return ``values`` as a NumPy array; torch tensors are detached and moved to CPU first."""
    if isinstance(values, torch.Tensor):
        return values.detach().cpu().numpy()
    return np.asarray(values)


def compute_metrics(eval_preds):
    """Compute precision, recall, F1 and accuracy from logits and reference labels.

    Args:
        eval_preds: a ``(logits, labels)`` pair (for example an ``EvalPrediction``
            named tuple). Each element may be a torch tensor on any device, a
            NumPy array, or a nested list; the class dimension of ``logits``
            must be the last one.

    Returns:
        dict mapping each metric name to the value reported by ``evaluate``.
    """
    logits, labels = eval_preds
    # argmax over the last axis turns per-class scores into predicted class ids.
    predictions = _as_numpy(logits).argmax(axis=-1)
    references = _as_numpy(labels)

    results = dict()
    for name in ("precision", "recall", "f1", "accuracy"):
        if name not in _METRIC_CACHE:
            _METRIC_CACHE[name] = evaluate.load(name)
        results.update(_METRIC_CACHE[name].compute(predictions=predictions, references=references))
    return results

class Trainer:
    """Minimal train/evaluate loop for a classifier fed with precomputed embeddings.

    NOTE: this class has the same name as ``transformers.Trainer`` imported at the
    top of the notebook and therefore shadows it from this cell onwards.

    Args:
        model: module called as ``model(features=..., labels=...)`` and returning
            ``(loss, logits)``.
        embed_column: key of the feature tensor inside each batch dict.
        train_dataloader: yields dict batches containing ``embed_column`` and ``"labels"``.
        eval_dataloader: same batch layout, used by :meth:`evaluation`.
        num_train_epochs: number of passes over ``train_dataloader``.
        lr: initial learning rate for AdamW (decayed linearly during training).
        device: device the model and every batch are moved to.
    """

    def __init__(self, model: nn.Module, embed_column: str,
                 train_dataloader: DataLoader, eval_dataloader: DataLoader,
                 num_train_epochs: int, lr: float, device: torch.device):
        self.model = model
        self.embed_column = embed_column
        self.train_dataloader = train_dataloader
        self.eval_dataloader = eval_dataloader
        self.num_train_epochs = num_train_epochs
        self.lr = lr
        self.device = device

        # Move the parameters first so the optimizer is built over the on-device tensors.
        self.model.to(self.device)
        self.optimizer = AdamW(self.model.parameters(), lr=self.lr)

    def _to_device(self, batch: dict) -> dict:
        """Return a copy of ``batch`` with every tensor moved to ``self.device``."""
        return {key: value.to(self.device) for key, value in batch.items()}

    def _evaluate_loader(self, dataloader) -> dict:
        """Run the model over every batch of ``dataloader`` and score all predictions at once."""
        self.model.eval()
        all_logits, all_labels = [], []
        with torch.no_grad():
            for batch in dataloader:
                batch = self._to_device(batch)
                _, logits = self.model(features=batch[self.embed_column], labels=batch["labels"])
                # Keep the accumulated results on CPU so device memory is released per batch.
                all_logits.append(logits.detach().cpu())
                all_labels.append(batch["labels"].detach().cpu())

        if not all_logits:
            # Nothing to score: return an empty result instead of failing on torch.cat([]).
            return {}
        return compute_metrics((torch.cat(all_logits), torch.cat(all_labels)))

    def train(self) -> None:
        """Train for ``num_train_epochs`` epochs, printing evaluation metrics after each one."""
        num_training_steps = self.num_train_epochs * len(self.train_dataloader)
        progress_bar = tqdm(range(num_training_steps))
        lr_scheduler = get_scheduler(
            "linear",
            optimizer=self.optimizer,
            num_warmup_steps=0,
            num_training_steps=num_training_steps,
        )
        for epoch in range(self.num_train_epochs):
            # evaluation() switches to eval mode, so re-enable training mode every epoch.
            self.model.train()
            for batch in self.train_dataloader:
                batch = self._to_device(batch)

                loss, _ = self.model(features=batch[self.embed_column], labels=batch["labels"])
                loss.backward()
                self.optimizer.step()
                lr_scheduler.step()
                self.optimizer.zero_grad()
                progress_bar.update(1)

            # evaluate after each epoch
            print(f"evaluate ... epoch:{epoch}")
            results = self.evaluation()
            print(results)
        progress_bar.close()

    def evaluation(self) -> dict:
        """Return metrics computed over the whole ``eval_dataloader`` ({} if it is empty)."""
        return self._evaluate_loader(self.eval_dataloader)

    def predict(self, dataset, batch_size: int = 8) -> dict:
        """Return metrics computed over every example of ``dataset``.

        Args:
            dataset: indexable dataset whose items provide ``embed_column`` and ``"labels"``.
            batch_size: number of examples per forward pass (default 8, as before).
        """
        # Order is irrelevant for aggregate metrics, so do not shuffle.
        dataloader = DataLoader(dataset, shuffle=False, batch_size=batch_size)
        return self._evaluate_loader(dataloader)

In [None]:
def tokenize_and_embed(batch: Dict[str, list]) -> Dict[str, torch.Tensor]:
    """Embed a batch of texts with the frozen language model.

    Relies on the notebook-level globals ``tokenizer``, ``lm`` and ``device``.

    Args:
        batch: mapping of column name to a list of values, as handed over by
            ``Dataset.map(..., batched=True)``; only ``batch["text"]`` is read.

    Returns:
        ``{"cls_embed": tensor}`` holding, for every text, the hidden state at
        sequence position 0 of the model's last layer.
    """
    # Pad only up to the longest text of this batch: padded positions are masked out
    # by the attention mask, so padding all the way to the model maximum only adds
    # compute without changing which tokens the first position attends to.
    encoded = tokenizer(batch["text"], truncation=True,
                        padding=True, return_tensors="pt")
    with torch.no_grad():
        outputs = lm(input_ids=encoded["input_ids"].to(device),
                     attention_mask=encoded["attention_mask"].to(device))

    # Position 0 is the first token of every sequence; move the result off the
    # accelerator before handing it back to `datasets`.
    cls_embed = outputs.last_hidden_state[:, 0].cpu()
    return {"cls_embed": cls_embed}

In [None]:
def train_eval_test_split(dataset : Dataset) -> Dict[str, Dataset]:
    """Split ``dataset`` into stratified train / eval / test parts.

    40% of the data is held out first and that hold-out is then cut in half,
    both times stratifying on the ``"labels"`` column with seed 10.

    Returns:
        dict with the keys ``"train"``, ``"eval"`` and ``"test"`` (in that order).
    """
    holdout_split = dataset.train_test_split(
        test_size=0.4, stratify_by_column="labels", seed=10
    )
    # The held-out part is divided evenly between evaluation and test.
    eval_test_split = holdout_split["test"].train_test_split(
        test_size=0.5, stratify_by_column="labels", seed=10
    )
    return {
        "train": holdout_split["train"],
        "eval": eval_test_split["train"],
        "test": eval_test_split["test"],
    }

In [None]:
def get_stat(dataset: "Dataset") -> None:
    """Print how many labels equal 1 (positive) and 0 (negative), with percentages.

    Args:
        dataset: any object whose ``dataset["labels"]`` is an iterable of labels.
            Labels other than 0 and 1 are ignored.

    When there are no 0/1 labels at all, both percentages are reported as 0.00%
    instead of raising ``ZeroDivisionError``.
    """
    pos_count = 0
    neg_count = 0
    # Single pass with `==` comparisons (no hashing), so any label type that
    # compares equal to 1 or 0 is counted.
    for label in dataset["labels"]:
        if label == 1:
            pos_count += 1
        elif label == 0:
            neg_count += 1

    total = pos_count + neg_count
    pos_pct = pos_count / total if total else 0.0
    neg_pct = neg_count / total if total else 0.0
    print(f"There are: \n{pos_count} positive reviews.\n{neg_count} negative reviews.")
    print(f"Percentage of positive reviews: {pos_pct*100:.2f}%")
    print(f"Percentage of negative reviews: {neg_pct*100:.2f}%")

In [None]:
if __name__ == "__main__":
    lm_checkpoint = "distilbert-base-multilingual-cased"
    EMBED_BATCH_SIZE = 8   # texts embedded per language-model forward pass
    LOADER_BATCH_SIZE = 8  # examples per classifier batch

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    # The dataset ships a single "train" split; its "polarity" column is renamed to "labels".
    raw_dataset = load_dataset("ar_res_reviews", split="train").rename_column("polarity", "labels")

    split_datasets = train_eval_test_split(raw_dataset)

    for split, data in split_datasets.items():
        print(f"{split} data stats")
        get_stat(data)
        print()

    tokenizer = AutoTokenizer.from_pretrained(lm_checkpoint)
    lm = AutoModel.from_pretrained(lm_checkpoint)

    lm.to(device)
    # The language model is only used as a fixed feature extractor: make eval mode explicit.
    lm.eval()

    # Replace the raw text with the precomputed embedding and keep only tensor columns.
    tokenized_datasets = dict()
    for split in ["train", "eval", "test"]:
        tokenized_datasets[split] = split_datasets[split].map(
            tokenize_and_embed, batched=True, batch_size=EMBED_BATCH_SIZE)
        tokenized_datasets[split] = tokenized_datasets[split].remove_columns(["text", "restaurant_id", "user_id"])
        tokenized_datasets[split] = tokenized_datasets[split].with_format("torch")

    train_dataloader = DataLoader(
        tokenized_datasets["train"],
        shuffle=True,
        batch_size=LOADER_BATCH_SIZE)

    # Evaluation order does not matter, so the loader is not shuffled.
    eval_dataloader = DataLoader(
        tokenized_datasets["eval"],
        shuffle=False,
        batch_size=LOADER_BATCH_SIZE)


Downloading builder script:   0%|          | 0.00/3.11k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/4.81k [00:00<?, ?B/s]

Downloading and preparing dataset ar_res_reviews/default to /root/.cache/huggingface/datasets/ar_res_reviews/default/0.0.0/f303714dc96c8056d45dca8950e5b7fe6ad59b88d0c095e07724e0484824031c...


Downloading data:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8364 [00:00<?, ? examples/s]

Dataset ar_res_reviews downloaded and prepared to /root/.cache/huggingface/datasets/ar_res_reviews/default/0.0.0/f303714dc96c8056d45dca8950e5b7fe6ad59b88d0c095e07724e0484824031c. Subsequent calls will reuse this data.
train data stats
There are: 
3567 positive reviews.
1451 negative reviews.
Percentage of positive reviews: 71.08%
Percentage of negative reviews: 28.92%

eval data stats
There are: 
1190 positive reviews.
483 negative reviews.
Percentage of positive reviews: 71.13%
Percentage of negative reviews: 28.87%

test data stats
There are: 
1189 positive reviews.
484 negative reviews.
Percentage of positive reviews: 71.07%
Percentage of negative reviews: 28.93%



Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/628 [00:00<?, ?ba/s]

  0%|          | 0/210 [00:00<?, ?ba/s]

  0%|          | 0/210 [00:00<?, ?ba/s]

In [None]:
if __name__ == "__main__":
    SEED = 10
    NUM_TRAIN_EPOCHS = 5
    LEARNING_RATE = 5e-5

    # Seed torch's global RNG so weight initialisation, dropout and batch shuffling
    # are repeatable when the notebook is re-run from a fresh kernel.
    torch.manual_seed(SEED)

    model = Classifier()
    trainer = Trainer(model=model,
                      embed_column="cls_embed",
                      train_dataloader=train_dataloader,
                      eval_dataloader=eval_dataloader,
                      num_train_epochs=NUM_TRAIN_EPOCHS,
                      lr=LEARNING_RATE,
                      device=device)

    # Baseline on the test split with the randomly initialised classifier.
    print("Evaluating before training ... ")
    result = trainer.predict(tokenized_datasets["test"])
    print(result)

    print("training ... ")
    trainer.train()

    # Same test split again, now with the trained classifier.
    print("Evaluating after training ... ")
    result = trainer.predict(tokenized_datasets["test"])
    print(result)
    



Evaluating before training ... 


Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

{'precision': 0.875, 'recall': 1.0, 'f1': 0.9333333333333333, 'accuracy': 0.875}
training ... 


  0%|          | 0/3140 [00:00<?, ?it/s]

evaluate ... epoch:0
{'precision': 0.625, 'recall': 1.0, 'f1': 0.7692307692307693, 'accuracy': 0.625}
evaluate ... epoch:1
{'precision': 0.75, 'recall': 1.0, 'f1': 0.8571428571428571, 'accuracy': 0.75}
evaluate ... epoch:2
{'precision': 0.75, 'recall': 1.0, 'f1': 0.8571428571428571, 'accuracy': 0.75}
evaluate ... epoch:3
{'precision': 0.8571428571428571, 'recall': 1.0, 'f1': 0.923076923076923, 'accuracy': 0.875}
evaluate ... epoch:4
{'precision': 0.75, 'recall': 1.0, 'f1': 0.8571428571428571, 'accuracy': 0.75}
Evaluating after training ... 
{'precision': 0.875, 'recall': 1.0, 'f1': 0.9333333333333333, 'accuracy': 0.875}
