<a href="https://colab.research.google.com/github/arnabd64/Transformers-NLP-Finetune/blob/main/notebooks/Sentiment_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Classification

__Sentiment Classification__ is the task of classifying a text into either `positive`, `negative` or `neutral`. We shall be using the `AutoModelForSequenceClassification` class provided by the [Transformers](https://github.com/huggingface/transformers) library.

## Dataset

The dataset is hosted on the Hugging Face Hub under the repository ID [mteb/amazon_polarity](https://huggingface.co/datasets/mteb/amazon_polarity). It contains a total of 4 million Amazon reviews, each labeled as one of two classes: `positive` or `negative`.

## Base Model

We will be using the `bert-tiny` model hosted on the Hugging Face Hub at [prajjwal1/bert-tiny](https://huggingface.co/prajjwal1/bert-tiny). It is a very small version of the BERT model. The model is only 17.8MB in size, which makes it quite easy to finetune on a free Google Colab GPU.

# Install Libraries

In [None]:
# Install the libraries used by this notebook; pip's output is redirected to
# install.log. The extras spec is quoted so the shell never treats the square
# brackets as a glob pattern.
! pip install --progress-bar=off \
    "transformers[torch]" \
    datasets \
    evaluate \
> install.log

# Login to Huggingface

In [None]:
import huggingface_hub

# Replace the placeholder with a personal access token before running this
# cell; the token is handed to `huggingface_hub.login` to authenticate.
HUGGINGFACE_TOKEN = "Paste your Huggingface Token"
huggingface_hub.login(token=HUGGINGFACE_TOKEN)

# Label Map

In [None]:
# Class names, ordered so that each name's position is its integer class id.
LABELS = ["negative", "positive"]

# name -> id. The mapping must be keyed by the label string; building it as
# {id: label} would make it an exact copy of ID2LABEL.
LABEL2ID = {label: idx for idx, label in enumerate(LABELS)}

# id -> name, the inverse of LABEL2ID.
ID2LABEL = dict(enumerate(LABELS))

# Encode the Labels

The label encoding for a Hugging Face dataset is done differently when compared to `pandas` or `scikit-learn`.

In [None]:
# `datasets` is not imported by any earlier cell, so bring it into scope here.
import datasets

# Target schema for the dataset: the review text stays a plain string, while
# `label_text` becomes a ClassLabel whose class names are taken from LABELS.
features = datasets.Features({
    "text": datasets.Value("string"),
    "label_text": datasets.ClassLabel(num_classes=len(LABELS), names=LABELS)
})

# Load the Dataset

In [None]:
# Re-importing is harmless if `datasets` is already in scope.
import datasets

# Load the Amazon polarity reviews from the Hub (the repository ID uses an
# underscore), drop the original `label` column, and cast the remaining
# columns to the `features` schema so that `label_text` is class-encoded.
# The encoded column is then renamed to `label`.
# NOTE(review): the rename assumes the Trainer only picks up a column called
# `label` (or `labels`) as the training target — confirm against the
# installed transformers version.
dataset = (
    datasets.load_dataset("mteb/amazon_polarity")
    .remove_columns(['label'])
    .cast(features)
    .rename_column("label_text", "label")
)

# Load Tokenizer & Model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Hub repository ID of the pretrained checkpoint used for both the tokenizer
# and the classification model.
BASE_MODEL = "prajjwal1/bert-tiny"
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Load the checkpoint for sequence classification with one output per entry
# in LABELS; the id/label maps are passed along to the model configuration.
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    num_labels=len(LABELS),
    id2label=ID2LABEL,
    label2id=LABEL2ID,
)

# Tokenize the Dataset

In [None]:
def tokenize(batch):
    """Tokenize the `text` column of a batch.

    Every sequence is truncated and padded to exactly 64 tokens, and the
    tokenizer is asked to return PyTorch tensors.
    """
    texts = batch['text']
    encoded = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=64,
        return_tensors="pt",
    )
    return encoded


# Apply the tokenizer batch-wise over the dataset.
dataset = dataset.map(tokenize, batched=True)

# Evaluation Metrics

In [None]:
import evaluate
import numpy as np

# Metric objects are loaded once here and reused on every evaluation call.
accuracy = evaluate.load("accuracy")
cola = evaluate.load("glue", "cola")


def compute_metrics(p):
    """Compute accuracy and Matthews correlation for an evaluation pass.

    `p` is unpacked into (predictions, labels). The predictions are reduced
    to class ids with an argmax over axis 1 before scoring.
    # assumes predictions are per-class scores of shape (n_samples, n_classes) — TODO confirm
    """
    scores, references = p
    predicted_ids = np.argmax(scores, axis=1)

    results = {}
    results["accuracy"] = accuracy.compute(
        predictions=predicted_ids, references=references
    )["accuracy"]
    results["matthews_correlation"] = cola.compute(
        predictions=predicted_ids, references=references
    )["matthews_correlation"]
    return results

# Training Hyperparameters

In [None]:
from transformers import TrainingArguments

In [None]:
# Training configuration. Checkpointing and evaluation share the same
# 10,000-step cadence, and the best checkpoint (by Matthews correlation) is
# reloaded when training ends.
args = TrainingArguments(
    # Model Saving
    output_dir = "bert-tiny-amazon",
    save_safetensors = True,
    save_strategy = "steps",
    save_steps = 10_000,

    # Model Evaluation
    # `eval_strategy` is the current name of the former `evaluation_strategy`
    # argument; the old spelling is deprecated and rejected by recent
    # transformers releases (requires transformers >= 4.41).
    eval_strategy = "steps",
    eval_steps = 10_000,
    report_to = ['tensorboard'],
    load_best_model_at_end = True,
    metric_for_best_model = "matthews_correlation",

    # Optimization
    max_steps = 1_000_000,
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 320,
    learning_rate = 5e-5,
    weight_decay = 5e-9,
    warmup_ratio = 0.1,
    lr_scheduler_type = 'cosine',
)

# Initiate Trainer

In [None]:
# `Trainer` is not imported by any earlier cell, so bring it into scope here.
from transformers import Trainer

# Wire the model, hyperparameters, tokenizer, data splits and metric function
# together. The `train` split is used for training and the `test` split for
# evaluation.
trainer = Trainer(
    model = model,
    args = args,
    tokenizer = tokenizer,
    train_dataset = dataset['train'],
    eval_dataset = dataset['test'],
    compute_metrics = compute_metrics
)

# Train the Model

In [None]:
# Run the training loop and keep whatever `train()` returns in `output`.
output = trainer.train()

# Upload Model to Huggingface

In [None]:
# Upload the trained model to the Huggingface Hub through the trainer.
trainer.push_to_hub()