In [None]:
import torch
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Name of the pretrained checkpoint; the tokenizer and the model are both loaded from it.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# No num_labels is passed here (a later cell reloads the model with num_labels=2).
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [None]:
# Two toy sentences used for a single hand-written optimisation step.
sequence = [
    "Hey, I am Anand.",
    "I am learning trainer."
]

# Tokenize both sentences as one padded batch of PyTorch tensors and attach
# dummy labels (both examples labelled 1) so the model returns a loss.
batch = tokenizer(sequence, padding=True, truncation=True, return_tensors="pt")
batch["labels"] = torch.tensor([1, 1])

# Put the model and its inputs on the device selected earlier in the notebook.
model.to(device)
batch = batch.to(device)

# from_pretrained() hands back the model in evaluation mode (dropout disabled);
# switch to training mode before computing a training loss.
model.train()

optimizer = AdamW(model.parameters())
# Clear gradients left over from a previous run of this cell.
optimizer.zero_grad()
loss = model(**batch).loss
loss.backward()
optimizer.step()

### padding=True
- This adds padding tokens so that all sequences in a batch are the same length.
- All sequences in a batch must have the same length so they can be stacked into a single rectangular tensor.
- If you input multiple sequences (e.g., a list), the tokenizer finds the longest one and pads the others to match.
- The padding token has its own ID (0 for BERT's `[PAD]` token).

Without padding:

```
[101, 7592, 999, 102]
[101, 2054, 2024, 2017, 102]
```

With padding (the shorter sequence is padded to the length of the longest one):

```
[101, 7592, 999, 102, 0]
[101, 2054, 2024, 2017, 102]
```


## truncation=True
- This cuts off sequences that are too long for the model’s maximum input length (usually 512 tokens for BERT).

- Prevents input from exceeding the model's limits.

- Truncation occurs from the end by default (but you can customize).

Ex:
- "Hello world" → OK
- "A very very very very very ... long sentence" → will be cut off


In [None]:
from datasets import load_dataset

# Load the MRPC configuration of the GLUE benchmark, then display the splits.
raw_datasets = load_dataset(path="glue", name="mrpc")
raw_datasets

In [None]:
# Pull out the training split and inspect its columns and their types.
raw_train_dataset = raw_datasets["train"]
raw_train_dataset.features

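Tokenizing the whole training split in a single call (next cell) works, but it has the disadvantage of returning a plain dictionary (with the keys `input_ids`, `attention_mask`, and `token_type_ids`, and values that are lists of lists) instead of a dataset. It also requires holding the entire tokenized split in memory at once.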

In [None]:
# Tokenize every sentence pair of the training split in one call: pairs are
# padded to a common length and over-long pairs are truncated.
tokenized_dataset = tokenizer(
    raw_train_dataset["sentence1"],
    raw_train_dataset["sentence2"],
    truncation=True,
    padding=True,
)

In [None]:
# List the fields the tokenizer produced.
tokenized_dataset.keys()

In [None]:
# Show only the first two encoded pairs; displaying the ids of the whole
# training split would flood the cell output.
tokenized_dataset.input_ids[:2]

To keep the data as a dataset, we will use the Dataset.map() method. This also allows us some extra flexibility, if we need more preprocessing done than just tokenization.

In [None]:
def tokenize_function(example):
    """Tokenize the `sentence1`/`sentence2` pair(s) of `example` with truncation.

    No padding is requested here, so the encoded sequences keep their own lengths.
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)

In [None]:
# Apply tokenize_function to every split; batched=True hands it many examples per call.
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
)
tokenized_datasets

| Feature           | Padding (`max_length`) | Dynamic Padding (`DataCollatorWithPadding`) |
| ----------------- | ---------------------- | ------------------------------------------- |
| Padding Length    | Fixed                  | Varies per batch                            |
| Memory Efficiency | Low (if varied input)  | High                                        |
| Setup             | Simple                 | Slightly more complex                       |
| Recommended Use   | Static batches         | Training & inference for variable lengths   |


## Fixed Padding (e.g., pad to 8 tokens)
Both sequences are padded to 8 tokens, so the tokens and attention masks look like this:

```
Text 1: [Hello, world, PAD, PAD, PAD, PAD, PAD, PAD]
Mask  : [  1  ,   1 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ]

Text 2: [Hugging, Face, makes, great, models, PAD, PAD, PAD]
Mask  : [   1  ,  1  ,   1 ,   1 ,    1 ,  0 ,  0 ,  0 ]
```

## Dynamic Padding (pad to longest in batch)
If we batch Text 1 and Text 2 together, the longest is 5 tokens, so we pad to 5:

```
Text 1: [Hello, world, PAD, PAD, PAD]
Mask  : [  1  ,   1 ,  0 ,  0 ,  0 ]

Text 2: [Hugging, Face, makes, great, models]
Mask  : [   1  ,  1  ,   1 ,   1 ,    1 ]
```


| Strategy    | Attention Mask Example for Text 1 | Padding Length |
| ----------- | --------------------------------- | -------------- |
| **Fixed**   | `[1, 1, 0, 0, 0, 0, 0, 0]`        | Always 8       |
| **Dynamic** | `[1, 1, 0, 0, 0]`                 | Just enough    |


The function that is responsible for putting together samples inside a batch is called a ```collate function```. It’s an argument you can pass when you build a DataLoader, the default being a function that will just convert your samples to PyTorch tensors and concatenate them (recursively if your elements are lists, tuples, or dictionaries).

In [None]:
from transformers import DataCollatorWithPadding
# Collate function built around the tokenizer; it implements the dynamic padding described above.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Transformers provides a `Trainer` class to help you fine-tune any of the pretrained models it provides on your dataset.

## Training
The first step before we can define our Trainer is to define a ```TrainingArguments``` class that will contain all the hyperparameters the Trainer will use for training and evaluation

In [None]:
# Reference only (kept commented out): a fuller TrainingArguments configuration.
# NOTE(review): recent transformers releases call the evaluation argument
# `eval_strategy`; `evaluation_strategy` is the older name — confirm against the
# installed version before enabling this cell.
# from transformers import TrainingArguments

# training_args = TrainingArguments(
#     output_dir="./results",                   # Where to save model/checkpoints
#     evaluation_strategy="epoch",              # When to evaluate: "no", "steps", or "epoch"
#     per_device_train_batch_size=16,           # Batch size for training
#     per_device_eval_batch_size=16,            # Batch size for evaluation
#     num_train_epochs=3,                       # Total number of training epochs
#     weight_decay=0.01,                        # Weight decay for regularization
#     logging_dir="./logs",                     # Directory for storing logs
#     logging_steps=50,                         # Log every X steps
#     save_strategy="epoch",                    # Save checkpoints every epoch
#     load_best_model_at_end=True,              # Keep best model according to eval_metric
#     metric_for_best_model="accuracy",         # Metric to determine "best" model
#     save_total_limit=2                        # Keep only the last 2 models
# )


| Parameter                     | Purpose                                            | Recommended/Example        |
| ----------------------------- | -------------------------------------------------- | -------------------------- |
| `output_dir`                  | Directory to save checkpoints and final model      | `"./results"`              |
| `num_train_epochs`            | How many times the model will see the full dataset | `3` to `5` is common       |
| `per_device_train_batch_size` | Training batch size *per device* (GPU/CPU)         | `8`–`32`                   |
| `eval_strategy`               | When to evaluate (`"no"`, `"steps"`, or `"epoch"`); called `evaluation_strategy` in older releases | `"epoch"` is common        |
| `save_strategy`               | When to save model (same options as above)         | `"epoch"`                  |
| `learning_rate`               | Step size for gradient updates                     | Optional: `2e-5` to `5e-5` |
| `weight_decay`                | L2 regularization to prevent overfitting           | `0.01`                     |
| `logging_dir`                 | For TensorBoard logs                               | `"./logs"`                 |
| `load_best_model_at_end`      | Automatically restore best checkpoint              | `True`                     |
| `metric_for_best_model`       | Metric used for `load_best_model_at_end`           | e.g. `"accuracy"`          |


In [None]:
from transformers import TrainingArguments

# Only the output directory is set; every other hyperparameter keeps its default.
training_args = TrainingArguments(output_dir="test-trainer")

In [None]:
from transformers import AutoModelForSequenceClassification

# Reload the checkpoint so fine-tuning starts from fresh weights (the earlier `model`
# already took one optimizer step); num_labels=2 requests a two-way classification head.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
from transformers import Trainer

# First Trainer: no compute_metrics is attached yet, and the tokenizer is not
# handed over (padding is done by data_collator).
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # processing_class=tokenizer
)

In [None]:
# Fine-tune the model with the Trainer configured in the previous cell.
trainer.train()

## Evaluation

- The output of the ```predict()``` method is another named tuple with three fields: `predictions`, `label_ids`, and `metrics`. 
- The metrics field will just contain the loss on the dataset passed, as well as some time metrics (how long it took to predict, in total and on average). 
- Once we complete our `compute_metrics()` function and pass it to the Trainer, that field will also contain the metrics returned by compute_metrics().

In [None]:
# Run the trained model over the validation split; the result carries predictions, label_ids and metrics.
predictions = trainer.predict(tokenized_datasets["validation"])

As the next cell shows, `predictions.predictions` is a two-dimensional array of logits with shape 408 x 2 (408 being the number of examples in the validation set we passed to `predict()`).

In [None]:
# Shape of the raw model outputs, followed by the reference labels.
print(predictions.predictions.shape)
print(predictions.label_ids)

In [None]:
import numpy as np

# For every example, pick the class index with the highest logit.
logits = predictions.predictions
preds = np.argmax(logits, axis=-1)

In [None]:
import evaluate

# Load the GLUE/MRPC metric and score the argmax predictions against the reference labels.
metric = evaluate.load("glue", "mrpc")
metric.compute(
    predictions=preds,
    references=predictions.label_ids,
)

Logits are the raw, unnormalized predictions that a machine learning model (especially a neural network) outputs before applying an activation function like softmax or sigmoid.

In [None]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _load_mrpc_metric():
    """Load the GLUE/MRPC metric on first use and reuse it on every later call."""
    return evaluate.load("glue", "mrpc")


def compute_metrics(eval_preds):
    """Score one evaluation pass with the GLUE/MRPC metric.

    Args:
        eval_preds: A ``(logits, labels)`` pair. ``logits`` holds one row of
            raw class scores per example; it may also be a tuple whose first
            element is that array.

    Returns:
        The dictionary returned by the metric's ``compute`` for the argmax
        predictions against ``labels``.
    """
    logits, labels = eval_preds
    # Some models return extra outputs next to the logits; keep only the logits.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    return _load_mrpc_metric().compute(predictions=predictions, references=labels)
    

In [None]:
# Start over for the run that reports metrics. A Trainer built on the already
# fine-tuned `model` would merely continue its training, and the default
# TrainingArguments never evaluate during training, so train() would not call
# compute_metrics. Evaluate once per epoch and begin from a fresh checkpoint.
training_args = TrainingArguments("test-trainer", eval_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Train with the Trainer from the previous cell, which has compute_metrics attached.
trainer.train()