In [1]:
import torch
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
sequence = [
    "Hey, I am Anand.",
    "I am learning trainer."
]
batch = tokenizer(sequence,padding=True,truncation=True,return_tensors="pt")
batch["labels"] = torch.tensor([1,1])
optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

### padding=True
- This adds padding tokens so that all sequences in a batch are the same length.
- Transformers expect input of fixed size.
- If you input multiple sequences (e.g., a list), the tokenizer finds the longest one and pads the others to match.
- Padding tokens usually have a special ID like 0.
Without padding:

[101, 7592, 999, 102]
[101, 2054, 2024, 2017, 102]
With padding:

[101, 7592, 999, 102, 0, 0]
[101, 2054, 2024, 2017, 102]


## truncation=True
- This cuts off sequences that are too long for the model’s maximum input length (usually 512 tokens for BERT).

- Prevents input from exceeding the model's limits.

- Truncation occurs from the end by default (but you can customize).

Ex:
- "Hello world" → OK
- "A very very very very very ... long sentence" → will be cut off


In [5]:
from datasets import load_dataset
raw_datasets = load_dataset("glue","mrpc")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [6]:
raw_train_dataset = raw_datasets["train"]
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

This works well, but it has the disadvantage of returning a dictionary (with our keys, input_ids, attention_mask, and token_type_ids, and values that are lists of lists)

In [7]:
tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True,
)

In [8]:
tokenized_dataset.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [9]:
tokenized_dataset.input_ids

[[101,
  2572,
  3217,
  5831,
  5496,
  2010,
  2567,
  1010,
  3183,
  2002,
  2170,
  1000,
  1996,
  7409,
  1000,
  1010,
  1997,
  9969,
  4487,
  23809,
  3436,
  2010,
  3350,
  1012,
  102,
  7727,
  2000,
  2032,
  2004,
  2069,
  1000,
  1996,
  7409,
  1000,
  1010,
  2572,
  3217,
  5831,
  5496,
  2010,
  2567,
  1997,
  9969,
  4487,
  23809,
  3436,
  2010,
  3350,
  1012,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [101,
  9805,
  3540,
  11514,
  2050,
  3079,
  11282,
  2243,
  1005,
  1055,
  2077,
  4855,
  1996,
  4677,
  2000,
  3647,
  4576,
  1999,
  2687,
  2005,
  1002,
  1016,
  1012,
  1019,
  4551,
  1012,
  102,
  9805,
  3540,
  11514,
  2050,
  4149,
  11282,
  2243,
  1005,
  1055,
  1999,
  2786,
  2005,
  1002,
  6353,
  250

To keep the data as a dataset, we will use the Dataset.map() method. This also allows us some extra flexibility, if we need more preprocessing done than just tokenization.

In [10]:
def tokenize_function(example):
    return tokenizer(example["sentence1"],example["sentence2"],truncation=True)

In [11]:
tokenized_datasets = raw_datasets.map(tokenize_function,batched=True)
tokenized_datasets

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map: 100%|██████████| 408/408 [00:00<00:00, 6577.86 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

| Feature           | Padding (`max_length`) | Dynamic Padding (`DataCollatorWithPadding`) |
| ----------------- | ---------------------- | ------------------------------------------- |
| Padding Length    | Fixed                  | Varies per batch                            |
| Memory Efficiency | Low (if varied input)  | High                                        |
| Setup             | Simple                 | Slightly more complex                       |
| Recommended Use   | Static batches         | Training & inference for variable lengths   |


## Fixed Padding (e.g., pad to 8 tokens)
Both sequences are padded to 8 tokens, so the attention masks would look like this:
Text 1: [Hello, world, PAD, PAD, PAD, PAD, PAD, PAD]
Mask  : [  1  ,   1 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ]

Text 2: [Hugging, Face, makes, great, models, PAD, PAD, PAD]
Mask  : [   1  ,  1  ,   1 ,   1 ,    1 ,  0 ,  0 ,  0 ]

## Dynamic Padding (pad to longest in batch)
If we batch Text 1 and Text 2 together, the longest is 6 tokens, so we pad to 6:
Text 1: [Hello, world, PAD, PAD, PAD, PAD]
Mask  : [  1  ,   1 ,  0 ,  0 ,  0 ,  0 ]

Text 2: [Hugging, Face, makes, great, models, PAD]
Mask  : [   1  ,  1  ,   1 ,   1 ,    1 ,  0 ]


| Strategy    | Attention Mask Example for Text 1 | Padding Length |
| ----------- | --------------------------------- | -------------- |
| **Fixed**   | `[1, 1, 0, 0, 0, 0, 0, 0]`        | Always 8       |
| **Dynamic** | `[1, 1, 0, 0, 0, 0]`              | Just enough    |


The function that is responsible for putting together samples inside a batch is called a ```collate function```. It’s an argument you can pass when you build a DataLoader, the default being a function that will just convert your samples to PyTorch tensors and concatenate them (recursively if your elements are lists, tuples, or dictionaries).

In [12]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

 Transformers provides a ```Trainer``` class to help you fine-tune any of the pretrained models it provides on your dataset

## Training
The first step before we can define our Trainer is to define a ```TrainingArguments``` class that will contain all the hyperparameters the Trainer will use for training and evaluation

In [13]:
from transformers import TrainingArguments
training_args = TrainingArguments("test-trainer")

In [14]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    # processing_class=tokenizer
)

In [17]:
trainer.train()

Step,Training Loss
500,0.5802
1000,0.3932


TrainOutput(global_step=1377, training_loss=0.42202173702597356, metrics={'train_runtime': 366.3501, 'train_samples_per_second': 30.037, 'train_steps_per_second': 3.759, 'total_flos': 405114969714960.0, 'train_loss': 0.42202173702597356, 'epoch': 3.0})

## Evaluation

- The output of the ```predict()``` method is another named tuple with three fields: `predictions`, `label_ids`, and `metrics`. 
- The metrics field will just contain the loss on the dataset passed, as well as some time metrics (how long it took to predict, in total and on average). 
- Once we complete our `compute_metrics()` function and pass it to the Trainer, that field will also contain the metrics returned by compute_metrics().

In [20]:
predictions = trainer.predict(tokenized_datasets["validation"])

As you can see, predictions is a two-dimensional array with shape 408 x 2 (408 being the number of elements in the dataset we used).

In [26]:
print(predictions.predictions.shape)
print(predictions.label_ids)

(408, 2)
[1 0 0 1 0 1 0 1 1 1 1 0 0 1 1 1 1 0 1 0 0 1 0 1 1 1 0 1 1 1 0 1 1 1 1 0 0
 0 1 1 0 1 0 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1
 1 1 0 1 1 1 0 1 1 0 1 0 1 0 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 0 1 1 0 0 1 1
 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 0 1 1 0 0 1 0 1 0 0 1 0 0 1 1
 1 1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 0 1 1 0 1 1 0 0 1 1 0
 1 0 1 0 1 1 0 1 1 0 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 0 1 1 0 1 1 1 1 0 1 1 1
 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 1 1 1 1 1 0 1 1 1 0 1 0 1 0 1 1 1
 0 1 0 1 1 0 1 1 0 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 0
 0 0 1 1 1 1 1 1 1 1 0 1 0 0 0 1 1 0 1 0 1 1 0 0 0 0 0 1 0 1 0 1 1 1 1 1 0
 1 1 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 0 1 0 1 0 0 1 1 1 1 0 1 1 0 1 1 0 1 0 0
 1 1 1 1 0 0 0 0 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0
 1]


In [27]:
import numpy as np
preds = np.argmax(predictions.predictions,axis=-1)

In [30]:
import evaluate
metric = evaluate.load("glue","mrpc")
metric.compute(predictions=preds,references=predictions.label_ids)

Downloading builder script: 100%|██████████| 5.75k/5.75k [00:00<00:00, 5.78MB/s]


{'accuracy': 0.8504901960784313, 'f1': 0.8927943760984183}

Logits are the raw, unnormalized predictions that a machine learning model (especially a neural network) outputs before applying an activation function like softmax or sigmoid.

In [31]:
def compute_metrics(eval_preds):
    metric = evaluate.load("glue","mrpc")
    logits,labels = eval_preds
    predictions = np.argmax(logits,axis=-1)
    return metric.compute(predictions=predictions,references=labels)
    

In [34]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [35]:
trainer.train()

Step,Training Loss
500,0.2884
1000,0.1356


TrainOutput(global_step=1377, training_loss=0.17665886861652938, metrics={'train_runtime': 218.6617, 'train_samples_per_second': 50.324, 'train_steps_per_second': 6.297, 'total_flos': 405114969714960.0, 'train_loss': 0.17665886861652938, 'epoch': 3.0})