## Prep

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

In [5]:
## Checkpoint name, shared by the tokenizer here and the model later on
checkpoint = "bert-base-uncased"
## Load Data
raw_datasets = load_dataset("glue", "mrpc")
## Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the ``sentence1`` / ``sentence2`` fields of ``example`` as a pair, with truncation enabled."""
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)


## Apply the tokenizer function to the datasets in batched mode
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
## Dynamic Padding: collator built from the tokenizer, used later as the DataLoader collate_fn
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Reusing dataset glue (/home/aaagraw/.cache/huggingface/datasets/glue/mrpc/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
Loading cached processed dataset at /home/aaagraw/.cache/huggingface/datasets/glue/mrpc/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-b4909a3b6343cfb5.arrow
Loading cached processed dataset at /home/aaagraw/.cache/huggingface/datasets/glue/mrpc/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-682d18abe3217505.arrow
Loading cached processed dataset at /home/aaagraw/.cache/huggingface/datasets/glue/mrpc/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-43276922819a4d3f.arrow


- Remove the columns corresponding to values the model does not expect (like the sentence1 and sentence2 columns).
- Rename the column label to labels (because the model expects the argument to be named labels).
- Set the format of the datasets so they return PyTorch tensors instead of lists.

In [6]:
## The in-place `remove_columns_` / `rename_column_` methods are deprecated and were
## removed from later `datasets` releases; the non-underscore variants return a new
## object, so the result is assigned back to the same name.
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
## Return PyTorch tensors instead of lists
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['attention_mask', 'input_ids', 'labels', 'token_type_ids']

## Defining Data Loaders

In [7]:
from torch.utils.data import DataLoader

In [8]:
## Settings shared by both loaders: batch size and the padding collator
loader_kwargs = dict(batch_size=8, collate_fn=data_collator)

## Only the training loader shuffles its examples
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(tokenized_datasets["validation"], **loader_kwargs)

In [9]:
## Grab the first training batch and show the shape of each tensor in it
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

  return torch.tensor(x, **format_kwargs)


{'attention_mask': torch.Size([8, 78]),
 'input_ids': torch.Size([8, 78]),
 'labels': torch.Size([8]),
 'token_type_ids': torch.Size([8, 78])}

## Loading Model

In [10]:
from transformers import AutoModelForSequenceClassification

In [12]:
## Sequence-classification model built from the same checkpoint as the tokenizer,
## configured with two output labels
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [13]:
## Sanity check: run the sample batch through the model once and
## print the loss together with the shape of the logits
outputs = model(**batch)
loss, logits = outputs.loss, outputs.logits
print(loss, logits.shape)

tensor(0.4769, grad_fn=<NllLossBackward>) torch.Size([8, 2])


## Defining Optimizer

In [15]:
## `transformers.AdamW` is deprecated and no longer shipped in recent releases;
## use the PyTorch implementation instead.
from torch.optim import AdamW

## eps and weight_decay are set explicitly to the defaults of the old
## `transformers.AdamW` (1e-6 and 0.0), because `torch.optim.AdamW` defaults to
## 1e-8 and 0.01 — this keeps the optimisation behaviour unchanged.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

Finally, the learning rate scheduler used by default is just a linear decay from the maximum value (5e-5) to 0. To properly define it, we need to know the number of training steps we will take, which is the number of epochs we want to run multiplied by the number of training batches (which is the length of our training dataloader). The Trainer uses three epochs by default, so we will follow that:

In [16]:
from transformers import get_scheduler

In [17]:
num_epochs = 3
## One optimizer step per training batch, repeated for every epoch
steps_per_epoch = len(train_dataloader)
num_training_steps = num_epochs * steps_per_epoch

## "linear" schedule with no warmup steps, spanning all training steps
scheduler_kwargs = {
    "optimizer": optimizer,
    "num_warmup_steps": 0,
    "num_training_steps": num_training_steps,
}
lr_scheduler = get_scheduler("linear", **scheduler_kwargs)
print(num_training_steps)

1377


## The training loop

Send the model to the GPU if one is available; otherwise fall back to the CPU.

In [18]:
import torch
## Use CUDA when it is available, CPU otherwise
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)
device

device(type='cuda')

We are now ready to train! To get some sense of when training will be finished, we add a progress bar over our number of training steps, using the tqdm library:

In [19]:
from tqdm.auto import tqdm

In [21]:
## One progress-bar tick per optimizer step
progress_bar = tqdm(range(num_training_steps))

## Put the model in training mode
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        ## Move every tensor of the batch to the selected device
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        ## Forward pass: the outputs carry both the loss and the logits
        outputs = model(**batch)
        loss = outputs.loss

        ## Backward pass, then update the parameters and the learning rate
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        ## Clear the gradients before the next batch
        optimizer.zero_grad()
        progress_bar.update(1)

HBox(children=(FloatProgress(value=0.0, max=1377.0), HTML(value='')))

The training loop above does not report any metrics, so it gives us no insight into how well the model is actually doing. For that we need an evaluation loop.

## The evaluation loop

In [24]:
from datasets import load_metric

## Metric object for the GLUE / MRPC task
metric = load_metric("glue", "mrpc")

## Switch the model to evaluation mode
model.eval()

## No gradients are needed for evaluation
with torch.no_grad():
    for batch in eval_dataloader:
        ## Move every tensor of the batch to the selected device
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)

        ## Predicted class = index of the largest logit along the last dimension
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        ## Accumulate this batch into the metric
        metric.add_batch(predictions=predictions, references=batch["labels"])

## Compute the final metric over all accumulated batches
metric.compute()

{'accuracy': 0.8651960784313726, 'f1': 0.9046793760831888}