### **Fully training a model in plain PyTorch, without using the `Trainer` class of 🤗 Transformers.**

**Preprocessing**

In [1]:
# Preprocessing: load MRPC, tokenize the sentence pairs, and build a padding collator.
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "bert-base-uncased"
raw_datasets = load_dataset("glue", "mrpc")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the sentence1/sentence2 pair(s) in `example` with truncation enabled."""
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)


# Run the tokenizer over the raw datasets; batched=True passes several rows per call.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Collator that pads the inputs it receives dynamically; used below as the
# DataLoaders' collate_fn.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [2]:
# Keep only what the model accepts: drop the raw-text and index columns, rename
# "label" to "labels", and switch the output format to PyTorch tensors.
tokenized_datasets = tokenized_datasets.remove_columns(
    ["sentence1", "sentence2", "idx"]
).rename_column("label", "labels")
tokenized_datasets.set_format("torch")

# The remaining columns should be exactly:
# ["attention_mask", "input_ids", "labels", "token_type_ids"]
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [3]:
# Build the training and validation DataLoaders.
from torch.utils.data import DataLoader

# Both loaders use batches of 8 assembled by data_collator; only the training
# loader shuffles.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

In [4]:
# Sanity check on the data pipeline: take the first training batch and look at
# the shape of each tensor in it.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 78]),
 'token_type_ids': torch.Size([8, 78]),
 'attention_mask': torch.Size([8, 78])}

**Call model**

In [5]:
# Load the checkpoint with a 2-label sequence-classification head.
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

# Smoke test before training: run the batch fetched above through the model and
# print the loss together with the shape of the logits.
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor(0.6766, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [6]:
# Optimizer for all model parameters.
# `transformers.AdamW` is deprecated and has been removed from recent
# transformers releases, so PyTorch's own implementation is used instead.
# NOTE: torch.optim.AdamW defaults to weight_decay=0.01, whereas the transformers
# class defaulted to 0.0 -- pass weight_decay explicitly to reproduce the old setup.
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)



*We properly define our learning rate scheduler here:*

In [7]:
from transformers import get_scheduler

num_epochs = 3  # arbitrary choice
# The training loop steps the scheduler once per batch, so the total number of
# steps is (batches per epoch) * (epochs).
num_training_steps = len(train_dataloader) * num_epochs

# "linear" schedule attached to the optimizer, with no warmup steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)
print(num_training_steps)

1377


**The Training Loop**

In [8]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU, and move
# the model onto the chosen device.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [9]:
# Manual training loop. A tqdm progress bar sized to the total number of
# training steps is advanced by one after every batch.
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch onto the device the model lives on.
        batch = {key: tensor.to(device) for key, tensor in batch.items()}

        # Forward pass; the loss is read from the model output.
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass, parameter update, scheduler update, then reset gradients.
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)

  0%|          | 0/1377 [00:00<?, ?it/s]

**Evaluation Loop**

In [10]:
import evaluate

# Metric for GLUE/MRPC; predictions are accumulated batch by batch and the
# final scores are computed once at the end.
metric = evaluate.load("glue", "mrpc")

model.eval()
for batch in eval_dataloader:
    batch = {key: tensor.to(device) for key, tensor in batch.items()}

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        outputs = model(**batch)

    # Predicted class = index of the largest logit along the last dimension.
    logits = outputs.logits
    predictions = logits.argmax(dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.8382352941176471, 'f1': 0.8850174216027874}

**Using the Accelerate Library to supercharge the training loop**      
The training loop we defined earlier works fine on a single CPU or GPU. But using the 🤗 Accelerate library, with just a few adjustments we can enable distributed training on multiple GPUs or TPUs. Starting from the creation of the training and validation dataloaders, here is what our manual training loop looks like:

In [11]:
# Same training loop, driven by 🤗 Accelerate.
#
# Accelerate handles device placement, so the lines that moved the model and the
# batches to `device` are gone (alternatively, keep them and use
# `accelerator.device` instead of `device`).
#
# On a single GPU this takes about as long as the previous section: the benefit
# of this version only shows up when more than one device is available.
#
# `transformers.AdamW` is deprecated and removed from recent transformers
# releases, so the optimizer comes from `torch.optim` instead.
# NOTE: torch.optim.AdamW defaults to weight_decay=0.01 (the transformers class
# defaulted to 0.0) -- pass weight_decay explicitly to reproduce the old setup.
from accelerate import Accelerator
from torch.optim import AdamW
from tqdm.auto import tqdm  # imported here so the cell does not rely on an earlier cell
from transformers import AutoModelForSequenceClassification, get_scheduler

accelerator = Accelerator()

# Start again from a fresh model and optimizer.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model.parameters(), lr=3e-5)

# Hand the dataloaders, model and optimizer over to Accelerate.
# NOTE(review): this rebinds train_dataloader / eval_dataloader to the prepared
# objects, so re-running the cell passes already-prepared loaders to prepare()
# again -- re-create the DataLoaders first if you need to re-run.
train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

num_epochs = 3
# Computed after prepare() so the step count reflects the prepared dataloader.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        # accelerator.backward replaces the plain loss.backward() call.
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1377 [00:00<?, ?it/s]