### full training
* source: https://huggingface.co/learn/nlp-course/chapter3/4?fw=pt
  

* Here we repeat what we did in section 3.3, this time without using the ```Trainer``` class.

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

In [2]:
# Load the 'mrpc' configuration of the 'glue' dataset (train / validation / test splits).
raw_datasets = load_dataset('glue', 'mrpc')

In [3]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenizer_function(example):
    """Tokenize `sentence1` and `sentence2` of `example` together as a pair.

    Used with `Dataset.map(..., batched=True)` below. Truncation is enabled
    here; padding is deliberately left to the data collator.
    """
    first_sentences = example['sentence1']
    second_sentences = example['sentence2']
    return tokenizer(first_sentences, second_sentences, truncation=True)


# Apply the tokenizer to every split of the dataset.
tokenized_datasets = raw_datasets.map(tokenizer_function, batched=True)

# Collator used later as the DataLoader `collate_fn` to pad each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



In [4]:
# Display the collator's repr to inspect its tokenizer and padding settings.
data_collator

DataCollatorWithPadding(tokenizer=BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [5]:
# Inspect the splits, columns and row counts of the raw dataset.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [6]:
# After tokenization each split also has input_ids, token_type_ids and attention_mask.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

### preparing for training
We need to apply a bit of postprocessing to our tokenized_datasets, to take care of some things that the Trainer did for us automatically. Specifically, we need to:
* Remove the columns corresponding to values the model does not expect (like the sentence1 and sentence2 columns).
* Rename the column label to labels (because the model expects the argument to be named labels).
* Set the format of the datasets so they return PyTorch tensors instead of lists.

* **Note:** tokenized_datasets has one method for each of these steps.

In [7]:
# Drop the raw-text and index columns, then rename `label` to `labels`
# (the argument name the model expects).
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(['sentence1', 'sentence2', 'idx'])
    .rename_column('label', 'labels')
)
# Return PyTorch tensors instead of Python lists when indexing the dataset.
tokenized_datasets.set_format('torch')
tokenized_datasets['train'].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [8]:
# the columns the model will accept are:
# 'attention_mask', 'input_ids', 'labels', 'token_type_ids'

In [31]:
# define the dataloader
from torch.utils.data import DataLoader

# Training batches are reshuffled on every pass; data_collator pads each batch.
train_dataloader = DataLoader(
    tokenized_datasets['train'], shuffle=True, batch_size=8, collate_fn=data_collator)
# Evaluation gains nothing from shuffling: the metric does not depend on sample
# order, and a fixed order keeps per-batch results reproducible between runs.
eval_dataloader = DataLoader(
    tokenized_datasets['validation'], shuffle=False, batch_size=8, collate_fn=data_collator)


In [32]:
# Peek at the first training example (a single, un-batched row).
data = next(iter(tokenized_datasets['train']))
print(data)

{'labels': tensor(1), 'input_ids': tensor([  101,  2572,  3217,  5831,  5496,  2010,  2567,  1010,  3183,  2002,
         2170,  1000,  1996,  7409,  1000,  1010,  1997,  9969,  4487, 23809,
         3436,  2010,  3350,  1012,   102,  7727,  2000,  2032,  2004,  2069,
         1000,  1996,  7409,  1000,  1010,  2572,  3217,  5831,  5496,  2010,
         2567,  1997,  9969,  4487, 23809,  3436,  2010,  3350,  1012,   102]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1])}


In [33]:
# Peek at one collated batch coming out of the training DataLoader.
data = next(iter(train_dataloader))
print(data)

{'labels': tensor([0, 1, 0, 0, 0, 1, 1, 1]), 'input_ids': tensor([[  101,  2019,  3063,  2040,  4993,  1037,  8275,  1000, 14046,  2659,
          3909,  9738,  1000,  3696,  2006,  1037,  2311,  2379,  2598,  5717,
          2056,  9857,  2008,  2002,  2106,  2025,  2812,  2000,  2125, 10497,
          3087,  1012,   102,  2019,  3063,  4993,  1037,  3696,  3752,  1036,
          1036, 14046,  2659,  3909,  9738,  1005,  1005,  2006,  1037,  2311,
          2379,  2598,  5717,  1010,  4963,  2075, 10638,  1998, 18385, 10821,
          1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  4404,  2003,  2028,  1997,  2260,  2163,  2008,  2515,  2025,
          2031,  1996,  2331,  6531,  1010,  2383,  8961,  3007,  7750, 

In [34]:
# Grab a single batch and show the shape of every tensor in it.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 82]),
 'token_type_ids': torch.Size([8, 82]),
 'attention_mask': torch.Size([8, 82])}

### Model

In [13]:
from transformers import AutoModelForSequenceClassification
# Show the auto-class being used.
print(AutoModelForSequenceClassification)
# Load the checkpoint with a 2-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

<class 'transformers.models.auto.modeling_auto.AutoModelForSequenceClassification'>


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Sanity check: run one batch through the untrained model and inspect
# the loss and the shape of the logits.
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.6083, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [15]:
#  AdamW, which is the same as Adam, but with a twist for weight decay regularization (see “Decoupled Weight Decay Regularization” by Ilya Loshchilov and Frank Hutter)

In [16]:
# The AdamW class bundled with transformers is deprecated and no longer shipped
# in recent releases; torch.optim.AdamW is the supported replacement.
# Note: torch's default weight_decay is 0.01 (the transformers class used 0.0).
from torch.optim import AdamW

In [17]:
# Optimize every model parameter with AdamW at a learning rate of 5e-5.
optimizer = AdamW(model.parameters(), lr=5e-5)



In [18]:
from transformers import get_scheduler

In [19]:
num_epochs = 3
# One scheduler step is taken per batch, so the total is batches-per-epoch x epochs.
num_training_steps = len(train_dataloader) * num_epochs

# Linear schedule with no warm-up over the whole training run.
lr_scheduler = get_scheduler(
    name='linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

1377


### Training loop

In [20]:
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)
device

device(type='cuda')

In [21]:
# tqdm.auto picks the notebook-friendly progress bar when available instead of
# printing a new text line for each refresh.
from tqdm.auto import tqdm

model.train()
# The context manager closes the bar when training ends, so progress bars
# created by later cells do not nest underneath this one.
with tqdm(range(num_training_steps)) as process_bar:
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            # Move every tensor of the batch to the same device as the model.
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            # Clear gradients so they do not accumulate into the next step.
            optimizer.zero_grad()
            process_bar.update(1)

  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████████████████████████████████████████████████████████████████████████| 1377/1377 [02:40<00:00,  8.00it/s]

### Evaluation step

In [22]:
import evaluate

In [23]:
metric = evaluate.load('glue', 'mrpc')
model.eval()
for batch in eval_dataloader:
    # Move the batch to the model's device before the forward pass.
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    # No gradients are needed for evaluation.
    with torch.no_grad():
        logits = model(**batch).logits
    # Predicted class = index of the largest logit.
    predictions = logits.argmax(dim=-1)
    metric.add_batch(predictions=predictions, references=batch['labels'])

metric.compute()

{'accuracy': 0.8725490196078431, 'f1': 0.9103448275862069}

### All at one place

Using the 🤗 ```Accelerate``` library, with just a few adjustments we can enable ```distributed training on multiple GPUs or TPUs```. Starting from the creation of the training and validation dataloaders, here is what our manual training loop looks like:

In [24]:
# transformers' bundled AdamW is deprecated and no longer shipped in recent
# releases; torch.optim.AdamW is the supported replacement.
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, get_scheduler

In [26]:
# tqdm.auto picks the notebook-friendly progress bar when available.
from tqdm.auto import tqdm

# Start again from the pretrained checkpoint with a fresh optimizer.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model.parameters(), lr=3e-5)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model.to(device)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
# Linear schedule with no warm-up; stepped once per batch below.
lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

model.train()
# Closing the bar via the context manager stops bars from different cells
# nesting under each other and flooding the cell output.
with tqdm(range(num_training_steps)) as progress_bar:
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            # Move every tensor of the batch to the model's device.
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            # Clear gradients so they do not accumulate into the next step.
            optimizer.zero_grad()
            progress_bar.update(1)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda




  0%|                                                                                         | 0/1377 [00:45<?, ?it/s][A[A


  0%|                                                                                 | 1/1377 [00:00<06:36,  3.47it/s][A[A

  0%|                                                                                 | 2/1377 [00:00<04:08,  5.54it/s][A[A

  0%|▏                                                                                | 3/1377 [00:00<03:24,  6.72it/s][A[A

  0%|▎                                                                                | 5/1377 [00:00<02:41,  8.50it/s][A[A

  0%|▎                                                                                | 6/1377 [00:00<02:38,  8.67it/s][A[A

  1%|▍                                                                                | 7/1377 [00:00<02:38,  8.67it/s][A[A

  1%|▍                                                                                | 8/1377 [00:01<02:36,

### Accelerator

In [44]:
# same training code using accelerator
from accelerate import Accelerator
# transformers' bundled AdamW is deprecated; torch.optim.AdamW replaces it.
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, get_scheduler


def training_function():
    """Fine-tune `checkpoint` on the training dataloader using Accelerate.

    Reads the notebook-level `checkpoint` and `train_dataloader`. The model,
    optimizer and accelerator created here are local to the function and are
    not returned.
    """
    accelerator = Accelerator()
    # num_labels=2 matches the other cells that build this model.
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    optimizer = AdamW(model.parameters(), lr=3e-5)
    # Let Accelerate handle device placement; no manual `.to(device)` is needed.
    # (The evaluation dataloader is not prepared here because this function
    # never evaluates.)
    train_dl, model, optimizer = accelerator.prepare(
        train_dataloader,
        model,
        optimizer
    )

    num_epochs = 3
    # Computed from the prepared dataloader, as in the original.
    num_training_steps = num_epochs * len(train_dl)
    lr_scheduler = get_scheduler(
        'linear',
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )
    # Show the bar on the local main process only, so multi-process runs do not
    # print one bar per process.
    progress_bar = tqdm(
        range(num_training_steps),
        disable=not accelerator.is_local_main_process,
    )

    model.train()
    for epoch in range(num_epochs):
        for batch in train_dl:
            outputs = model(**batch)
            loss = outputs.loss
            # accelerator.backward replaces loss.backward() under Accelerate.
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
    # Close the bar so later progress bars do not nest under it.
    progress_bar.close()

In [45]:
#> !accelerate config
#> !accelerate launch train.py

In [46]:
from accelerate import notebook_launcher
# Run training_function from inside the notebook with a single process.
notebook_launcher(training_function, num_processes=1)

Launching training on one GPU.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



  0%|                                                                                         | 0/1377 [00:00<?, ?it/s][A[A[A


  0%|                                                                               | 1/1377 [00:03<1:11:15,  3.11s/it][A[A[A


  0%|                                                                                 | 2/1377 [00:05<58:03,  2.53s/it][A[A[A


  0%|▏                                                                              | 3/1377 [00:08<1:01:42,  2.69s/it][A[A[A


  0%|▏                                                                              | 4/1377 [00:11<1:05:08,  2.85s/it][A[A[A


  0%|▎                                           

In [None]:
# `datasets.load_metric` is deprecated (removed in newer `datasets` releases);
# the `evaluate` library provides the same GLUE/MRPC metric.
import evaluate

# The accelerator created inside training_function is local to that function,
# so this cell needs its own instance.
accelerator = Accelerator()
metric = evaluate.load('glue', 'mrpc')

# NOTE(review): this evaluates the notebook-level `model` (the one trained by
# the manual loop above), because training_function does not return its model.
# Confirm that this is the model you intend to score.
# A separate name is used for the prepared dataloader so that re-running this
# cell does not overwrite the global `eval_dataloader`.
model, eval_dl = accelerator.prepare(model, eval_dataloader)
model.eval()
for batch in eval_dl:
    # No gradients are needed for evaluation.
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    # Gather predictions and labels from all processes before scoring.
    metric.add_batch(
        predictions=accelerator.gather(predictions),
        references=accelerator.gather(batch['labels']),
    )
metric.compute()