<a href="https://colab.research.google.com/github/amir-asari/Introduction_to_Huggingface/blob/main/3_FineTuningaPretrainedModel/4_Fine_Tuning_Models_with_DIY_Training_Loop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Prerequisite Setup for the DIY Training Loop

In [None]:
!pip install datasets torchinfo accelerate evaluate

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.ma

In [None]:
import accelerate   as huggingface_accelerate   # Easy readability
import datasets     as huggingface_datasets     # Easy readability
import evaluate
import torch
import torchinfo
import transformers as huggingface_transformers # Easy readability

from tqdm.auto import tqdm

In [None]:
# GLUE benchmark, "mrpc" configuration; the recorded log shows train, validation and test splits.
raw_datasets = huggingface_datasets.load_dataset("glue", "mrpc")

# A single checkpoint name is shared by the tokenizer here and by the model
# loaded in later cells.
checkpoint = "bert-base-uncased"
tokenizer = huggingface_transformers.AutoTokenizer.from_pretrained(checkpoint)
def tokenize_function(example):
    """Tokenize the "sentence1" / "sentence2" fields of `example` as a pair.

    Both fields are handed to the notebook-level `tokenizer` together, with
    truncation enabled, and whatever the tokenizer returns is passed back.
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)

# Run tokenize_function over every split; batched=True passes it groups of rows
# rather than one row per call.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Collator handed to the DataLoaders below; padding is applied when a batch is
# assembled (dynamic padding, per the class's documentation).
data_collator = huggingface_transformers.DataCollatorWithPadding(tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/649k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [None]:
# Keep only what the model's forward() takes: drop the raw text / index columns
# and rename "label" to "labels".
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["sentence1", "sentence2", "idx"])
    .rename_column("label", "labels")
)

# Have the datasets return torch tensors.
tokenized_datasets.set_format("torch")

# Displayed as the cell output: the columns that remain on the train split.
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [None]:
# Both loaders use batches of 8 and the padding collator; only the training
# split is shuffled.
train_dataloader = torch.utils.data.DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=8,
    collate_fn=data_collator,
)
eval_dataloader = torch.utils.data.DataLoader(
    tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

In [None]:
# Load the checkpoint with a 2-label classification head (the log reports the
# classifier weights as newly initialised).
# No force_download: the cached checkpoint (~440 MB per the recorded log) is
# reused across runs, matching how the model is reloaded in the Accelerate section.
model = huggingface_transformers.AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Smoke test: pull one batch from the training loader and push it through the
# model end to end before committing to a full training run.
batch = next(iter(train_dataloader))
outputs = model(**batch)

# Because the batch carries "labels", the output includes a loss next to the logits.
print(outputs.loss, outputs.logits.shape)

tensor(0.9850, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


## The Training Loop

In [None]:
# Per-layer parameter table, 4 levels deep, rows labelled with attribute names.
# verbose=1 prints the table; verbose=2 is worth trying to inspect weight/bias
# entries in more detail (per torchinfo's documentation).
# The trailing ";" hides the repr of the returned object.
torchinfo.summary(
    model,
    col_names=["num_params", "params_percent", "trainable"],
    depth=4,
    row_settings=["var_names"],
    verbose=1,
);

Layer (type (var_name))                                                Param #                   Param %                   Trainable
BertForSequenceClassification (BertForSequenceClassification)          --                             --                   True
├─BertModel (bert)                                                     --                             --                   True
│    └─BertEmbeddings (embeddings)                                     --                             --                   True
│    │    └─Embedding (word_embeddings)                                23,440,896                 21.41%                   True
│    │    └─Embedding (position_embeddings)                            393,216                     0.36%                   True
│    │    └─Embedding (token_type_embeddings)                          1,536                       0.00%                   True
│    │    └─LayerNorm (LayerNorm)                                      1,536                       

In [None]:
# Hand-written fine-tuning loop: AdamW with a linearly decaying learning rate.
#
# torch.optim.AdamW is used because transformers.AdamW is deprecated (and absent
# from recent transformers releases). weight_decay=0.0 mirrors the default of the
# class it replaces -- torch's own default is 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
progress_bar = tqdm(range(num_training_steps))

lr_scheduler = huggingface_transformers.get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

model.train()
batch_no = 0  # total number of optimisation steps taken so far
# The scheduler and the progress bar are sized for num_epochs passes over the
# data, so the loop has to make that many passes as well.
for epoch in range(num_epochs):
    for single_batch in train_dataloader:
        # Move the batch to the same device as the model.
        single_batch = {k: v.to(device) for k, v in single_batch.items()}

        outputs_predicted = model(**single_batch)
        error_value = outputs_predicted.loss

        # backward() fills parameter.grad with d(error_value) / d(parameter) for
        # every parameter: how the prediction error responds to a change in it.
        error_value.backward()

        # optimizer.step() plays the role of the naive update
        #     parameter = parameter - parameter.grad * 0.001
        # applied to each parameter (AdamW uses a more elaborate rule).
        optimizer.step()
        optimizer.zero_grad()  # clear this batch's gradients before the next one
        lr_scheduler.step()

        progress_bar.update(1)
        batch_no += 1



  0%|          | 0/1377 [00:00<?, ?it/s]

## Evaluate

In [None]:
# Score the fine-tuned model on the validation split with the GLUE / MRPC metric.
# `evaluate` is imported in the imports cell at the top of the notebook.
metric = evaluate.load("glue", "mrpc")
model.eval()
for batch in eval_dataloader:
    # Same device handling as in the training loop.
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)  # index of the highest-scoring class
    metric.add_batch(predictions=predictions, references=batch["labels"])

# Last expression of the cell: the aggregated scores are displayed as its output.
metric.compute()

{'accuracy': 0.8651960784313726, 'f1': 0.9069373942470389}

## Accelerate

In [None]:
# Start again from the pretrained checkpoint so the Accelerate run does not
# continue from the weights trained in the previous section.
model = huggingface_transformers.AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Same recipe as the manual loop, with Accelerate wrapping the dataloaders, model
# and optimizer.
#
# torch.optim.AdamW is used because transformers.AdamW is deprecated (and absent
# from recent transformers releases). weight_decay=0.0 mirrors the default of the
# class it replaces -- torch's own default is 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

accelerator = huggingface_accelerate.Accelerator()
train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

# Sized after prepare(): the prepared dataloader is the one the loop iterates, and
# its length can differ from the original when several processes share the data.
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
progress_bar = tqdm(range(num_training_steps))
lr_scheduler = huggingface_transformers.get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)

def training_loop_DIY():
    """Fine-tune the notebook-level `model` for `num_epochs` passes over `train_dataloader`.

    Takes no arguments and returns nothing. It works on objects defined in the
    surrounding notebook cells: `model`, `train_dataloader`, `num_epochs`,
    `accelerator`, `optimizer`, `lr_scheduler` and `progress_bar`.
    """
    model.train()
    for _ in range(num_epochs):
        for batch in train_dataloader:
            # Use the batch drawn in this iteration. A leftover `single_batch`
            # global from an earlier cell must not be fed to the model here.
            outputs_predicted = model(**batch)
            error_value = outputs_predicted.loss

            # Stand-in for error_value.backward(): fills parameter.grad with
            # d(error_value) / d(parameter), i.e. how the prediction error
            # responds to a change in each parameter.
            accelerator.backward(error_value)

            # optimizer.step() plays the role of the naive update
            #     parameter = parameter - parameter.grad * 0.001
            # applied to each parameter.
            optimizer.step()
            optimizer.zero_grad()  # clear this batch's gradients before the next one
            lr_scheduler.step()
            progress_bar.update(1)
# Direct, in-process run of the loop defined above.
# NOTE(review): the notebook_launcher cell further down calls training_loop_DIY
# again on the same notebook-level objects, so executing both cells trains twice.
training_loop_DIY()



  0%|          | 0/1377 [00:00<?, ?it/s]

In [None]:
# Run the same loop through Accelerate's notebook launcher, reached via the
# `huggingface_accelerate` alias from the imports cell instead of a late import.
# NOTE(review): training_loop_DIY is also called directly in the cell that defines
# it and works on the same notebook-level objects, so this is a second training
# run on top of the first, not a fresh one.
huggingface_accelerate.notebook_launcher(training_loop_DIY)

Launching training on one CPU.


KeyboardInterrupt: 