<a href="https://colab.research.google.com/github/andygma567/AutoFreeze-experiment/blob/main/AF_tests_4_profiling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installs and imports

In [None]:
!pip install -q transformers pytorch-lightning datasets evaluate wandb

In [None]:
# HuggingFace
import transformers
from datasets import load_dataset, DatasetDict, load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    get_linear_schedule_with_warmup,
    )
import evaluate

# PyTorch Lightning
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import (
    Timer,
    BatchSizeFinder,
    ModelCheckpoint,
    )
from pytorch_lightning.loggers import WandbLogger

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import AdamW
import torch.utils.data as data
from torch.utils.data import random_split, DataLoader

# For logging and checkpointing
import os
from datetime import timedelta
import wandb

# for parsing strings
import re

# numpy
import numpy as np


In [None]:
# Load the "amazon_polarity" dataset with Hugging Face `datasets`.
# loading takes about 2-3 min
raw_dataset = load_dataset("amazon_polarity")
# Alternative dataset (currently unused):
# dataset = load_dataset("yelp_review_full")



  0%|          | 0/2 [00:00<?, ?it/s]

# Make a sample dataset

In [None]:
# Build a small train/test sample (first 100 / first 20 rows) so data can be
# pushed through the pipeline quickly, then persist it to disk.
sample_ds_dict = DatasetDict(
    train=raw_dataset['train'].select(range(100)),
    test=raw_dataset['test'].select(range(20)),
)
# print(sample_ds_dict)
sample_ds_dict.save_to_disk("sample_ds_dict")

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20 [00:00<?, ? examples/s]

# Set up a datamodule

https://pytorch-lightning.readthedocs.io/en/latest/data/datamodule.html#lightningdatamodule-api

These are the links I used to combine data_collators with the data_loader:

*   https://huggingface.co/course/chapter3/3?fw=pt
*   https://huggingface.co/course/chapter7/2?fw=pt#a-custom-training-loop
*   https://huggingface.co/course/chapter2/5?fw=pt#padding-the-inputs

This link is to the collate function:

*   https://pytorch.org/docs/stable/data.html



In [None]:
class amazonDataModule(pl.LightningDataModule):
    """Lightning datamodule that tokenizes title/content reviews for a HF model.

    The raw dataset is either downloaded from the Hugging Face Hub (when
    ``data_dir == "amazon_polarity"``) or loaded from a directory previously
    written with ``save_to_disk``. The tokenized dataset is cached on disk in
    ``<data_dir>_tokenized`` and read back in ``setup``.

    Args:
        model_checkpoint: Name/path of the checkpoint whose tokenizer is used.
        data_dir: ``"amazon_polarity"`` or a path to a dataset saved on disk.
        max_seq_length: Inputs longer than this many tokens are truncated.
        batch_size: Batch size used by all dataloaders.
        num_labels: Stored as a hyperparameter so it can be handed to the
            LightningModule later; not used by the datamodule itself.
    """

    def __init__(
        self,
        model_checkpoint: str = "bert-base-cased",
        data_dir: str = "amazon_polarity",
        max_seq_length: int = 512,
        batch_size: int = 8, # 40 was found via the trainer.tune for batch_size
        num_labels: int = 2, # for passing to the lightning module later
      ):
        super().__init__()
        self.save_hyperparameters()

        self.tokenizer = AutoTokenizer.from_pretrained(self.hparams.model_checkpoint)
        # The HF collator is used as the DataLoader collate_fn
        self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

    @property
    def _tokenized_dir(self) -> str:
        """Directory where the tokenized dataset is cached."""
        return self.hparams.data_dir + "_tokenized"

    def tokenize_function(self, examples):
        """Tokenize a batch of examples as "<title>: <content>" strings.

        Used by ``prepare_data`` through ``Dataset.map(..., batched=True)``.
        """
        title_and_content = [
            title + ': ' + content
            for title, content in zip(examples['title'], examples['content'])
        ]
        return self.tokenizer(
            title_and_content,
            max_length=self.hparams.max_seq_length,
            truncation=True,
        )

    # load, tokenize, and save the tokenized dataset to disk
    # I read that it's better to tokenize on a single processor
    def prepare_data(self):
        """Load the raw dataset, tokenize it and save the result to disk."""
        # check if the dataset is from the Huggingface or if this is a sample dataset
        if self.hparams.data_dir == "amazon_polarity":
            raw_dataset = load_dataset(self.hparams.data_dir)
        else:
            raw_dataset = load_from_disk(self.hparams.data_dir)

        # tokenize using HuggingFace; the raw text columns are dropped afterwards
        tokenized_ds = raw_dataset.map(
            self.tokenize_function,
            batched=True,
            remove_columns=['title', 'content'])
        tokenized_ds.save_to_disk(self._tokenized_dir)

    def setup(self, stage: str):
        """Load the tokenized dataset and assign the splits needed for ``stage``.

        ``"fit"`` and ``"validate"`` both build the train/val split, so a
        validate-only run also has ``tokenized_val`` available. The split is
        seeded, so both stages see the same partition. ``"test"`` assigns the
        test split. If the tokenized dataset is missing, a message is printed
        and nothing is assigned.
        """
        # check if a tokenized dataset already exists
        if not os.path.isdir(self._tokenized_dir):
            print("The data needs to be tokenized first.")
            return

        # load the tokenized dataset from disk
        tokenized_ds = load_from_disk(self._tokenized_dir)

        # Assign train/val datasets for use in dataloaders (20% val / 80% train)
        if stage in ("fit", "validate"):
            self.tokenized_val, self.tokenized_train = random_split(
                tokenized_ds['train'],
                [0.2, 0.8],
                generator=torch.Generator().manual_seed(42),
            )
        # Assign test dataset for use in dataloader(s)
        if stage == "test":
            self.tokenized_test = tokenized_ds['test']

    def _make_dataloader(self, dataset):
        """Wrap ``dataset`` in a DataLoader that collates with the HF data collator."""
        return DataLoader(
            dataset,
            collate_fn=self.data_collator,
            batch_size=self.hparams.batch_size,
        )

    # return a dataloader and use a hugging face data collator for the collate_fn
    def train_dataloader(self):
        return self._make_dataloader(self.tokenized_train)

    def val_dataloader(self):
        return self._make_dataloader(self.tokenized_val)

    def test_dataloader(self):
        return self._make_dataloader(self.tokenized_test)


## Instantiate a datamodule



In [None]:
# Point the datamodule at the sample dataset written by save_to_disk("sample_ds_dict").
# A relative path is used so this also works when the working directory is not
# Colab's /content.
dm = amazonDataModule(data_dir="sample_ds_dict")

In [None]:
# Tokenize the dataset and save it to "<data_dir>_tokenized" on disk
dm.prepare_data()



Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20 [00:00<?, ? examples/s]

In [None]:
# Load the tokenized dataset from disk and create the train/val split
dm.setup(stage="fit")

# Make a Lightning Module

[PL transformers example](https://pytorch-lightning.readthedocs.io/en/latest/notebooks/lightning_examples/text-transformers.html#Transformer-LightningModule)

[HF evaluation metrics](https://huggingface.co/docs/evaluate/a_quick_tour)

It's usually good to use `.detach().cpu().numpy()` to get a numpy array, or `.item()` / `.tolist()` to get a Python float / list on the CPU. Here are some links regarding this:
* [7. Remove any .cuda() or .to(device) Calls](https://pytorch-lightning.readthedocs.io/en/latest/starter/converting.html#remove-any-cuda-or-to-device-calls)

[BertForSequenceClassification that I use](https://huggingface.co/docs/transformers/v4.26.0/en/model_doc/bert#transformers.BertForSequenceClassification)

[2nd example of a training loop](https://huggingface.co/course/chapter3/4?fw=pt#the-training-loop)

[PL automatic logging docs](https://pytorch-lightning.readthedocs.io/en/latest/extensions/logging.html#automatic-logging)


For now, I'm adjusting the scheduler to assume everything runs for 50 epochs for the sake of testing

In [None]:
# This model only logs metrics on the validation epoch
class pl_BERT(pl.LightningModule):
    """LightningModule wrapping a HF sequence-classification model.

    Training returns the loss computed by the HF model. Validation logs
    ``val_loss`` per step and accuracy / F1 / precision / recall (prefixed with
    ``val_``) at the end of every validation epoch.

    Args:
        model_name_or_path: Checkpoint passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        num_labels: Number of classes of the classification head.
        learning_rate: Learning rate for AdamW.
        warmup_steps: Number of warmup steps of the linear schedule.
        weight_decay: Weight decay applied to all parameters except biases
            and LayerNorm weights.
        batch_size: Only stored as a hyperparameter.
        **kwargs: Accepted and stored as hyperparameters; otherwise unused.
    """

    def __init__(
        self,
        model_name_or_path: str,
        num_labels: int,
        learning_rate: float = 2e-5,
        # adam_epsilon: float = 1e-8,
        warmup_steps: int = 5,
        weight_decay: float = 1e-5,
        batch_size = 8, # it shows an error if the batch size doesn't match with the datamodule batch_size
        # train_batch_size: int = 32,
        # eval_batch_size: int = 32,
        **kwargs,
    ):
        super().__init__()
        self.save_hyperparameters()
        # Alternatively one can make and pass in a config object
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name_or_path,
            num_labels=self.hparams.num_labels)
        # metrics
        self.clf_metrics = self._new_metrics()

    @staticmethod
    def _new_metrics():
        """Create a fresh combined evaluator for the validation metrics."""
        return evaluate.combine(["accuracy", "f1", "precision", "recall"])

    def forward(self, **inputs):
        """Forward the keyword inputs to the wrapped HF model."""
        return self.model(**inputs)

    def training_step(self, batch, batch_idx):
        """Return the loss the HF model computes for the batch."""
        outputs = self(**batch)
        loss = outputs.loss
        # log the losses
        # self.log("train_loss", loss, prog_bar=True)
        return loss

    # Instantiate a new metric every validation run
    def on_validation_epoch_start(self):
        self.clf_metrics = self._new_metrics()

    # compute the val_loss and load the evaluator for the epoch metrics
    def validation_step(self, batch, batch_idx):
        outputs = self(**batch)
        # pass everything as python lists
        preds = torch.argmax(outputs.logits, dim=-1).tolist()
        # key lookup works for both plain dicts and HF batch objects
        refs = batch["labels"].tolist()
        # load the predictions into the evaluator
        self.clf_metrics.add_batch(references=refs, predictions=preds)
        # log the results - the default only logs val epoch loss
        self.log("val_loss", outputs.loss, prog_bar=True)
        # no returns needed

    # log or print the metrics for each validation epoch
    def on_validation_epoch_end(self):
        split_metrics = {
            f"val_{k}": v for k, v in self.clf_metrics.compute().items()
        }
        self.log_dict(split_metrics, prog_bar=True, logger=True)

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay)"""
        # I got this from HF/ lightning. We don't want the weight decay to change the bias or the layer normalization
        no_decay = ["bias", "LayerNorm.weight"]

        def is_no_decay(param_name):
            return any(nd in param_name for nd in no_decay)

        named_params = list(self.model.named_parameters())
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in named_params if not is_no_decay(n)],
                "weight_decay": self.hparams.weight_decay,
                "name": "decay"
            },
            {
                "params": [p for n, p in named_params if is_no_decay(n)],
                "weight_decay": 0.0,
                "name": "no decay"
            },
        ]
        # PyTorch AdamW (imported from torch.optim)
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate) #, eps=self.hparams.adam_epsilon)

        # get the learning rate scheduler - this is from HF
        # maybe I can change this to be a torch.optim.lr_scheduler.LinearLR
        # in case I get a warning about this scheduler being deprecated
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.trainer.estimated_stepping_batches,
        )
        # scheduler = get_linear_schedule_with_warmup(
        #     optimizer,
        #     num_warmup_steps=self.hparams.warmup_steps,
        #     num_training_steps=50*13, # used for testing
        # )

        # make an lr_scheduler_config - for more precise control of the lr scheduler
        # the scheduler is stepped after every optimizer step
        lr_scheduler_config = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        # I choose to return a dictionary - there are many other options
        return {
            "optimizer": optimizer,
            "lr_scheduler": lr_scheduler_config
        }

## Instantiate a PL module

TODO (later): add benchmark timing, saving / loading, and TensorBoard logging.

In [None]:
# The default max position length is 512 for BERT
# Build the module from the datamodule's hyperparameters so both stay in sync
model = pl_BERT(
    model_name_or_path=dm.hparams.model_checkpoint,
    num_labels=dm.hparams.num_labels,
    batch_size=dm.hparams.batch_size,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

# Make Callbacks

Docs & tutorials for model checkpointing:
* https://pytorch-lightning.readthedocs.io/en/stable/common/checkpointing_basic.html#contents-of-a-checkpoint
* https://pytorch-lightning.readthedocs.io/en/stable/common/checkpointing_intermediate.html#save-checkpoints-manually
* https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.ModelCheckpoint.html#pytorch_lightning.callbacks.ModelCheckpoint

This is a stackoverflow for the logging:
https://stackoverflow.com/questions/73357742/tensorboard-is-not-creating-any-files

In [None]:
# Timer callback: stops the training after 4 hours
timer = Timer(duration=timedelta(hours=4))

# Checkpoint callback that monitors val_loss and uses a custom filename pattern.
# The default is to save every epoch
ckpt_callback = ModelCheckpoint(
    filename="sample-ds-{epoch:02d}-{val_loss:.2f}",
    monitor="val_loss",
)

# Write a custom AutoFreeze Callback

When accumulating gradients, I create a zeros tensor directly on device instead of creating a tensor on CPU and then moving it to device. I learned to do this from this blog post:
https://towardsdatascience.com/7-tips-for-squeezing-maximum-performance-from-pytorch-ca4a40951259

In [None]:
from transformers.models.swin.modeling_swin import SwinSelfAttention
from pytorch_lightning.callbacks import Callback

# An AutoFreeze callback that follows the paper.
# freeze_step is how many batch steps pass between freezing attempts.
# start_layer is the first layer (aka attn block) that is non-frozen.
#
# Assumptions:
# * start_layer correctly matches the pl_module, otherwise this callback doesn't
#   calculate correct numbers.
# * The model follows the Hugging Face naming conventions / API
#   ("...encoder.layer.<N>..." parameter names and `config.num_hidden_layers`).
# * named_parameters() lists the parameters in forward-pass order (embeddings,
#   then layer 0, 1, ...); the freezing loop relies on this.
class AutoFreeze(Callback):
    """Freeze the lower encoder layers once their gradients stop changing.

    After every training batch the gradients of the still-trainable encoder
    layers are accumulated. Every ``freeze_step`` global steps the per-layer L1
    norm of the accumulated gradients is compared with the one from the
    previous interval. The first layer whose relative change reaches the
    ``percentile`` of all relative changes becomes the new start layer, and
    every parameter listed before it (embeddings included) is frozen.

    Args:
        freeze_step: Number of global steps between freezing attempts.
        start_layer: Index of the first non-frozen encoder layer.
        percentile: Percentile (0-100) of the per-layer change ratios used as
            the freezing threshold.
    """

    # HF naming convention for encoder blocks, e.g. "bert.encoder.layer.3.attention..."
    _LAYER_RE = re.compile(r"layer\.(\d+)")
    # Floor for the denominator of the change ratio, so a previous norm of 0
    # gives a finite ratio instead of raising ZeroDivisionError.
    _EPS = 1e-12

    def __init__(self, freeze_step: int=50, start_layer: int=0, percentile: float=50):
        super().__init__()
        self.freeze_step = freeze_step
        self.percentile = percentile
        # a dict to store the norms for the current accumulated gradients for comparison later
        self.prev_grad_norm_dict = None
        # tracks the Autofreeze module's progress i.e. what is the first non-frozen layer
        self.start_layer = start_layer
        # a dict to accumulate gradients as tensors (parameter name -> summed gradient)
        self.grad_tensor_dict = {}

    @classmethod
    def _encoder_layer_index(cls, name):
        """Return the encoder layer index encoded in a parameter name, else None."""
        if "encoder" not in name:
            return None
        match = cls._LAYER_RE.search(name)
        return int(match.group(1)) if match else None

    def _accumulate_gradients(self, model):
        """Add the current gradients of the tracked encoder layers to the running sums.

        Only trainable encoder-layer parameters at or above ``start_layer`` are
        tracked, because only those enter the norm computation. Parameters
        without a gradient are skipped.
        """
        for name, param in model.named_parameters():
            if not param.requires_grad or param.grad is None:
                continue
            layer_num = self._encoder_layer_index(name)
            if layer_num is None or layer_num < self.start_layer:
                continue
            accumulated = self.grad_tensor_dict.get(name)
            if accumulated is None:
                # allocate directly on the gradient's device
                accumulated = torch.zeros_like(param.grad, dtype=torch.float32)
                self.grad_tensor_dict[name] = accumulated
            accumulated += param.grad

    def _layer_grad_norms(self, num_layers):
        """Return {layer index: L1 norm of its accumulated gradients}.

        Keys cover ``start_layer`` .. ``num_layers - 1`` (HF layers are 0-based).
        """
        norms = {k: 0.0 for k in range(self.start_layer, num_layers)}
        for name, accumulated in self.grad_tensor_dict.items():
            layer_num = self._encoder_layer_index(name)
            if layer_num in norms:
                # L1 norm computed on the tensor's own device; .item() makes it a python float
                norms[layer_num] += accumulated.abs().sum().item()
        return norms

    def _freeze_before(self, model, new_start_layer):
        """Freeze every parameter listed before encoder layer ``new_start_layer``.

        Relies on named_parameters() being in forward-pass order. The stored
        gradient of a frozen parameter is dropped so that an optimizer that
        still holds the parameter skips it instead of reusing a stale or zeroed
        gradient.
        """
        for name, param in model.named_parameters():
            match = self._LAYER_RE.search(name)
            if match and int(match.group(1)) >= new_start_layer:
                break
            param.requires_grad = False
            param.grad = None

    # this hook works because PL zeros gradients BEFORE calling loss.backward()
    # I'm trying out disabling grad mode for these computations - appears to compile and run
    @torch.no_grad()
    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        """Accumulate this batch's gradients and, on a freezing step, try to freeze layers."""
        # Get the BERT model
        model = pl_module.model

        # Accumulate gradient vector for this batch
        self._accumulate_gradients(model)

        # check if this is a freezing step
        if trainer.global_step % self.freeze_step != 0:
            return

        # use hugging face API to get the number of layers
        cur_grad_norm_dict = self._layer_grad_norms(model.config.num_hidden_layers)

        # reset grad_tensor_dict
        self.grad_tensor_dict = {}

        # store the dict of grad norms for later comparison
        prev_grad_norm_dict = self.prev_grad_norm_dict
        self.prev_grad_norm_dict = cur_grad_norm_dict

        # Nothing to compare against on the first freezing step, and nothing
        # left to freeze when no layer is tracked.
        if prev_grad_norm_dict is None or not cur_grad_norm_dict:
            return

        # Relative change of the gradient norm per layer
        threshold_dict = {
            layer: abs(prev_grad_norm_dict[layer] - cur_norm)
            / max(prev_grad_norm_dict[layer], self._EPS)
            for layer, cur_norm in cur_grad_norm_dict.items()
        }
        # check which layer to freeze up to
        threshold = np.percentile(list(threshold_dict.values()), self.percentile)

        # Find the first layer whose ratio reaches the percentile threshold
        new_start_layer = next(
            (layer for layer, ratio in threshold_dict.items() if ratio >= threshold),
            self.start_layer,
        )

        # If there is no start layer update then don't bother freezing anything
        if new_start_layer == self.start_layer:
            return

        # Otherwise freeze everything, including embeddings up to the start layer
        self._freeze_before(model, new_start_layer)

        print(f"New start layer: {new_start_layer}")
        self.start_layer = new_start_layer



In [None]:
# Attempt a freeze every 5 global steps instead of the default 50
AF = AutoFreeze(freeze_step = 5)

# Make a trainer

It seems like I don't need to remake the optimizers every time?

https://discuss.pytorch.org/t/passing-to-the-optimizers-frozen-parameters/83358

I only checked it by inspecting a frozen layer and an unfrozen layer and it seems to work as expected...

In [None]:
# This callback creates Keras-like model summaries
# PyTorch doesn't have this feature and the one of the other 3rd party packages
# has gone through name changes so it can be tricky to find online
from pytorch_lightning.callbacks import ModelSummary

# The profiler - 'simple' appears more helpful for finding bottlenecks I think


In [None]:
# get a lightning trainer
# Short run (20 steps) with the AutoFreeze and Timer callbacks; ckpt_callback is
# not passed here and checkpointing is switched off.
trainer = Trainer(
    # fast_dev_run=False,
    # limit_train_batches = 2,
    limit_val_batches=0, # NOTE(review): per the Lightning Trainer docs 0 disables validation, so pl_BERT's val_* metrics are not logged in this run - confirm this is intended
    max_steps=20,
    accelerator="auto", # automatically detects which devices are available, the number of devices will be inferred
    callbacks=[AF, timer],
    enable_checkpointing=False,
    deterministic=False,
    # profiler="simple",
    # auto_scale_batch_size=True, # runs an initial batch size finder
    # check_val_every_n_epoch=1, # default is 1
    # log_every_n_steps=2, # default is 50
    # logger=wandb_logger, # the wandb logger
    # precision=16, # for mixed precision, default is 32
    # max_epochs=3, # default is 1000
    )

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


# Use the trainer to fit the model

In [None]:
# Train the model on the datamodule.
# It appears to run once without crashing immediately...
trainer.fit(model, datamodule=dm)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20 [00:00<?, ? examples/s]

INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 108 M 
--------------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
433.247   Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_steps=20` reached.


In [None]:
# Print the training time measured by the Timer callback
print(timedelta(seconds=timer.time_elapsed("train")))
# Recorded results:
# with AF (start layer stayed at 0) - 0:06:01.525987 and loss=0.671, v_num=8, train_loss=0.642

# without AF - 0:05:46.667911 and loss=0.703, v_num=7, train_loss=0.764

0:06:01.525987


In [None]:
# First non-frozen layer tracked by the callback (it was created with the default 0)
print(AF.start_layer)

0


In [None]:
# Per-layer L1 norms of the gradients accumulated before the most recent freezing step
AF.prev_grad_norm_dict

{0: 901.0577737448461,
 1: 1057.5339045126689,
 2: 1145.3708180816134,
 3: 1238.3236778440748,
 4: 1321.705404381701,
 5: 1303.9866794209997,
 6: 1379.57144509617,
 7: 1493.8512396059134,
 8: 1767.5263754330672,
 9: 1827.9177428937778,
 10: 1681.587187383795,
 11: 1453.1298119470734}

In [None]:
# Print the names of all parameters that are still trainable
for n, p in model.model.named_parameters():
    if not p.requires_grad:
        continue
    print(n)

bert.embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.weight
bert.embeddings.LayerNorm.bias
bert.encoder.layer.0.attention.self.query.weight
bert.encoder.layer.0.attention.self.query.bias
bert.encoder.layer.0.attention.self.key.weight
bert.encoder.layer.0.attention.self.key.bias
bert.encoder.layer.0.attention.self.value.weight
bert.encoder.layer.0.attention.self.value.bias
bert.encoder.layer.0.attention.output.dense.weight
bert.encoder.layer.0.attention.output.dense.bias
bert.encoder.layer.0.attention.output.LayerNorm.weight
bert.encoder.layer.0.attention.output.LayerNorm.bias
bert.encoder.layer.0.intermediate.dense.weight
bert.encoder.layer.0.intermediate.dense.bias
bert.encoder.layer.0.output.dense.weight
bert.encoder.layer.0.output.dense.bias
bert.encoder.layer.0.output.LayerNorm.weight
bert.encoder.layer.0.output.LayerNorm.bias
bert.encoder.layer.1.attention.self.query.weight
bert.enc