<a href="https://colab.research.google.com/github/andygma567/AutoFreeze-experiment/blob/main/AF_tests_5_big_run.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installs and imports

In [None]:
!pip install -q transformers pytorch-lightning datasets evaluate wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m716.4/716.4 KB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 KB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 KB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# HuggingFace
import transformers
from datasets import load_dataset, DatasetDict, load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    get_linear_schedule_with_warmup,
    )
import evaluate

# PyTorch Lightning
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import (
    Timer,
    BatchSizeFinder,
    ModelCheckpoint,
    EarlyStopping,
    )
from pytorch_lightning.loggers import WandbLogger

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import AdamW
import torch.utils.data as data
from torch.utils.data import random_split, DataLoader

# For logging and checkpointing
import os
from datetime import timedelta
import wandb

# for parsing strings
import re

# numpy
import numpy as np


In [None]:
# loading takes about 2-3 min
# raw_dataset = load_dataset("amazon_polarity")
# dataset = load_dataset("yelp_review_full")

# Get the tokenized dataset off of my Google Drive

In [None]:
# Copy the pre-tokenized dataset from Google Drive into /content unless a copy
# is already there.
# NOTE(review): the cp below assumes Google Drive is mounted at /content/drive;
# no mount call is visible in this notebook - confirm before running.
if not os.path.isdir('/content/amazon_polarity_tokenized'):
    print("The tokenized data needs to be loaded first.")
    !cp -r '/content/drive/MyDrive/Colab Notebooks/AutoFreeze Experiment/amazon_polarity_tokenized' /content
else:
    print('The tokenized dataset is already loaded')

The tokenized data needs to be loaded first.


# Set up a datamodule

https://pytorch-lightning.readthedocs.io/en/latest/data/datamodule.html#lightningdatamodule-api

These are the links I used to combine data_collators with the data_loader:

*   https://huggingface.co/course/chapter3/3?fw=pt
*   https://huggingface.co/course/chapter7/2?fw=pt#a-custom-training-loop
*   https://huggingface.co/course/chapter2/5?fw=pt#padding-the-inputs

This link is to the collate function:

*   https://pytorch.org/docs/stable/data.html



In [None]:
class amazonDataModule(pl.LightningDataModule):
    """LightningDataModule for the tokenized Amazon polarity dataset.

    The raw dataset is tokenized once in ``prepare_data`` and written to
    ``<data_dir>_tokenized``; ``setup`` reads that directory back and builds
    the splits used by the dataloaders. Every dataloader collates batches with
    a Hugging Face ``DataCollatorWithPadding`` built from the same tokenizer.

    Args:
        model_checkpoint: Hugging Face checkpoint name used to load the tokenizer.
        data_dir: ``"amazon_polarity"`` to load the dataset by name with
            ``load_dataset``, otherwise a path passed to ``load_from_disk``.
        max_seq_length: maximum token length; longer inputs are truncated.
        batch_size: batch size used by all three dataloaders.
        num_labels: number of target classes; not used by the datamodule
            itself, stored as a hyperparameter for the LightningModule.
    """

    def __init__(
        self,
        model_checkpoint: str = "bert-base-cased",
        data_dir: str = "amazon_polarity",
        max_seq_length: int = 512,
        batch_size: int = 8, # 40 was found via the trainer.tune for batch_size
        num_labels: int = 2, # for passing to the lightning module later
      ):
        super().__init__()
        self.save_hyperparameters()

        self.tokenizer = AutoTokenizer.from_pretrained(self.hparams.model_checkpoint)
        self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

    @property
    def _tokenized_dir(self) -> str:
        """Directory that holds the tokenized copy of the dataset."""
        return self.hparams.data_dir + "_tokenized"

    # This is a tokenizer function to be used in the prepare_data method
    def tokenize_function(self, examples):
        """Tokenize a batch of examples as ``"<title>: <content>"`` strings.

        Args:
            examples: a batch with parallel ``'title'`` and ``'content'`` lists.

        Returns:
            The tokenizer output for the batch, truncated to ``max_seq_length``.
        """
        title_and_content = [
            title + ': ' + content
            for title, content in zip(examples['title'], examples['content'])
        ]
        return self.tokenizer(
            title_and_content,
            max_length=self.hparams.max_seq_length,
            truncation=True,
        )

    # load, tokenize, and save the tokenized dataset to disk
    # I read that it's better to tokenize on a single processor
    def prepare_data(self):
        """Tokenize the raw dataset and save it to disk, unless already done."""
        # nothing to do when the tokenized dataset already exists
        if os.path.isdir(self._tokenized_dir):
            return

        # "amazon_polarity" is loaded by name; anything else is treated as a
        # dataset that was previously saved to disk (e.g. a small sample)
        if self.hparams.data_dir == "amazon_polarity":
            raw_dataset = load_dataset(self.hparams.data_dir)
        else:
            raw_dataset = load_from_disk(self.hparams.data_dir)

        # tokenize using HuggingFace; the raw text columns are dropped afterwards
        tokenized_ds = raw_dataset.map(
            self.tokenize_function,
            batched=True,
            remove_columns=['title', 'content'])
        tokenized_ds.save_to_disk(self._tokenized_dir)

    def setup(self, stage: str):
        """Load the tokenized dataset and build the splits needed for ``stage``.

        Args:
            stage: ``"fit"`` or ``"validate"`` builds the train/validation
                split; ``"test"`` loads the test split. Other values load
                nothing.
        """
        # without the tokenized dataset there is nothing to load
        if not os.path.isdir(self._tokenized_dir):
            print("The data needs to be tokenized first.")
            return

        tokenized_ds = load_from_disk(self._tokenized_dir)

        # "validate" needs the same split as "fit"; otherwise a standalone
        # validation run would reach val_dataloader() with tokenized_val unset.
        if stage in ("fit", "validate"):
            # 20% validation / 80% train; the fixed seed keeps the split
            # identical across calls
            self.tokenized_val, self.tokenized_train = random_split(
                tokenized_ds['train'],
                [0.2, 0.8],
                generator=torch.Generator().manual_seed(42),
            )
        if stage == "test":
            self.tokenized_test = tokenized_ds['test']

    # return a dataloader and use a hugging face data collator for the collate_fn
    def train_dataloader(self):
        """Return the dataloader over the training split."""
        return DataLoader(self.tokenized_train, collate_fn=self.data_collator, batch_size=self.hparams.batch_size)

    def val_dataloader(self):
        """Return the dataloader over the validation split."""
        return DataLoader(self.tokenized_val, collate_fn=self.data_collator, batch_size=self.hparams.batch_size)

    def test_dataloader(self):
        """Return the dataloader over the test split."""
        return DataLoader(self.tokenized_test, collate_fn=self.data_collator, batch_size=self.hparams.batch_size)


## Instantiate a datamodule



In [None]:
# To experiment on a small sample instead, point data_dir at a dataset saved on disk:
# dm = amazonDataModule(data_dir="/content/sample_ds_dict")

# load the full dataset with the default settings
# (bert-base-cased tokenizer, max_seq_length=512, batch_size=8)
dm = amazonDataModule()

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# I'm using a presaved tokenized dataset so I can skip this
# dm.prepare_data()

In [None]:
# Load the tokenized dataset from disk and build the train/validation split
dm.setup(stage="fit")

# Make a Lightning Module

[PL transformers example](https://pytorch-lightning.readthedocs.io/en/latest/notebooks/lightning_examples/text-transformers.html#Transformer-LightningModule)

[HF evaluation metrics](https://huggingface.co/docs/evaluate/a_quick_tour)

It's usually good to use `.detach().cpu().numpy()` to get a NumPy array (`.numpy()` only works on CPU tensors), or `.item()` / `.tolist()` to get a Python float / list on the CPU. Here are some links regarding this:
* [7. Remove any .cuda() or .to(device) Calls](https://pytorch-lightning.readthedocs.io/en/latest/starter/converting.html#remove-any-cuda-or-to-device-calls)

[BertForSequenceClassification that I use](https://huggingface.co/docs/transformers/v4.26.0/en/model_doc/bert#transformers.BertForSequenceClassification)

[2nd example of a training loop](https://huggingface.co/course/chapter3/4?fw=pt#the-training-loop)

[PL automatic logging docs](https://pytorch-lightning.readthedocs.io/en/latest/extensions/logging.html#automatic-logging)


For now, I'm adjusting the scheduler to assume everything runs for 50 epochs for the sake of testing

In [None]:
# This model only logs metrics on the validation epoch
class pl_BERT(pl.LightningModule):
    """LightningModule wrapping a Hugging Face sequence-classification model.

    Metrics are only logged for validation: ``val_loss`` per validation step
    and accuracy / F1 / precision / recall once per validation epoch. The
    training loss is returned for optimization but not logged.

    Args:
        model_name_or_path: checkpoint passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        num_labels: number of target classes of the classification head.
        learning_rate: learning rate given to AdamW.
        warmup_steps: number of warmup steps of the linear schedule.
        weight_decay: weight decay applied to every parameter except biases
            and LayerNorm weights.
        batch_size: stored as a hyperparameter only; should match the
            datamodule's batch size (a mismatch showed an error in testing).
        **kwargs: accepted and stored as hyperparameters; otherwise unused.
    """

    def __init__(
        self,
        model_name_or_path: str,
        num_labels: int,
        learning_rate: float = 2e-5,
        # adam_epsilon: float = 1e-8,
        warmup_steps: int = 5,
        weight_decay: float = 1e-5,
        batch_size = 8, # it shows an error if the batch size doesn't match with the datamodule batch_size
        # train_batch_size: int = 32,
        # eval_batch_size: int = 32,
        **kwargs,
    ):
        super().__init__()
        self.save_hyperparameters()
        # Alternatively one can make and pass in a config object
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name_or_path,
            num_labels=self.hparams.num_labels)
        # metrics
        self.clf_metrics = self._build_metrics()

    @staticmethod
    def _build_metrics():
        """Return a fresh combined evaluator for the validation metrics."""
        return evaluate.combine(["accuracy", "f1", "precision", "recall"])

    def forward(self, **inputs):
        """Forward the keyword inputs to the wrapped Hugging Face model."""
        return self.model(**inputs)

    def training_step(self, batch, batch_idx):
        """Return the model's loss for one training batch."""
        outputs = self(**batch)
        # the training loss is intentionally not logged
        # self.log("train_loss", outputs.loss, prog_bar=True)
        return outputs.loss

    # Instantiate a new metric every validation run
    def on_validation_epoch_start(self):
        """Start every validation run with an empty evaluator."""
        self.clf_metrics = self._build_metrics()

    # compute the val_loss and load the evaluator for the epoch metrics
    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` and add this batch's predictions to the evaluator."""
        outputs = self(**batch)
        # pass everything as python lists
        preds = torch.argmax(outputs.logits, dim=-1).tolist()
        # key access works for a plain dict as well as a BatchEncoding
        refs = batch["labels"].tolist()
        # load the predictions into the evaluator
        self.clf_metrics.add_batch(references=refs, predictions=preds)

        # log the results - the default only logs val epoch loss
        # it was giving me trouble until I used logger=True with the prog_bar=True
        self.log("val_loss", outputs.loss, prog_bar=True, logger=True)

    # log or print the metrics for each validation epoch
    def on_validation_epoch_end(self):
        """Compute the accumulated metrics and log them with a ``val_`` prefix."""
        split_metrics = {
            f"val_{k}": v for k, v in self.clf_metrics.compute().items()
        }
        self.log_dict(split_metrics, prog_bar=True, logger=True)

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay)"""
        # I got this from HF/ lightning. We don't want the weight decay to change the bias or the layer normalization
        no_decay = ["bias", "LayerNorm.weight"]
        # split the parameters in a single pass, keeping their original order
        decay_params, no_decay_params = [], []
        for n, p in self.model.named_parameters():
            if any(nd in n for nd in no_decay):
                no_decay_params.append(p)
            else:
                decay_params.append(p)
        optimizer_grouped_parameters = [
            {
                "params": decay_params,
                "weight_decay": self.hparams.weight_decay,
                "name": "decay"
            },
            {
                "params": no_decay_params,
                "weight_decay": 0.0,
                "name": "no decay"
            },
        ]
        # AdamW here is the PyTorch implementation (torch.optim.AdamW)
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate) #, eps=self.hparams.adam_epsilon)

        # get the learning rate scheduler - this is from HF
        # maybe I can change this to be a torch.optim.lr_scheduler.LinearLR
        # in case I get a warning about this scheduler being deprecated
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.trainer.estimated_stepping_batches,
        )
        # scheduler = get_linear_schedule_with_warmup(
        #     optimizer,
        #     num_warmup_steps=self.hparams.warmup_steps,
        #     num_training_steps=50*13, # used for testing
        # )

        # make an lr_scheduler_config - for more precise control of the lr scheduler
        # the scheduler is stepped after every optimizer step, not every epoch
        lr_scheduler_config = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        # I choose to return a dictionary - there are many other options
        return {
            "optimizer": optimizer,
            "lr_scheduler": lr_scheduler_config
        }

## Instantiate a PL module

Later: add benchmark timing, saving / loading, and TensorBoard logging.

In [None]:
# The default max position length is 512 for BERT
# Reuse the datamodule's hyperparameters so the checkpoint, batch size and
# number of labels stay in sync between the data and the model
model = pl_BERT(dm.hparams.model_checkpoint, batch_size=dm.hparams.batch_size ,num_labels=dm.hparams.num_labels)

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

# Make Callbacks

Docs & tutorials for model checkpointing:
* https://pytorch-lightning.readthedocs.io/en/stable/common/checkpointing_basic.html#contents-of-a-checkpoint
* https://pytorch-lightning.readthedocs.io/en/stable/common/checkpointing_intermediate.html#save-checkpoints-manually
* https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.ModelCheckpoint.html#pytorch_lightning.callbacks.ModelCheckpoint

This is a stackoverflow for the logging:
https://stackoverflow.com/questions/73357742/tensorboard-is-not-creating-any-files

In [None]:
# make timer callback
# This timer will stop the training after 3 hours
timer = Timer(duration=dict(hours=3))

# checkpointing
# used this for custom formatting of the checkpointing system
# Keeps the 2 checkpoints with the best val_loss; file names carry the epoch
# and the val_loss.
# NOTE(review): the checkpoint is triggered by training step count while the
# monitored val_loss comes from validation - confirm the val_loss seen at a
# checkpoint is the one from the validation run at the same step.
ckpt_callback = ModelCheckpoint(
    monitor="val_loss",
    filename="full-ds-{epoch:02d}-{val_loss:.2f}",
    every_n_train_steps=3000, # I want this to ckpt everytime it does a validation too (the Trainer uses val_check_interval = 3000)
    save_top_k=2
    )

# The default is to save every epoch
# early stopping, patience default = 3
early_stopping = EarlyStopping('val_loss')

# Write a custom AutoFreeze Callback

When accumulating gradients, I create a zeros tensor directly on device instead of creating a tensor on CPU and then moving it to device. I learned to do this from this blog post:
https://towardsdatascience.com/7-tips-for-squeezing-maximum-performance-from-pytorch-ca4a40951259

In [None]:
from transformers.models.swin.modeling_swin import SwinSelfAttention
from pytorch_lightning.callbacks import Callback

class AutoFreeze(Callback):
    """An AutoFreeze callback that follows the paper.

    Gradients of the trainable parameters are summed over the training
    batches. Every ``freeze_step`` global steps the L1 norm of the summed
    gradients is computed per encoder layer and compared with the norms from
    the previous check. The first layer whose relative change reaches the
    ``percentile`` cut-off becomes the new start layer, and every parameter
    listed before it (embeddings included) is frozen.

    Assumptions:
        * ``start_layer`` correctly matches the state of the pl_module,
          otherwise this callback doesn't calculate correct numbers.
        * The model follows the Hugging Face naming conventions / API:
          ``pl_module.model.config.num_hidden_layers`` exists and encoder
          parameters are named ``...encoder...layer.<k>...``.
        * ``named_parameters()`` lists the parameters in forward-pass order.
        * This needs to run on 32-bit precision because in testing 16-bit
          mixed precision created inf values.

    Args:
        freeze_step: how many batch steps between freezing attempts.
        start_layer: the first layer (aka attention block) that is non-frozen.
        percentile: percentile of the per-layer change ratios used as the
            freezing cut-off (50 is the median).
    """

    # Matches the layer index in Hugging Face parameter names ("layer.<k>")
    _LAYER_PATTERN = re.compile(r"layer\.(\d+)")
    # Lower bound for the denominator of the change ratio, so a previous norm
    # of exactly 0 does not raise ZeroDivisionError
    _EPS = 1e-12

    def __init__(self, freeze_step: int=50, start_layer: int=0, percentile: float=50):
        super().__init__()
        self.freeze_step = freeze_step
        self.percentile = percentile
        # a dict to store the norms for the current accumulated gradients for comparison later
        self.prev_grad_norm_dict = None
        # tracks the Autofreeze module's progress i.e. what is the first non-frozen layer
        self.start_layer = start_layer
        # a dict to accumulate gradients as tensors
        self.grad_tensor_dict = {}

    # this hook relies on PL zeroing gradients BEFORE calling loss.backward(),
    # so the gradients of the finished batch are still available here
    # grad mode is disabled because none of this needs to be tracked by autograd
    @torch.no_grad()
    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        """Accumulate this batch's gradients and, on freeze steps, update the frozen layers."""
        step = trainer.global_step
        # Get the BERT model
        model = pl_module.model

        self._accumulate_gradients(model, pl_module.device)

        # only continue on freezing steps
        if step % self.freeze_step != 0:
            return

        start_layer = self.start_layer
        # use hugging face API to get the number of layers
        num_layers = model.config.num_hidden_layers

        cur_grad_norm_dict = self._layer_grad_norms(start_layer, num_layers)

        # reset grad_tensor_dict so the next window starts from zero
        self.grad_tensor_dict = {}

        # the first freezing step only records the norms to compare against later
        if self.prev_grad_norm_dict is None:
            self.prev_grad_norm_dict = cur_grad_norm_dict
            print()
            print('Made the prev_grad_norm_dict for the first time')
            return

        # relative change of each layer's gradient norm since the previous check
        threshold_dict = {
            layer: abs(self.prev_grad_norm_dict[layer] - cur_norm)
            / max(self.prev_grad_norm_dict[layer], self._EPS)
            for layer, cur_norm in cur_grad_norm_dict.items()
        }
        cutoff = np.percentile(list(threshold_dict.values()), self.percentile)

        # the first layer whose ratio reaches the cut-off becomes the new start layer
        new_start_layer = start_layer
        for layer, ratio in threshold_dict.items():
            if ratio >= cutoff:
                new_start_layer = layer
                break
        # store the dict of grad norms for later comparison
        self.prev_grad_norm_dict = cur_grad_norm_dict

        print('')
        print("Calculating the current start layer...")

        # If there is no start layer update then don't bother freezing anything
        if start_layer == new_start_layer:
            print("No update to the start layer this time.")
            return

        self._freeze_below(model, new_start_layer)

        print(f"New start layer: {new_start_layer}")
        self.start_layer = new_start_layer

    def _accumulate_gradients(self, model, device):
        """Add the current gradient of every trainable parameter to its running sum."""
        for name, param in model.named_parameters():
            # frozen parameters and parameters that received no gradient in
            # this step have nothing to accumulate
            if not param.requires_grad or param.grad is None:
                continue
            if name not in self.grad_tensor_dict:
                # create the accumulator directly on the device instead of on
                # CPU followed by a transfer
                self.grad_tensor_dict[name] = torch.zeros(param.shape, device=device)
            self.grad_tensor_dict[name] += param.grad

    def _layer_grad_norms(self, start_layer, num_layers):
        """Return ``{layer: L1 norm of its accumulated gradients}`` for the non-frozen encoder layers."""
        # in HF the layers start at 0 and go to n-1
        norms = {layer: 0 for layer in range(start_layer, num_layers)}
        for name, grad_sum in self.grad_tensor_dict.items():
            if "encoder" not in name:
                continue
            match = self._LAYER_PATTERN.search(name)
            if match is None:
                continue
            layer_num = int(match.group(1))
            if layer_num >= start_layer:
                # the norm is computed where the tensor lives; .item() then
                # moves only the resulting scalar to the host as a python float
                norms[layer_num] += torch.norm(grad_sum, p=1).item()
        return norms

    def _freeze_below(self, model, new_start_layer):
        """Freeze every parameter listed before the first parameter of ``new_start_layer``."""
        for name, param in model.named_parameters():
            match = self._LAYER_PATTERN.search(name)
            # stop at the first parameter of the new start layer; it and
            # everything after it stay trainable
            if match and int(match.group(1)) >= new_start_layer:
                break
            param.requires_grad = False

In [None]:
# The original paper uses AF evaluation 5 times per epoch
# I'll need to tune to find out how many steps per epoch

# at the start it's roughly 1000 steps / 5 min
# Attempt a freeze every 3000 global steps
AF = AutoFreeze(freeze_step = 3000)

# Set up WandB

* https://pytorch-lightning.readthedocs.io/en/stable/visualize/logging_intermediate.html#weights-and-biases
* https://pytorch-lightning.readthedocs.io/en/stable/extensions/generated/pytorch_lightning.loggers.WandbLogger.html#pytorch_lightning.loggers.WandbLogger

* [Stack Overflow: How to graph two metrics on one chart in wandb](https://stackoverflow.com/questions/71432453/getting-aligned-val-loss-and-train-loss-plots-for-each-epoch-using-wandb-rather)

Next time I'd like to log a few examples as I go too because it would be valuable to also see what the training data looks like.

* [PL docs on logging](https://pytorch-lightning.readthedocs.io/en/stable/extensions/logging.html)

In [None]:
# Authenticate with Weights & Biases
wandb.login()

# make a wandb logger
# No Trainer is created here: the Trainer that uses this logger is built once,
# in the "Make a trainer" section below.
wandb_logger = WandbLogger(project="BERT", log_model="all")

# log gradients and model topology
wandb_logger.watch(model)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mandygma567[0m. Use [1m`wandb login --relogin`[0m to force relogin


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


# Make a trainer

It seems like I don't need to remake the optimizers every time?

https://discuss.pytorch.org/t/passing-to-the-optimizers-frozen-parameters/83358

I only checked it by inspecting a frozen layer and an unfrozen layer and it seems to work as expected...

In [None]:
# This callback creates Keras-like model summaries
# PyTorch doesn't have this feature built in, and one of the third-party packages
# that provides it has gone through name changes, so it can be tricky to find online
from pytorch_lightning.callbacks import ModelSummary

# The profiler - 'simple' appears more helpful for finding bottlenecks I think

In [None]:
# get a lightning trainer
# Training stops at whichever comes first: max_steps, the Timer limit, or early stopping.
trainer = Trainer(
    # fast_dev_run=False,
    # limit_val_batches=0, # set to 0 to skip validation
    accelerator="auto", # automatically detects which devices are available, the number of devices will be inferred
    callbacks=[
        AF, # AutoFreeze, attempts a freeze every 3000 steps
        timer,
        ckpt_callback,
        early_stopping,
        BatchSizeFinder()
        ],
    enable_checkpointing=True, # I made a custom model ckpt callback
    deterministic=False,
    # profiler="simple",
    val_check_interval = 3000, # same interval as the checkpoint and AutoFreeze steps
    # log_every_n_steps=2, # default is 50 - I think I need to manually log stuff in order to monitor with callbacks too
    logger=wandb_logger, # the wandb logger
    precision='32-true', # full 32-bit precision, not mixed - see the notes below
    max_steps=7000,
    )

# I'm getting inf grad values at ~50 steps with 16-mixed precision
# bf16 doesn't always work with the connected gpu
# using 32-true bit precision fixed the inf values at ~50 steps it seems...

# I'll do a more complete run soon

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


# Use the trainer to fit the model

In [None]:
# It appears to run once without crashing immediately...
# Fit the model; the datamodule supplies the train and validation dataloaders
trainer.fit(model, datamodule=dm)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_steps=3` reached.
INFO:pytorch_lightning.utilities.rank_zero:Batch size 2 succeeded, trying batch size 4
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_steps=3` reached.
INFO:pytorch_lightning.utilities.rank_zero:Batch size 4 succeeded, trying batch size 8
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_steps=3` reached.
INFO:pytorch_lightning.utilities.rank_zero:Batch size 8 succeeded, trying batch size 16
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_steps=3` reached.
INFO:pytorch_lightning.utilities.rank_zero:Batch size 16 succeeded, tryin

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]


Made the prev_grad_norm_dict for the first time


Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Time limit reached. Elapsed time is 1:00:00. Signaling Trainer to stop.


Validation: 0it [00:00, ?it/s]

# Post training

In [None]:
# Print the training time recorded by the Timer callback as h:mm:ss
print(timedelta(seconds=timer.time_elapsed("train")))
# Results noted from earlier runs:
# with the AF freezes up to layer 0 and takes - 0:06:01.525987 and loss=0.671, v_num=8, train_loss=0.642

# without AF - 0:05:46.667911 and loss=0.703, v_num=7, train_loss=0.764

1:00:01.823331


In [None]:
# The first non-frozen layer after training (0 means nothing was frozen)
print(AF.start_layer)

0


In [None]:
# Per-layer L1 norms of the accumulated gradients from the most recent freezing step
AF.prev_grad_norm_dict

{0: 105944.37242457802,
 1: 128932.15504684516,
 2: 131704.10280211916,
 3: 130515.74319048869,
 4: 135611.10809192207,
 5: 141756.26337356842,
 6: 148263.15966993722,
 7: 129656.36796277296,
 8: 110606.47417370556,
 9: 67624.59483582985,
 10: 44390.21681899173,
 11: 37093.85985938202}

In [None]:
# This wraps up the wandb session
# (run this once training and any post-training logging are done)
wandb.finish()


0,1
epoch,▁▁▁
trainer/global_step,▁▅█
val_accuracy,▁██
val_f1,▁██
val_loss,█▁▁
val_precision,▁██
val_recall,▁▁▁

0,1
epoch,0.0
trainer/global_step,5555.0
val_accuracy,1.0
val_f1,1.0
val_loss,0.03194
val_precision,1.0
val_recall,1.0


In [None]:
# for n,p in model.model.named_parameters():
#     if p.requires_grad:
#         print(n)