<a href="https://colab.research.google.com/github/andygma567/AutoFreeze-experiment/blob/main/WandB_Benchmarking_for_AutoFreeze.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installs and imports

In [None]:
!pip install -q transformers pytorch-lightning datasets evaluate
# !pip install -q pytorch-lightning
# !pip install -q datasets
# !pip install -q evaluate
! pip install -q wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m826.2/826.2 KB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m517.2/517.2 KB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# HuggingFace
import transformers
from datasets import load_dataset, DatasetDict, load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    get_linear_schedule_with_warmup,
    )
import evaluate

# PyTorch Lightning
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import (
    Timer,
    BatchSizeFinder,
    ModelCheckpoint,
    )
from pytorch_lightning.loggers import WandbLogger

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import AdamW
import torch.utils.data as data
from torch.utils.data import random_split, DataLoader

# For logging and checkpointing
import os
from datetime import timedelta
import wandb


In [None]:
# Download the Amazon polarity reviews dataset (or reuse the local HF cache).
# The first load takes about 2-3 minutes.
raw_dataset = load_dataset("amazon_polarity")
# Alternative dataset, left disabled:
# dataset = load_dataset("yelp_review_full")

Downloading builder script:   0%|          | 0.00/4.11k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.68k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.64k [00:00<?, ?B/s]

Downloading and preparing dataset amazon_polarity/amazon_polarity to /root/.cache/huggingface/datasets/amazon_polarity/amazon_polarity/3.0.0/a27b32b7e7b88eb274a8fa8ba0f654f1fe998a87c22547557317793b5d2772dc...


Downloading data:   0%|          | 0.00/688M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3600000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/400000 [00:00<?, ? examples/s]

Dataset amazon_polarity downloaded and prepared to /root/.cache/huggingface/datasets/amazon_polarity/amazon_polarity/3.0.0/a27b32b7e7b88eb274a8fa8ba0f654f1fe998a87c22547557317793b5d2772dc. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

# Make a sample dataset

In [None]:
# Carve a tiny train/test sample out of the full dataset so the whole pipeline
# can be exercised quickly, then persist it for the datamodule to load.
train_sample = raw_dataset['train'].select(range(100))
test_sample = raw_dataset['test'].select(range(20))

sample_ds_dict = DatasetDict({'train': train_sample, 'test': test_sample})
print(sample_ds_dict)
sample_ds_dict.save_to_disk("sample_ds_dict")

DatasetDict({
    train: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 100
    })
    test: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 20
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20 [00:00<?, ? examples/s]

# Set up a datamodule

https://pytorch-lightning.readthedocs.io/en/latest/data/datamodule.html#lightningdatamodule-api

These are the links I used to combine data_collators with the data_loader:

*   https://huggingface.co/course/chapter3/3?fw=pt
*   https://huggingface.co/course/chapter7/2?fw=pt#a-custom-training-loop
*   https://huggingface.co/course/chapter2/5?fw=pt#padding-the-inputs

This link is to the collate function:

*   https://pytorch.org/docs/stable/data.html



In [None]:
class amazonDataModule(pl.LightningDataModule):
    """Tokenize a review dataset once and serve it through padded dataloaders.

    Each example's ``title`` and ``content`` are joined into a single string
    and tokenized with the tokenizer belonging to ``model_checkpoint``. The
    tokenized dataset is written to ``data_dir + "_tokenized"`` by
    ``prepare_data`` and read back from there by ``setup``.

    Args:
        model_checkpoint: Name or path given to ``AutoTokenizer.from_pretrained``.
        data_dir: ``"amazon_polarity"`` loads that dataset with ``load_dataset``;
            any other value is treated as a directory for ``load_from_disk``.
        max_seq_length: Sequences are truncated to this many tokens.
        batch_size: Batch size used by the train, validation and test loaders.
            40 was found via ``trainer.tune`` for the batch size.
        num_labels: Not used by the datamodule itself; kept in ``hparams`` so it
            can be passed to the LightningModule later.
    """

    def __init__(
        self,
        model_checkpoint: str = "bert-base-cased",
        data_dir: str = "amazon_polarity",
        max_seq_length: int = 512,
        batch_size: int = 40,
        num_labels: int = 2,
    ):
        super().__init__()
        self.save_hyperparameters()

        self.tokenizer = AutoTokenizer.from_pretrained(self.hparams.model_checkpoint)
        # Pads every batch to the length of its longest sequence at collate time.
        self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

    @property
    def _tokenized_dir(self) -> str:
        """Directory where the tokenized dataset is written and read back."""
        return self.hparams.data_dir + "_tokenized"

    def _make_loader(self, dataset, shuffle: bool = False):
        """Build a DataLoader over ``dataset`` that pads with the HF collator."""
        return DataLoader(
            dataset,
            collate_fn=self.data_collator,
            batch_size=self.hparams.batch_size,
            shuffle=shuffle,
        )

    def tokenize_function(self, examples):
        """Tokenize a batch of examples as ``"<title>: <content>"`` strings.

        Args:
            examples: A batch with parallel ``'title'`` and ``'content'`` lists.

        Returns:
            The tokenizer output, truncated to ``max_seq_length`` tokens.
        """
        title_and_content = [
            title + ': ' + content
            for title, content in zip(examples['title'], examples['content'])
        ]
        return self.tokenizer(
            title_and_content,
            max_length=self.hparams.max_seq_length,
            truncation=True,
        )

    def prepare_data(self):
        """Load the raw dataset, tokenize it, and save the result to disk."""
        # "amazon_polarity" is loaded through load_dataset; anything else is
        # treated as a dataset directory previously written by save_to_disk.
        if self.hparams.data_dir == "amazon_polarity":
            raw_dataset = load_dataset(self.hparams.data_dir)
        else:
            raw_dataset = load_from_disk(self.hparams.data_dir)

        # The text columns are dropped once they have been tokenized.
        tokenized_ds = raw_dataset.map(
            self.tokenize_function,
            batched=True,
            remove_columns=['title', 'content'],
        )
        tokenized_ds.save_to_disk(self._tokenized_dir)

    def setup(self, stage: str):
        """Load the tokenized dataset and assign the splits needed for ``stage``.

        Args:
            stage: ``"fit"`` or ``"validate"`` builds the train/validation split,
                ``"test"`` assigns the test set, and ``None`` does both.

        Raises:
            FileNotFoundError: If the tokenized dataset directory does not exist,
                i.e. ``prepare_data`` has not been run for this ``data_dir``.
        """
        if not os.path.isdir(self._tokenized_dir):
            # Fail here with a clear message instead of letting a dataloader
            # hit a missing attribute later on.
            raise FileNotFoundError(
                "The data needs to be tokenized first. "
                f"Call prepare_data() to create '{self._tokenized_dir}'."
            )

        tokenized_ds = load_from_disk(self._tokenized_dir)

        # The validation split is also required when only validating.
        if stage in ("fit", "validate", None):
            # 20% validation / 80% train; the fixed seed keeps the split stable.
            self.tokenized_val, self.tokenized_train = random_split(
                tokenized_ds['train'],
                [0.2, 0.8],
                generator=torch.Generator().manual_seed(42),
            )
        if stage in ("test", None):
            self.tokenized_test = tokenized_ds['test']

    def train_dataloader(self):
        """Training loader; reshuffled each epoch so batches differ between epochs."""
        return self._make_loader(self.tokenized_train, shuffle=True)

    def val_dataloader(self):
        """Validation loader, iterated in a fixed order."""
        return self._make_loader(self.tokenized_val)

    def test_dataloader(self):
        """Test loader, iterated in a fixed order."""
        return self._make_loader(self.tokenized_test)


## Instantiate a datamodule



In [None]:
# Point the datamodule at the sample dataset written by
# `sample_ds_dict.save_to_disk("sample_ds_dict")`. The same relative path is
# used here so the notebook does not depend on Colab's /content directory.
dm = amazonDataModule(data_dir="sample_ds_dict")

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# Tokenize the raw dataset and write the result to "<data_dir>_tokenized" on disk.
dm.prepare_data()

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20 [00:00<?, ? examples/s]

In [None]:
# Load the tokenized dataset from disk and build the train/validation splits.
dm.setup(stage="fit")
# Optional smoke test of the resulting dataloader (this appears to work):
# print(dm.train_dataloader())

# Make a Lightning Module

[PL transformers example](https://pytorch-lightning.readthedocs.io/en/latest/notebooks/lightning_examples/text-transformers.html#Transformer-LightningModule)

[HF evaluation metrics](https://huggingface.co/docs/evaluate/a_quick_tour)

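It's usually good to use `.detach().cpu().numpy()` to get a NumPy array (NumPy conversion only works for tensors on the CPU), or `.item()` / `.tolist()` to get a Python float / list on the CPU. Avoid hard-coded `.cuda()` or `.to(device)` calls inside the module. Here are some links regarding this: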
* [7. Remove any .cuda() or .to(device) Calls](https://pytorch-lightning.readthedocs.io/en/latest/starter/converting.html#remove-any-cuda-or-to-device-calls)

[BertForSequenceClassification that I use](https://huggingface.co/docs/transformers/v4.26.0/en/model_doc/bert#transformers.BertForSequenceClassification)

[2nd example of a training loop](https://huggingface.co/course/chapter3/4?fw=pt#the-training-loop)

[PL automatic logging docs](https://pytorch-lightning.readthedocs.io/en/latest/extensions/logging.html#automatic-logging)

In [None]:
# This model only logs classification metrics on the validation epoch
class pl_BERT(pl.LightningModule):
    """LightningModule wrapping a HF sequence-classification model.

    Training logs ``train_loss``. Validation logs ``val_loss`` and, at the end
    of every validation epoch, the metrics in ``_METRIC_NAMES`` prefixed with
    ``val_``.

    Args:
        model_name_or_path: Checkpoint passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        num_labels: Number of output classes of the classification head.
        learning_rate: Learning rate given to AdamW.
        warmup_steps: Number of warmup steps of the linear schedule.
        weight_decay: Weight decay for every parameter except biases and
            LayerNorm weights.
        batch_size: Stored in ``hparams``; it shows an error if the batch size
            doesn't match with the datamodule batch_size.
        **kwargs: Accepted and recorded by ``save_hyperparameters``; not used
            otherwise in this class.
    """

    # Metrics bundled with `evaluate.combine` for the validation loop.
    _METRIC_NAMES = ("accuracy", "f1", "precision", "recall")

    def __init__(
        self,
        model_name_or_path: str,
        num_labels: int,
        learning_rate: float = 2e-5,
        # adam_epsilon: float = 1e-8,
        warmup_steps: int = 0,
        weight_decay: float = 0.0,
        batch_size: int = 8,
        # train_batch_size: int = 32,
        # eval_batch_size: int = 32,
        **kwargs,
    ):
        super().__init__()
        self.save_hyperparameters()
        # Alternatively one can make and pass in a config object
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name_or_path,
            num_labels=self.hparams.num_labels,
        )
        self.clf_metrics = self._new_metrics()

    def _new_metrics(self):
        """Return a fresh combined evaluator for the validation metrics."""
        return evaluate.combine(list(self._METRIC_NAMES))

    def forward(self, **inputs):
        """Forward the keyword inputs to the wrapped HF model."""
        return self.model(**inputs)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch, then return it."""
        loss = self(**batch).loss
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def on_validation_epoch_start(self):
        """Instantiate a new metric accumulator for every validation run."""
        self.clf_metrics = self._new_metrics()

    def validation_step(self, batch, batch_idx):
        """Log the validation loss and feed predictions to the evaluator."""
        outputs = self(**batch)
        # The evaluator is fed plain Python lists.
        preds = torch.argmax(outputs.logits, dim=-1).tolist()
        # Key access works for a BatchEncoding as well as for a plain dict.
        refs = batch["labels"].tolist()
        self.clf_metrics.add_batch(references=refs, predictions=preds)
        # log the results - the default only logs val epoch loss
        self.log("val_loss", outputs.loss, prog_bar=True)

    def on_validation_epoch_end(self):
        """Compute the accumulated metrics and log them as ``val_<name>``."""
        split_metrics = {
            f"val_{name}": value for name, value in self.clf_metrics.compute().items()
        }
        self.log_dict(split_metrics, prog_bar=True, logger=True)

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay)"""
        # I got this from HF / Lightning. We don't want the weight decay to
        # change the bias or the layer normalization weights.
        no_decay = ["bias", "LayerNorm.weight"]
        decayed, undecayed = [], []
        for name, param in self.named_parameters():
            exempt = any(marker in name for marker in no_decay)
            (undecayed if exempt else decayed).append(param)

        optimizer_grouped_parameters = [
            {"params": decayed, "weight_decay": self.hparams.weight_decay},
            {"params": undecayed, "weight_decay": 0.0},
        ]
        # PyTorch AdamW (imported from torch.optim)
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate)  # , eps=self.hparams.adam_epsilon)

        # get the learning rate scheduler - this is from HF
        # maybe I can change this to be a torch.optim.lr_scheduler.LinearLR
        # in case I get a warning about this scheduler being deprecated
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.trainer.estimated_stepping_batches,
        )
        # The scheduler is stepped after every optimizer step, not every epoch.
        lr_scheduler_config = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        # I choose to return a dictionary - there are many other options
        return {
            "optimizer": optimizer,
            "lr_scheduler": lr_scheduler_config,
        }

## Instantiate a PL module

TODO (later): add benchmark timing, saving / loading of checkpoints, and TensorBoard logging.

In [None]:
# Show the hyperparameters that the datamodule stored via save_hyperparameters().
print(dm.hparams)

"batch_size":       40
"data_dir":         /content/sample_ds_dict
"max_seq_length":   512
"model_checkpoint": bert-base-cased
"num_labels":       2


In [None]:
# Build the LightningModule from the datamodule's hyperparameters so both sides
# agree on the checkpoint (e.g. "bert-base-cased"), label count and batch size.
# The default max position length is 512 for BERT.
my_lightning_model = pl_BERT(
    model_name_or_path=dm.hparams.model_checkpoint,
    num_labels=dm.hparams.num_labels,
    batch_size=dm.hparams.batch_size,
)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

# Make Callbacks and Trainer

Docs & tutorials for model checkpointing:
* https://pytorch-lightning.readthedocs.io/en/stable/common/checkpointing_basic.html#contents-of-a-checkpoint
* https://pytorch-lightning.readthedocs.io/en/stable/common/checkpointing_intermediate.html#save-checkpoints-manually
* https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.ModelCheckpoint.html#pytorch_lightning.callbacks.ModelCheckpoint

This is a Stack Overflow question about logging:
https://stackoverflow.com/questions/73357742/tensorboard-is-not-creating-any-files

## set up WandB

* https://pytorch-lightning.readthedocs.io/en/stable/visualize/logging_intermediate.html#weights-and-biases
* https://pytorch-lightning.readthedocs.io/en/stable/extensions/generated/pytorch_lightning.loggers.WandbLogger.html#pytorch_lightning.loggers.WandbLogger

* [Stack Overflow: How to graph two metrics on one chart in wandb](https://stackoverflow.com/questions/71432453/getting-aligned-val-loss-and-train-loss-plots-for-each-epoch-using-wandb-rather)

Next time I'd like to log a few examples as I go too because it would be valuable to also see what the training data looks like.

* [PL docs on logging](https://pytorch-lightning.readthedocs.io/en/stable/extensions/logging.html)

In [None]:
# Authenticate with Weights & Biases (prompts for an API key when needed).
wandb.login()

# Logger handed to the Trainer that is built further down. No Trainer is
# created here: a throwaway one would only be replaced by that later Trainer.
# log_model="all" logs the checkpoints written during training to W&B.
wandb_logger = WandbLogger(project="BERT", log_model="all")

# log gradients and model topology
wandb_logger.watch(my_lightning_model)

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 

··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
Hint: Upgrade with `pip install --upgrade wandb`.
  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
  rank_zero_warn(
[34m[1mwandb[0m: Currently logged in as: [33mandygma567[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


In [None]:
# Timer callback: this will stop the training after 4 hours.
timer = Timer(duration={"hours": 4})

# Checkpoint callback that monitors "val_loss" and uses a custom filename
# template for the files it writes.
# The default is to save every epoch
ckpt_callback = ModelCheckpoint(
    filename="sample-ds-{epoch:02d}-{val_loss:.2f}",
    monitor="val_loss",
)

In [None]:
# Build the Lightning Trainer used for the actual training run.
trainer = Trainer(
    fast_dev_run=False,
    accelerator="auto", # automatically detects which devices are available, the number of devices will be inferred
    callbacks=[
        timer,
        ckpt_callback,
        ],
    auto_scale_batch_size=True, # enables the batch size finder, which only runs when trainer.tune() is called
    check_val_every_n_epoch=1, # default is 1
    log_every_n_steps=2, # default is 50
    logger=wandb_logger, # the wandb logger
    precision=16, # for mixed precision, default is 32
    max_epochs=3, # default is 1000
    )

INFO:pytorch_lightning.utilities.rank_zero:Using 16bit None Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


# Use the trainer to find the batch_size

In [None]:
# Tune the trainer / batch_size
# trainer.tune(my_lightning_model, datamodule=dm)

In [None]:
# looks like the best batch size is 40 and I should hard code that into my data module
# my_lightning_model.hparams

# Use the trainer to fit the model

In [None]:
# Train and validate the model on the datamodule. The tokenization progress
# bars in the output suggest that Lightning runs dm.prepare_data() again here.
trainer.fit(my_lightning_model, datamodule=dm)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20 [00:00<?, ? examples/s]

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 108 M 
--------------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
216.624   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


In [None]:
# Sanity-check the Timer callback by printing how long the training stage took.
train_seconds = timer.time_elapsed("train")
print(timedelta(seconds=train_seconds))

0:01:09.509452


In [None]:
# Mark the W&B run as finished; the run summary is printed below.
wandb.finish()

VBox(children=(Label(value='3719.366 MB of 3719.366 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.…

0,1
epoch,▁▁▅▅██
train_loss,█▂▁
trainer/global_step,▁▁▅▅██
val_accuracy,▁██
val_f1,▁██
val_loss,█▃▁
val_precision,▁██
val_recall,▁▁▁

0,1
epoch,2.0
train_loss,0.65622
trainer/global_step,5.0
val_accuracy,0.45
val_f1,0.59259
val_loss,0.68972
val_precision,0.42105
val_recall,1.0
