In [None]:
# Reference: Hugging Face Transformers v4.8.2 training tutorial
# https://huggingface.co/transformers/v4.8.2/training.html

In [1]:
from datasets import load_dataset
import numpy as np
from torchmetrics.classification import Accuracy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer

# Tokenizer loaded from the "bert-base-cased" checkpoint name; the same
# checkpoint name is used for the classification model later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook-level tokenizer.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``, and whatever it returns is handed back unchanged.

    Args:
        examples: Mapping-like object exposing a ``"text"`` key (the value is
            passed straight to the tokenizer).

    Returns:
        The tokenizer's output for ``examples["text"]``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Load the dataset registered under the name "imdb".
raw_datasets = load_dataset("imdb")
# Apply tokenize_function over the dataset; batched=True means the function
# receives batches of examples rather than one example at a time.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Found cached dataset imdb (/Users/aniket/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
100%|██████████| 3/3 [00:00<00:00, 64.20it/s]
Loading cached processed dataset at /Users/aniket/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-7ed6e90bbf90227c.arrow
Loading cached processed dataset at /Users/aniket/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-cf3c4992ffb060a8.arrow
                                                                   

In [3]:
# metric = Accuracy("binary")

# def compute_metrics(logits, labels):
#     predictions = np.argmax(logits, axis=-1)
#     return metric(predictions, labels)

In [4]:
# Drop the raw "text" column and rename "label" -> "labels", then switch the
# dataset's output format to "torch".
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")

# Take a 1000-example subset of each split after a seeded shuffle (seed=42).
small_train_dataset = (
    tokenized_datasets["train"]
    .shuffle(seed=42)
    .select(range(1000))
)
small_eval_dataset = (
    tokenized_datasets["test"]
    .shuffle(seed=42)
    .select(range(1000))
)

Loading cached shuffled indices for dataset at /Users/aniket/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-dc5fd447a4d8ba53.arrow
Loading cached shuffled indices for dataset at /Users/aniket/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-1e1cc27f0ba31ddf.arrow


In [5]:
from torch.utils.data import DataLoader

# Both loaders use batches of 8; only the training loader shuffles.
train_dataloader = DataLoader(
    dataset=small_train_dataset,
    batch_size=8,
    shuffle=True,
)
eval_dataloader = DataLoader(
    dataset=small_eval_dataset,
    batch_size=8,
)

In [6]:
from transformers import AdamW

In [7]:
from transformers import get_scheduler

# NOTE(review): the Trainer created later in this notebook passes max_epochs=1
# directly and does not read this value — confirm which epoch count is intended.
num_epochs = 3

In [8]:
import lightning.pytorch as pl
from transformers import AutoModelForSequenceClassification

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [9]:
class LitTransformer(pl.LightningModule):
    """LightningModule wrapping a "bert-base-cased" sequence-classification model.

    The wrapped Hugging Face model computes its own loss when the batch
    contains the keys it expects, so ``training_step`` simply forwards the
    batch and logs/returns ``outputs.loss``.

    Args:
        num_labels: Number of output classes passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
    """

    def __init__(self, num_labels=2) -> None:
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=num_labels)

    def forward(self, data):
        """Unpack ``data`` as keyword arguments into the wrapped model and return its output."""
        return self.model(**data)

    def training_step(self, batch, batch_idx):
        """Run one training batch, log the loss as ``train_loss`` and return it."""
        stage = "train"
        outputs = self(batch)
        loss = outputs.loss
        self.log(f"{stage}_loss", loss)
        return loss

    def configure_optimizers(self):
        """Build the AdamW optimizer and a linear-decay LR schedule.

        Previously this method returned the bare optimizer before the
        scheduler was ever constructed, leaving the schedule code unreachable.
        The schedule is now actually returned to Lightning.

        Returns:
            A dict with the optimizer and a scheduler config that is stepped
            after every optimizer step.
        """
        # NOTE(review): `transformers.AdamW` is deprecated upstream in favour of
        # `torch.optim.AdamW`; kept here to preserve the current optimizer settings.
        optimizer = AdamW(self.model.parameters(), lr=5e-5)

        # Total number of optimizer steps the Trainer expects to run; the
        # linear schedule decays over exactly this many steps.
        num_training_steps = self.trainer.estimated_stepping_batches
        lr_scheduler = get_scheduler(
            "linear",
            optimizer=optimizer,
            num_warmup_steps=0,
            num_training_steps=num_training_steps,
        )
        return {
            "optimizer": optimizer,
            # The schedule length is counted in steps, so the scheduler must be
            # advanced per step rather than Lightning's default of per epoch.
            "lr_scheduler": {"scheduler": lr_scheduler, "interval": "step"},
        }

    
# Instantiate the module with its default arguments (num_labels=2).
model = LitTransformer()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [10]:
# Trainer capped at a single epoch; every other Trainer option is left at its default.
trainer = pl.Trainer(max_epochs=1)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [11]:
# Fit on the training loader only. No validation dataloader is passed, so
# eval_dataloader is not used by this call.
trainer.fit(model, train_dataloaders=train_dataloader)


  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 108 M 
--------------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
433.247   Total estimated model params size (MB)
  rank_zero_warn(


Epoch 0:   2%|▏         | 2/125 [00:25<26:04, 12.72s/it, v_num=1]