In [20]:
%%capture
!pip install transformers datasets accelerate

# Pretraining Language Models

## Code Dataset

The `huggingface-course/codeparrot-ds-train` dataset is a subset of Python files from GitHub repositories that was filtered for data science scripts by looking for imports of `pandas`, `scikit-learn`, etc. We'll just take a subset of 50k samples from it.

In [3]:
from datasets import load_dataset, DatasetDict

# Training split: only the first 50,000 files are requested via the slice syntax.
ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train[:50000]")
# Validation split: loaded in full from the companion `-valid` dataset.
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")

# Bundle both splits so that later steps can process them together.
raw_datasets = DatasetDict(
    {
        "train": ds_train,
        "valid": ds_valid,
    }
)

# Display the splits with their columns and row counts.
raw_datasets



DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 50000
    })
    valid: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 3322
    })
})

## Tokenizer

### Tokenizer fertility

One important reason to pretrain a language model is to have a specialized tokenizer. E.g. if one took a tokenizer trained on natural text and then applied it to protein sequences, the tokenizer would not be very well adapted.

Although there are no "unknown" tokens in most subword tokenizers, words that the tokenizer is not adapted to are instead split into many small subwords. Let's have a look at this on the code dataset.

The `gpt2` tokenizer has been mostly trained on natural text from the internet, whereas the `codeparrot/codeparrot` tokenizer was specifically trained on Python code.

In [4]:
from transformers import AutoTokenizer

# Baseline: the stock `gpt2` tokenizer.
gpt_tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Tokenizer shipped with the `codeparrot/codeparrot` checkpoint; it is also the
# one used for preprocessing and for sizing the model vocabulary further down.
codeparrot_tokenizer = AutoTokenizer.from_pretrained("codeparrot/codeparrot")

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

The `token_to_char_ratio` function counts the total number of tokens and the total number of characters in a list of texts and returns the tokens-per-character ratio (lower means a more efficient tokenizer).

In [7]:
def token_to_char_ratio(texts, tokenizer):
    """Return the number of tokens produced per character of input text.

    Args:
        texts: List of strings to tokenize.
        tokenizer: Callable that takes the list of texts and returns an object
            whose `input_ids` attribute holds one token sequence per text.

    Returns:
        The total token count divided by the total character count.
    """
    encoded = tokenizer(texts)

    n_chars = 0
    for text in texts:
        n_chars += len(text)

    n_tokens = 0
    for ids in encoded.input_ids:
        n_tokens += len(ids)

    return n_tokens / n_chars


In [8]:
# Tokens per character with the GPT-2 tokenizer on the first 1,000 training files.
token_to_char_ratio(raw_datasets["train"][:1000]["content"], gpt_tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (4367 > 1024). Running this sequence through the model will result in indexing errors


0.46598530692150386

In [10]:
# Same measurement on the same 1,000 files, now with the CodeParrot tokenizer.
token_to_char_ratio(raw_datasets["train"][:1000]["content"], codeparrot_tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (2605 > 1024). Running this sequence through the model will result in indexing errors


0.27712269623287494

We can see that the dedicated tokenizer needs roughly 40% fewer tokens per character than the GPT2 tokenizer (0.28 vs. 0.47), i.e. it is about 1.7 times as efficient. This effectively means that your context window grows by the same factor for free, without changing the number of input tokens.

### Train a tokenizer

If you want to train a tokenizer on your own dataset you can use the following code:

In [None]:
from itertools import islice

from tqdm.notebook import tqdm
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode

vocab_size = 20_000
training_samples = 10_000


# Iterator for Training
def batch_iterator(batch_size=10):
    """Yield lists of file contents read from the streaming `iter_dataset`.

    At most `training_samples / batch_size` batches are produced (rounded up).
    Each batch holds `batch_size` strings, except possibly the last one if the
    stream ends early.

    Args:
        batch_size: Number of files to put into each yielded batch.

    Yields:
        A list with the `content` field of the next files in the stream.
    """
    for _ in tqdm(range(0, training_samples, batch_size)):
        # `islice` stops quietly at the end of the stream; calling `next()`
        # inside a comprehension would surface as a RuntimeError instead.
        batch = [sample["content"] for sample in islice(iter_dataset, batch_size)]
        if not batch:
            return
        yield batch


# Base tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Seed the new vocabulary with the byte-level characters from GPT-2's
# byte-to-unicode table.
base_vocab = list(bytes_to_unicode().values())

# Load dataset (streamed, so only the requested samples are read)
dataset = load_dataset("huggingface-course/codeparrot-ds-train", split="train", streaming=True)
iter_dataset = iter(dataset)

# Training (the result only lives in memory; persist it with
# `new_tokenizer.save_pretrained(...)` if it should be reused)
new_tokenizer = tokenizer.train_new_from_iterator(
    batch_iterator(), vocab_size=vocab_size, initial_alphabet=base_vocab
)

## Preprocessing

Before we can start pretraining we need to preprocess the training dataset. As with other datasets we tokenize the texts with two differences:

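- we make use of the overflowing tokens: long files are split into consecutive chunks of `context_length` tokens instead of being cut off after the first chunk (only chunks shorter than `context_length`, typically the last one of a file, are dropped)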
- there are no labels (input is also label, but shifted by one position)

To make these examples run fast even on a single, small GPU we'll use a `context_length=128`

After that we pass the datasets to standard PyTorch data loaders with `batch_size=32`.

In [12]:
# Number of tokens per training sequence; kept small so the examples run fast.
context_length = 128

In [29]:
def tokenize(element):
    """Split a batch of source files into token chunks of exactly `context_length`.

    Args:
        element: Batch from the dataset; its `content` entry is passed to
            `codeparrot_tokenizer`.

    Returns:
        A dict whose `input_ids` entry lists every chunk with exactly
        `context_length` tokens. Chunks of any other length are dropped.
    """
    encoded = codeparrot_tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    # Keep only the chunks that fill the whole context window.
    full_chunks = [
        chunk_ids
        for chunk_len, chunk_ids in zip(encoded["length"], encoded["input_ids"])
        if chunk_len == context_length
    ]
    return {"input_ids": full_chunks}


# `tokenize` returns a different number of rows than it receives, so all original
# columns must be removed for the batched map to produce a consistent table.
tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)

# Fixed seed so that a fresh "Restart & Run All" yields the same sample order.
tokenized_datasets = tokenized_datasets.shuffle(seed=42)
# Return PyTorch tensors when indexing the datasets.
tokenized_datasets.set_format("pt")

tokenized_datasets





DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 1397074
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 94778
    })
})

In [None]:
from torch.utils.data.dataloader import DataLoader

# Number of sequences per batch for both training and evaluation.
batch_size = 32

# Make the datasets return PyTorch tensors before handing them to the loaders.
tokenized_datasets.set_format("torch")
# Only the training loader reshuffles; the evaluation loader keeps dataset order.
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=batch_size, shuffle=True)
eval_dataloader = DataLoader(tokenized_datasets["valid"], batch_size=batch_size)

## Training with Accelerate

### Loading an untrained model
We can easily instantiate an untrained model by loading the config of the model architecture we want to use and then changing a few settings. Instead of calling `from_pretrained`, we pass the `config` directly to the model class.

In [31]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

# Start from the `gpt2` architecture settings and override what differs here:
# the vocabulary size and special token ids come from the CodeParrot tokenizer.
# NOTE(review): recent transformers releases appear to ignore `n_ctx` for GPT-2
# (the position table is sized by `n_positions`) - confirm whether
# `n_positions=context_length` is intended.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(codeparrot_tokenizer),
    n_ctx=context_length,
    bos_token_id=codeparrot_tokenizer.bos_token_id,
    eos_token_id=codeparrot_tokenizer.eos_token_id,
)

In [32]:
# Build the model from the config alone (no `from_pretrained`), so no pretrained
# weights are loaded.
model = GPT2LMHeadModel(config)
# Total number of elements across all parameter tensors.
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 111.0M parameters


### Optimizer and LR schedule

We have to set up the optimizer and the LR schedule, like we would for any PyTorch training script.

In [34]:
weight_decay = 0.1
lr = 5e-4


def get_grouped_params(model, no_decay=("bias", "ln_1.weight", "ln_2.weight", "ln_f.weight")):
    """Split the model parameters into a decayed and a non-decayed optimizer group.

    Args:
        model: Object exposing `named_parameters()`, which yields
            `(name, parameter)` pairs.
        no_decay: Name fragments. A parameter whose name contains any of them
            is excluded from weight decay. The default is an immutable tuple so
            it cannot be modified by accident between calls.

    Returns:
        A list with two parameter-group dicts. The first uses the module-level
        `weight_decay`; the second uses a weight decay of 0.0.
    """
    params_with_wd, params_without_wd = [], []
    for name, param in model.named_parameters():
        if any(fragment in name for fragment in no_decay):
            params_without_wd.append(param)
        else:
            params_with_wd.append(param)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

from torch.optim import AdamW

# AdamW over the two parameter groups; each group carries its own weight decay.
optimizer = AdamW(get_grouped_params(model), lr=lr)

In [38]:
from transformers import get_scheduler

num_train_epochs = 1
# Must match the value used by the training loop: the optimizer and the LR
# scheduler are stepped only once every `gradient_accumulation_steps` batches.
gradient_accumulation_steps = 8

# Number of batches processed per epoch / in total (the training loop's
# progress bar counts batches).
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Number of times `lr_scheduler.step()` is actually called. Sizing the schedule
# by batches instead would stretch it by the accumulation factor, so the
# learning rate would never finish its decay.
num_scheduler_steps = num_train_epochs * (
    num_update_steps_per_epoch // gradient_accumulation_steps
)

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=1_000,
    num_training_steps=num_scheduler_steps,
)

### Accelerator
The `Accelerator` is the class that takes care of all the distribution and synchronization. We just need to pass models, optimizers, and dataloaders through `prepare` to make sure they are properly distributed.

In [37]:
from accelerate import Accelerator

# `mixed_precision="fp16"` is the supported way to request half-precision
# training; the older `fp16=True` keyword is not accepted by current releases.
accelerator = Accelerator(mixed_precision="fp16")

# `prepare` returns the objects in the order they were passed in.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

### Evaluation and perplexity

The following helper function evaluates the model on the validation split of the dataset and returns the loss and the perplexity. Perplexity is just the exponential of the cross-entropy loss, so it is easy to compute.

In [35]:
import torch


def evaluate():
    """Compute the mean loss and the perplexity over `eval_dataloader`.

    Uses the module-level `model`, `eval_dataloader` and `accelerator`. The
    model is left in evaluation mode, so callers that continue training must
    call `model.train()` afterwards.

    Returns:
        A `(loss, perplexity)` tuple of Python floats. The loss is the mean of
        the per-batch losses, and the perplexity is its exponential (`inf` if
        the exponential overflows).
    """
    model.eval()
    losses = []
    with torch.no_grad():
        for batch in eval_dataloader:
            outputs = model(batch["input_ids"], labels=batch["input_ids"])
            # The loss is a scalar (0-dim) tensor. Give it a length-1 dimension
            # so the gathered per-batch losses can be joined with `torch.cat`,
            # which rejects 0-dim inputs.
            losses.append(accelerator.gather(outputs.loss.reshape(1)))
    loss = torch.mean(torch.cat(losses))
    # `torch.exp` saturates to `inf` rather than raising, so no exception
    # handling is needed here.
    perplexity = torch.exp(loss)
    return loss.item(), perplexity.item()

### Sync with Hugging Face Hub

We'll want to sync the model checkpoints with the Hub in case the training script crashes and we want to resume training. We'll use the `Repository` class for this.

In [39]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from within the notebook (renders a widget);
# needed before the checkpoints can be pushed.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [40]:
from huggingface_hub import Repository, get_full_repo_name

# Name of the model repository on the Hub; the full name is derived from it.
model_name = "sphere-accelerate"
repo_name = get_full_repo_name(model_name)

# Local working directory for checkpoints, set up as a clone of the Hub repo.
# NOTE(review): `Repository` is reportedly deprecated in recent huggingface_hub
# releases - confirm before upgrading.
output_dir = model_name
repo = Repository(output_dir, clone_from=repo_name)

print(repo_name)

/content/sphere-accelerate is already a clone of https://huggingface.co/lvwerra/sphere-accelerate. Make sure you pull the latest changes with `repo.git_pull()`.


lvwerra/sphere-accelerate


### Training loop

The training loop is almost pure PyTorch code with a few small exceptions:

- backward passes are handled by the `accelerator` and not the `model`
- there is also a dedicated `clip_grad_norm_` method for global gradient clipping
- since we are running in a distributed setting we have to synchronize workers sometimes. The `wait_for_everyone` method takes care of this.
- some things should only be executed on the main process (such as pushing to the Hub): the `is_main_process` attribute is only true on the main worker. Similarly, the accelerator's `save` and `print` methods only act on the main process.

In [None]:
from tqdm.notebook import tqdm

gradient_accumulation_steps = 8
eval_steps = 5_000
# Evaluate and checkpoint every `eval_steps` optimizer updates, counted in batches.
checkpoint_interval = eval_steps * gradient_accumulation_steps


def _evaluate_and_checkpoint(step):
    """Evaluate the model, save it locally and push the checkpoint to the Hub.

    Args:
        step: Batch index within the current epoch; only used in the commit
            message.
    """
    eval_loss, perplexity = evaluate()
    accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
    # `evaluate` switches the model to eval mode; switch back for training.
    model.train()

    # Make sure every worker has finished before the weights are written.
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)

    if accelerator.is_main_process:
        codeparrot_tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress step {step}", blocking=False
        )


model.train()
completed_steps = 0
step = 0

for epoch in range(num_train_epochs):
    # The progress bar covers a single epoch, so its total is the number of
    # batches per epoch rather than the number of batches over all epochs.
    for step, batch in tqdm(
        enumerate(train_dataloader, start=1), total=len(train_dataloader)
    ):
        loss = model(batch["input_ids"], labels=batch["input_ids"]).loss
        # Scale the loss so the accumulated gradient is an average over the
        # accumulated batches.
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)

        # some logging
        if step % 100 == 0:
            accelerator.print(
                {
                    "lr": optimizer.param_groups[0]["lr"],
                    "samples": step * batch_size * accelerator.state.num_processes,
                    "steps": completed_steps,
                    "loss/train": loss.item() * gradient_accumulation_steps,
                }
            )

        # run optimization
        if step % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1

        # run evaluation
        if step % checkpoint_interval == 0:
            _evaluate_and_checkpoint(step)

# Training rarely ends exactly on a checkpoint boundary. Without this final
# call, everything learned after the last periodic checkpoint would never be
# evaluated, saved or pushed.
if step % checkpoint_interval != 0:
    _evaluate_and_checkpoint(step)