# Pretrain BigBird on GENCODE

This notebook uses the `transformers` library to pretrain the BigBird architecture on GENCODE using MLM loss. Adapted from Hugging Face script https://github.com/huggingface/transformers/blob/master/examples/legacy/run_language_modeling.py.

In [1]:
from tokenizers import Tokenizer
from tokenizers.trainers import WordLevelTrainer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace

from transformers import PreTrainedTokenizerFast
from transformers import AutoConfig
from transformers import BigBirdForMaskedLM
from transformers import DataCollatorForLanguageModeling
from transformers import LineByLineTextDataset
from transformers import Trainer, TrainingArguments
from transformers import pipeline

import torch
import math

In [2]:
!nvidia-smi

Fri Nov  5 18:09:55 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01    Driver Version: 440.33.01    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           On   | 00000001:00:00.0 Off |                    0 |
| N/A   40C    P8    29W / 149W |      0MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [3]:
torch.cuda.is_available()

True

## Train the tokenizer

In [4]:
paths = ["./data/6mers_train_tiny.txt", "./data/6mers_test_tiny.txt"]

# Initialize a tokenizer
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Customize training
trainer = WordLevelTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.train(files=paths, trainer=trainer)

# Convert to PretrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    mask_token="[MASK]",
)

In [5]:
len(tokenizer.get_vocab())

1524

## Train language model from scratch

In [6]:
model_checkpoint = "google/bigbird-roberta-base"
config = AutoConfig.from_pretrained(model_checkpoint)
model = BigBirdForMaskedLM(config=config)

In [7]:
model.num_parameters()

128111286

In [8]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [9]:
train_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=paths[0],
    block_size=128,
)

eval_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=paths[1],
    block_size=128,
)



In [10]:
output_model_dir = "./output_model"
output_tokenizer_dir = "./output_tokenizer"

training_args = TrainingArguments(
    output_dir=output_model_dir,
    overwrite_output_dir=True,
    num_train_epochs=50,
    per_gpu_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [11]:
%%time
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
***** Running training *****
  Num examples = 4
  Num Epochs = 50
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 50
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Attention type 'block_sparse' is not possible if sequence_length: 128 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size

Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




Attempted to log scalar metric train_runtime:
17.1217
Attempted to log scalar metric train_samples_per_second:
11.681
Attempted to log scalar metric train_steps_per_second:
2.92
Attempted to log scalar metric total_flos:
13254002380800.0
Attempted to log scalar metric train_loss:
9.804313354492187
Attempted to log scalar metric epoch:
50.0
CPU times: user 15.5 s, sys: 1.58 s, total: 17.1 s
Wall time: 17.1 s


TrainOutput(global_step=50, training_loss=9.804313354492187, metrics={'train_runtime': 17.1217, 'train_samples_per_second': 11.681, 'train_steps_per_second': 2.92, 'total_flos': 13254002380800.0, 'train_loss': 9.804313354492187, 'epoch': 50.0})

In [12]:
trainer.save_model(output_model_dir)
tokenizer.save_pretrained(output_tokenizer_dir)

Saving model checkpoint to ./output_model
Configuration saved in ./output_model/config.json
Model weights saved in ./output_model/pytorch_model.bin
tokenizer config file saved in ./output_tokenizer/tokenizer_config.json
Special tokens file saved in ./output_tokenizer/special_tokens_map.json


('./output_tokenizer/tokenizer_config.json',
 './output_tokenizer/special_tokens_map.json',
 './output_tokenizer/tokenizer.json')

# Validate training

In [13]:
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 4
  Batch size = 8


Attempted to log scalar metric eval_loss:
10.796552658081055
Attempted to log scalar metric eval_runtime:
0.0958
Attempted to log scalar metric eval_samples_per_second:
41.763
Attempted to log scalar metric eval_steps_per_second:
10.441
Attempted to log scalar metric epoch:
50.0
Perplexity: 48852.10


In [14]:
fill_mask = pipeline(
    "fill-mask",
    model=output_model_dir,
    tokenizer=output_tokenizer_dir,
)

loading configuration file ./output_model/config.json
Model config BigBirdConfig {
  "architectures": [
    "BigBirdForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "attention_type": "block_sparse",
  "block_size": 64,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 4096,
  "model_type": "big_bird",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_random_blocks": 3,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "rescale_embeddings": false,
  "sep_token_id": 66,
  "torch_dtype": "float32",
  "transformers_version": "4.9.0",
  "type_vocab_size": 2,
  "use_bias": true,
  "use_cache": true,
  "vocab_size": 50358
}

loading configuration file ./output_model/config.json
Model config BigBirdConfig {
  "architectures": [
    "BigBird

In [15]:
fill_mask("TTAGTT TAGTTT [MASK]")

Attention type 'block_sparse' is not possible if sequence_length: 3 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3.Changing attention type to 'original_full'...


[{'sequence': 'TTAGTT TAGTTT TGGAAG',
  'score': 0.0002077565441140905,
  'token': 13,
  'token_str': 'TGGAAG'},
 {'sequence': 'TTAGTT TAGTTT AGGAAC',
  'score': 0.0001975743507500738,
  'token': 1111,
  'token_str': 'AGGAAC'},
 {'sequence': 'TTAGTT TAGTTT GTGGCC',
  'score': 0.00018981598259415478,
  'token': 796,
  'token_str': 'GTGGCC'},
 {'sequence': 'TTAGTT TAGTTT TCTTCC',
  'score': 0.00018416137027088553,
  'token': 172,
  'token_str': 'TCTTCC'},
 {'sequence': 'TTAGTT TAGTTT TAAGAG',
  'score': 0.00018064815958496183,
  'token': 625,
  'token_str': 'TAAGAG'}]