# Pretrain a GPT2 model using Huggingface `Trainer`

# 1. Setup Environment

## 1.1 Install Libraries

In [None]:
! pip install --upgrade transformers datasets tensorboard > install.log

## 1.2 Load Dependencies

In [11]:
import getpass
import os

import datasets
import huggingface_hub
from transformers import AutoTokenizer, GPT2Config, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

## 1.3 Connect to Online Services

Connect this notebook to the online services you want to use, such as [Hugging Face](https://hf.co) or [Weights & Biases](https://wandb.ai). Only the Hugging Face login is set up below.

In [9]:
# Huggingface
# Read the access token from the `HF_TOKEN` environment variable so that it is
# never saved inside the notebook; fall back to a hidden prompt when it is unset.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token: ")

huggingface_hub.login(token=HF_TOKEN)

## 1.4 Cache Directory

Directory used to cache downloaded models and tokenizers (the dataset is streamed, so it is not stored here).

In [None]:
# Absolute path of the `hf_cache` folder inside the current working directory.
CACHE = os.path.abspath("hf_cache")

# 2. Tokenizer & Model

## 2.1 Get the GPT2 Tokenizer

Training a tokenizer is outside the scope of this notebook, so we will use the pretrained GPT-2 tokenizer (a `PreTrainedTokenizer`) provided by [OpenAI-Community](https://huggingface.co/openai-community).

In [None]:
# Hub repository id of the tokenizer to load.
HF_TOKENIZER_ID = "openai-community/gpt2"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(HF_TOKENIZER_ID, cache_dir=CACHE)

# GPT-2 ships without a padding token, but batching sequences of different
# lengths requires one. Reuse the end-of-text token rather than adding a new
# token, so the vocabulary size stays unchanged.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

## 2.2 Instantiate Model

Here you can either __train a GPT2 model from scratch__ or continue training an __existing GPT2 model__ on your own data. This notebook covers both cases, but when you start the training process you need to choose one of the two models.

### 2.2.1 Scratch Model

In [1]:
def scratch_model(vocab_size: int, context_length: int, embedding_dim=768, encoder_layers=12, attention_heads=12):
    """
    Build an untrained `GPT2LMHeadModel` from a freshly created `GPT2Config`.

    Arguments:
    ----------
    - `vocab_size`: Number of distinct tokens the model can represent (`vocab_size`).
    - `context_length`: Maximum length of the input sequences (`n_positions`).
    - `embedding_dim`: Dimensionality of the embeddings (`n_embd`).
    - `encoder_layers`: Number of stacked transformer layers (`n_layer`).
    - `attention_heads`: Number of parallel attention heads in each layer (`n_head`).

    Returns:
    --------
    The newly instantiated model.
    """
    # map the notebook-friendly argument names onto the GPT2Config field names
    architecture = {
        "vocab_size": vocab_size,
        "n_positions": context_length,
        "n_embd": embedding_dim,
        "n_layer": encoder_layers,
        "n_head": attention_heads,
    }

    # build the model directly from the configuration (no weights are downloaded)
    return GPT2LMHeadModel(GPT2Config(**architecture))

### 2.2.2 Pretrained Model

In [None]:
def pretrained_model(hf_model_id="openai-community/gpt2"):
    """
    Load a pretrained `GPT2LMHeadModel` from Huggingface.

    Arguments:
    ----------
    - `hf_model_id`: repository id of the pretrained model to load.

    The notebook-level `CACHE` directory and `HF_TOKEN` are forwarded to
    `from_pretrained`.
    """
    download_options = {"cache_dir": CACHE, "token": HF_TOKEN}
    return GPT2LMHeadModel.from_pretrained(hf_model_id, **download_options)

# 3. Dataset

## 3.1 Download Dataset

In [None]:
DATASET_ID = "HuggingFaceFW/fineweb"  # repository id of the dataset to load
SUBSET = "default"  # dataset configuration passed to `load_dataset`
TRAINING_SPLIT = "train"  # split that is streamed for training
TEXT_KEY = "text"  # column holding the raw text that gets tokenized

In [None]:
# Stream the split instead of downloading it, so examples are fetched lazily.
train_dataset = datasets.load_dataset(
    path=DATASET_ID,
    name=SUBSET,
    split=TRAINING_SPLIT,
    streaming=True,
)

## 3.2 Tokenize the Dataset

In [None]:
def tokenize(example, _tokenizer=tokenizer, max_length=None):
    """
    Tokenizes mini batches sampled from the dataset.

    Arguments:
    ----------
    - `example`: batch from the dataset; the texts are read from `example[TEXT_KEY]`.
    - `_tokenizer`: tokenizer used to encode the texts.
    - `max_length`: maximum number of tokens kept per text. `None` lets the
      tokenizer fall back to its own maximum input length.

    Returns:
    --------
    The tokenizer output for the batch (`input_ids`, `attention_mask`).
    """
    # No padding and no tensor conversion here: the texts have different
    # lengths, so padding is left to the data collator, which pads each
    # training batch only as much as needed. Truncation keeps every sequence
    # within the length the model can handle.
    tokenized_inputs = _tokenizer(
        example[TEXT_KEY],
        truncation=True,
        max_length=max_length,
    )
    return tokenized_inputs

In [None]:
# Tokenize batch-wise and keep only the columns the model consumes.
train_dataset = (
    train_dataset
    .map(tokenize, batched=True)
    .select_columns(["input_ids", "attention_mask"])
)

# 4. Setup `Trainer`

## 4.1 Hyperparameters

In [6]:
# Simple
TRAINED_MODEL_NAME = "gpt2-fineweb"  # output directory of the Trainer and TensorBoard log directory
BATCH_SIZE = 2  # batch size used for both training and evaluation
LEARNING_RATE = 1e-5
MAX_TRAIN_STEPS = 10000  # passed to the Trainer as `max_steps`
EVALUATION_STEPS = MAX_TRAIN_STEPS // 10  # evaluate every 1000 steps
SAVE_STEPS = EVALUATION_STEPS // 4  # save a checkpoint every 250 steps
SEED = 854

# Advanced (Optional)
GRADIENT_ACCUMULATION_STEPS = 1
WEIGHT_DECAY = 0
LEARNING_RATE_SCHEDULER = "constant"
WARMUP_RATIO = 0.01

## 4.2 Training Arguments

In [10]:
args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir=TRAINED_MODEL_NAME,
    overwrite_output_dir=True,
    # logging / evaluation / checkpoint cadence
    logging_strategy="steps",
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=EVALUATION_STEPS,
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    # the streamed dataset has no known length, so training is bounded by steps
    max_steps=MAX_TRAIN_STEPS,
    # Hub upload
    push_to_hub=True,
    hub_token=HF_TOKEN,
    # batch sizes (`per_gpu_eval_batch_size` is deprecated in favour of the
    # `per_device_*` argument)
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    # optimisation
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    lr_scheduler_type=LEARNING_RATE_SCHEDULER,
    # only has an effect with schedulers that support warmup
    warmup_ratio=WARMUP_RATIO,
    seed=SEED,
    report_to=["tensorboard"],
)

## 4.3 Trainer

In [None]:
# Number of streamed examples held out for evaluation.
EVAL_EXAMPLES = 1_000

# get the model
model = scratch_model(vocab_size=tokenizer.vocab_size, context_length=1024)

# data collator (mlm=False -> causal language modeling)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# The training arguments request periodic evaluation, which requires an
# evaluation dataset. Only one split is loaded, so hold out its first
# EVAL_EXAMPLES examples for evaluation and train on the rest.
eval_split = train_dataset.take(EVAL_EXAMPLES)
train_split = train_dataset.skip(EVAL_EXAMPLES)

trainer = Trainer(
    model=model,
    # `processing_class` replaces the deprecated `tokenizer` argument
    processing_class=tokenizer,
    data_collator=data_collator,
    args=args,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

## 4.4 Tensorboard

In [None]:
# Reload the TensorBoard notebook extension, then open the dashboard on the
# training output directory so the logged metrics can be followed from here.
%reload_ext tensorboard
%tensorboard --logdir {TRAINED_MODEL_NAME}

# 5. Start Training

In [None]:
# Keep the training result under a real name: assigning to `_` would shadow
# IPython's last-output variable and make the result hard to inspect later.
train_result = trainer.train()

In [None]:
# Push the trained model to the Hub, tagged with the framework and the model class name.
hub_tags = ["pytorch", type(trainer.model).__name__]
trainer.push_to_hub(token=HF_TOKEN, tags=hub_tags)