## Finetune out-of-the-box GPT2 for Question Answering

##### Prerequisites

In [2]:
%%capture

!pip install --upgrade jupyter
!pip install --upgrade ipywidgets
!jupyter nbextension enable --py widgetsnbextension

In [3]:
%%capture

!pip install transformers==4.18.0
!pip install datasets==2.9.0
!pip install pandas==1.4.1
!pip install numpy==1.22.2
!pip install wandb==0.13.9
!pip install torch==1.8.1

#### Imports 

In [4]:
from transformers import TrainingArguments
from transformers import GPT2LMHeadModel
from transformers import GPT2Tokenizer
from transformers import Trainer
import transformers 
import numpy as np
import datasets
import logging
import torch
import wandb
import os

##### Setup logging

In [5]:
# Notebook-wide logger that writes DEBUG-and-above messages to the cell output.
# The handler is attached only when the logger has none yet: logging.getLogger returns the same
# object on every call, so re-running this cell would otherwise stack another StreamHandler
# and print each message once per re-run.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies 

In [6]:
# Record the version of each key dependency so every run documents its environment.
# Each module's __name__ supplies the label (numpy is imported as `np` but reports 'numpy').
for _lib in (transformers, datasets, torch, wandb, np):
    logger.info(f'[Using {_lib.__name__} version: {_lib.__version__}]')

[Using transformers version: 4.18.0]
[Using datasets version: 2.9.0]
[Using torch version: 1.8.1+cu102]
[Using wandb version: 0.13.9]
[Using numpy version: 1.22.2]


##### Setup essentials 

In [7]:
# Training settings; most are passed to TrainingArguments further down in the notebook.
TRAIN_EPOCHS = 4  # used as num_train_epochs
TRAIN_BATCH_SIZE = 4  # used as per_device_train_batch_size
EVAL_BATCH_SIZE = 4  # used as per_device_eval_batch_size
MAX_LEN = 512  # assigned to tokenizer.model_max_length
LOGGING_STEPS = 64  # used as logging_steps
SAVE_STEPS = 10240  # reduce it to a smaller value like 512 if you want to save checkpoints
SAVE_TOTAL_LIMIT = 2  # used as save_total_limit

# Special tokens handed to GPT2Tokenizer.from_pretrained as the bos/eos/pad tokens.
BOS_TOKEN = '<|startoftext|>'
EOS_TOKEN = '<|endoftext|>'
PAD_TOKEN = '<|pad|>'

In [8]:
# Authenticate with Weights & Biases without embedding the API key in the notebook.
# wandb.login() uses the WANDB_API_KEY environment variable or a previously stored login when
# available, and otherwise prompts for the key interactively.
# NOTE(review): an API key was previously committed in this cell; it should be treated as
# leaked and revoked in the W&B account settings.
wandb.login()

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [9]:
# Absolute path of this notebook file, resolved against the kernel's current working
# directory; the trailing expression echoes it as the cell output.
NOTEBOOK_FILENAME = '01-finetune-oob-gpt2.ipynb'
path = os.path.abspath(NOTEBOOK_FILENAME)
path

'/root/how-to-train-faq-chatbot-from-scratch/02-finetune/01-finetune-oob-gpt2.ipynb'

In [10]:
# Expose the notebook's absolute path through the WANDB_NOTEBOOK_NAME environment variable.
os.environ.update({'WANDB_NOTEBOOK_NAME': path})

#### Load tokenized dataset 

In [11]:
# Read back the tokenized dataset written to disk by the tokenization step and echo its
# structure (splits, features, row counts) as the cell output.
tokenized_dataset_dir = '.././01-tokenize/data/tokenized-oob'
reloaded_dataset = datasets.load_from_disk(tokenized_dataset_dir)
reloaded_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1944
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 217
    })
})

#### Re-load custom tokenizer 

In [12]:
# Rebuild the GPT-2 tokenizer with the custom BOS/EOS/PAD special tokens.
# The former `lower=True` and `return_tensors='pt'` keyword arguments were dropped: neither is
# among GPT2Tokenizer's documented constructor options (`return_tensors` is an argument of the
# encoding calls), so they had no effect on tokenization and only suggested lowercasing /
# tensor output that never happened.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2',
                                          bos_token=BOS_TOKEN,
                                          eos_token=EOS_TOKEN,
                                          pad_token=PAD_TOKEN)
tokenizer.padding_side = 'left'  # padding tokens go in front of the sequence
tokenizer.model_max_length = MAX_LEN
logger.info(f'Tokenizer: {tokenizer}')

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Tokenizer: PreTrainedTokenizer(name_or_path='gpt2', vocab_size=50257, model_max_len=512, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|pad|>'})


#### Re-load the out-of-the-box (OOB) GPT-2 model

In [13]:
# Load the pretrained GPT-2 weights and place them on the GPU when one is available, falling
# back to the CPU otherwise (an unconditional `.cuda()` call raises on CPU-only machines).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
# Resize the embedding matrix to the tokenizer's vocabulary size, which includes the
# special tokens added on top of the stock GPT-2 vocabulary.
model.resize_token_embeddings(len(tokenizer))

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

Embedding(50259, 768)

In [14]:
def custom_data_collator(batch):
    """Stack per-example features into batched tensors.

    Args:
        batch: non-empty sequence of examples. Each example is a mapping with
            'input_ids', 'attention_mask' and 'labels' entries; for a given key all
            examples must have the same shape. Entries may be tensors or anything
            ``torch.as_tensor`` accepts (plain lists, numpy arrays).

    Returns:
        dict with keys 'input_ids', 'attention_mask' and 'labels', each a tensor
        whose leading dimension is the number of examples in ``batch``.

    Raises:
        KeyError: if an example lacks one of the three keys.
    """
    # batch size for data collation = per_device_train_batch_size * number of GPUs
    feature_keys = ('input_ids', 'attention_mask', 'labels')
    # torch.as_tensor passes existing tensors through unchanged and converts lists/arrays,
    # so the collator works whether or not the dataset was stored in torch format.
    return {
        key: torch.stack([torch.as_tensor(example[key]) for example in batch])
        for key in feature_keys
    }

In [15]:
# Configuration for the Hugging Face Trainer, grouped by concern.
training_args = TrainingArguments(
    # output location and checkpointing
    output_dir='./model/oob-finetuned',
    overwrite_output_dir=True,
    save_strategy='steps',
    save_steps=SAVE_STEPS,
    save_total_limit=SAVE_TOTAL_LIMIT,
    # optimisation
    num_train_epochs=TRAIN_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    optim='adamw_torch',
    weight_decay=0.01,
    warmup_steps=10,
    # evaluation and logging
    evaluation_strategy='epoch',
    logging_steps=LOGGING_STEPS,
    logging_dir='logs',
    report_to='wandb',
)

In [16]:
# Wire the model, the training configuration, both dataset splits and the custom collator
# into a Trainer instance.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=custom_data_collator,
    train_dataset=reloaded_dataset['train'],
    eval_dataset=reloaded_dataset['validation'],
)

In [17]:
# Run the fine-tuning loop; the returned TrainOutput is echoed as the cell output.
train_output = trainer.train()
train_output

***** Running training *****
  Num examples = 1944
  Num Epochs = 4
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 488
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mshankar-arunp[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss
1,1.6338,0.536929
2,0.543,0.482625
3,0.4711,0.46198
4,0.4491,0.459239


***** Running Evaluation *****
  Num examples = 217
  Batch size = 4
***** Running Evaluation *****
  Num examples = 217
  Batch size = 4
***** Running Evaluation *****
  Num examples = 217
  Batch size = 4
***** Running Evaluation *****
  Num examples = 217
  Batch size = 4


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=488, training_loss=0.6551714451586614, metrics={'train_runtime': 574.9315, 'train_samples_per_second': 13.525, 'train_steps_per_second': 0.849, 'total_flos': 2031806840832000.0, 'train_loss': 0.6551714451586614, 'epoch': 4.0})

#### Save the fine-tuned model to local disk

In [18]:
# Persist the fine-tuned weights and config to the directory configured as
# TrainingArguments.output_dir, so the save location cannot drift from the training
# configuration through a second hard-coded copy of the path.
trainer.save_model(training_args.output_dir)

Saving model checkpoint to ./model/oob-finetuned
Configuration saved in ./model/oob-finetuned/config.json
Model weights saved in ./model/oob-finetuned/pytorch_model.bin
