##### Prerequisites

In [None]:
%%capture

# Pinned dependencies for this notebook. NOTE: %%capture hides pip's output, including
# failures; remove it temporarily if an import in the next cell fails.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# The `+cu113` build of torch is not published on PyPI, so the PyTorch wheel index
# has to be added for that requirement to resolve.
%pip install torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
%pip install transformers==4.21.0
%pip install datasets==2.9.0
%pip install wandb==0.13.10

#### Imports 

In [3]:
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import AutoTokenizer
from datasets import load_from_disk
from transformers import Trainer
import transformers 
import datasets 
import logging
import torch
import wandb
import os

In [4]:
# Ask PyTorch to release cached, currently unused GPU memory before the model is loaded
# (mainly useful when this notebook is re-run inside an already-used kernel).
torch.cuda.empty_cache()

##### Setup logging

In [5]:
# Notebook-wide logger: DEBUG level, messages written through a StreamHandler.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
# logging.getLogger() hands back the same logger object on every call, so re-running this
# cell used to stack one more StreamHandler each time and every message was then printed
# several times. Attach the handler only if the logger does not have one yet.
if not any(isinstance(handler, logging.StreamHandler) for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies 

In [6]:
# Record the exact library versions this run used, one log line per dependency.
for lib in (transformers, datasets, torch, wandb):
    logger.info(f'[Using {lib.__name__} version: {lib.__version__}]')

[Using transformers version: 4.21.0]
[Using datasets version: 2.9.0]
[Using torch version: 1.12.1+cu113]
[Using wandb version: 0.13.10]


##### Setup wandb logging

In [7]:
# SECURITY: a W&B API key used to be hardcoded on this line. Because it was stored in the
# notebook it must be treated as leaked -- revoke it in the W&B account settings and
# create a new one.
# wandb.login() takes the key from the WANDB_API_KEY environment variable or an existing
# netrc entry, and otherwise prompts for it, so the secret never has to live in the notebook.
wandb.login()

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [8]:
# Export this notebook's absolute path through WANDB_NOTEBOOK_NAME
# (presumably read by wandb to associate the run with the notebook -- confirm in the W&B docs).
path = os.path.abspath('01-finetune.ipynb')
os.environ.update(WANDB_NOTEBOOK_NAME=path)

#### Load dataset

In [9]:
%%time 

# Load the already-tokenized dataset from disk. The relative path points into the sibling
# '01-prepare' directory (presumably written by the preparation notebook -- confirm), so the
# notebook has to be run from its own directory.
dataset = load_from_disk('./../01-prepare/data/tokenized')
# Log the dataset summary (splits, feature names, row counts).
logger.info(dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'labels'],
        num_rows: 93171
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'labels'],
        num_rows: 10353
    })
})


CPU times: user 1.49 s, sys: 256 ms, total: 1.75 s
Wall time: 1.77 s


In [10]:
def custom_data_collator(batch):
    """Collate tokenized examples into batched ``torch.LongTensor`` objects.

    Args:
        batch: sequence of examples, each a mapping that provides 'input_ids',
            'token_type_ids' and 'labels'. Within one field every example must have
            the same length, because ``torch.stack`` only accepts equally shaped
            tensors (assumes padding/truncation happened upstream -- TODO confirm).
            Per the original note, the number of examples received here is
            per_device_train_batch_size * number of GPUs.

    Returns:
        dict with the keys 'input_ids', 'token_type_ids' and 'labels' (in that
        order); each value is the corresponding field of all examples stacked along
        a new leading batch dimension.

    Note:
        No 'attention_mask' entry is produced; the second field forwarded to the
        model is 'token_type_ids'.
    """
    def stack_field(field):
        # One LongTensor per example, stacked into a single batch tensor.
        return torch.stack([torch.LongTensor(example[field]) for example in batch])

    return {field: stack_field(field) for field in ('input_ids', 'token_type_ids', 'labels')}

#### Load GPT-Neo Tokenizer 

In [11]:
# Start from the tokenizer that ships with the 'EleutherAI/gpt-neo-125M' checkpoint;
# it is extended with extra special tokens further down.
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-125M')
# Log the tokenizer configuration as loaded, before any tokens are added.
logger.info(tokenizer)

PreTrainedTokenizerFast(name_or_path='EleutherAI/gpt-neo-125M', vocab_size=50257, model_max_len=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True)})


In [12]:
# Extra tokens registered on the tokenizer: a start-of-text marker plus speaker, pad and
# mask markers.
# NOTE(review): '<|pad|>' is registered as an additional special token, not as the
# tokenizer's pad_token. The order here presumably determines the ids the new tokens
# receive, so it has to stay in sync with whatever tokenizer produced the tokenized
# dataset -- confirm against the prepare step before reordering.
special_tokens = dict(
    bos_token='<|startoftext|>',
    additional_special_tokens=['<|speaker-1|>', '<|speaker-2|>', '<|pad|>', '<|mask|>'],
)

In [13]:
# Register the extra special tokens on the tokenizer; the call's return value is not needed,
# so it is bound to `_` (which also keeps the cell from echoing it).
_ = tokenizer.add_special_tokens(special_tokens)
# Snapshot of the tokenizer's vocabulary after the addition. Its length is what the model's
# embedding matrix is resized to when the model is loaded. (The tokenizer repr logged in
# this notebook still reports vocab_size=50257 after this call, so that field is not used.)
vocab = tokenizer.get_vocab()

In [14]:
# Log the tokenizer again to confirm the new bos token and additional special tokens are registered.
logger.info(tokenizer)

PreTrainedTokenizerFast(name_or_path='EleutherAI/gpt-neo-125M', vocab_size=50257, model_max_len=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|startoftext|>', 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'additional_special_tokens': ['<|speaker-1|>', '<|speaker-2|>', '<|pad|>', '<|mask|>']})


#### Load GPT-Neo model

In [15]:
%%time

# Target device for the model; this notebook expects a CUDA-capable machine.
device = torch.device('cuda')

# Load the pretrained GPT-Neo 125M weights, then grow the token embedding matrix so that it
# covers the special tokens added to the tokenizer (len(vocab) counts them).
model = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-125M')
model.resize_token_embeddings(len(vocab))

# Move the model onto the GPU and log where its parameters ended up as a sanity check.
model = model.to(device)
logger.info(next(iter(model.parameters())).device)

cuda:0


CPU times: user 17.7 s, sys: 5.65 s, total: 23.4 s
Wall time: 9.17 s


#### Setup training config

In [16]:
# Tunable training constants, consumed by the TrainingArguments cell below.
TRAIN_EPOCHS = 5       # number of passes over the training split
TRAIN_BATCH_SIZE = 4   # per-device training batch size
EVAL_BATCH_SIZE = 4    # per-device evaluation batch size
LOGGING_STEPS = 64     # log training metrics every this many optimization steps
SAVE_STEPS = 10240  # Reduce it to a smaller value like 512 if you want more frequent checkpoints
SAVE_TOTAL_LIMIT = 2   # maximum number of checkpoints to keep on disk

In [17]:
# Trainer configuration, grouped by concern. Values come from the constants cell above.
# NOTE(review): in the recorded run the validation loss is lowest at epoch 3 and rises
# afterwards, yet the final (epoch 5) weights are what gets saved. Consider
# load_best_model_at_end=True, which needs matching save and evaluation strategies.
training_args = TrainingArguments(
    # Output and checkpointing
    output_dir='./model',
    overwrite_output_dir=True,
    save_strategy='steps',
    save_steps=SAVE_STEPS,
    save_total_limit=SAVE_TOTAL_LIMIT,
    # Optimization
    num_train_epochs=TRAIN_EPOCHS,
    optim='adamw_torch',
    warmup_steps=10,
    weight_decay=0.1,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    # Evaluation (runs once per epoch)
    evaluation_strategy='epoch',
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    # Logging and experiment tracking
    logging_steps=LOGGING_STEPS,
    logging_dir='logs',
    report_to='wandb',
)

#### Train

In [18]:
# Wire model, configuration, data splits and the custom collator into a Trainer.
# The tokenizer is not handed to the Trainer, so the Trainer never saves it.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=custom_data_collator,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
)

In [19]:
%%time 

# Run the fine-tuning loop. This is the expensive cell: the recorded run below took about
# 2h 34min wall time for 14560 optimization steps. As the cell's last expression, the
# returned TrainOutput is displayed in the notebook.
trainer.train()

***** Running training *****
  Num examples = 93171
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 14560
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mshankar-arunp[0m. Use [1m`wandb login --relogin`[0m to force relogin


[2023-02-20 14:24:15.660 pytorch-1-12-gpu--ml-p3dn-24xlarge-307ebad80d11874f5dcc2ce687db:59073 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2023-02-20 14:24:15.785 pytorch-1-12-gpu--ml-p3dn-24xlarge-307ebad80d11874f5dcc2ce687db:59073 INFO profiler_config_parser.py:111] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.




Epoch,Training Loss,Validation Loss
1,0.1734,0.173809
2,0.1628,0.169528
3,0.1514,0.168145
4,0.1482,0.168529
5,0.145,0.169217


***** Running Evaluation *****
  Num examples = 10353
  Batch size = 32
***** Running Evaluation *****
  Num examples = 10353
  Batch size = 32
***** Running Evaluation *****
  Num examples = 10353
  Batch size = 32
Saving model checkpoint to ./model/checkpoint-10240
Configuration saved in ./model/checkpoint-10240/config.json
Model weights saved in ./model/checkpoint-10240/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10353
  Batch size = 32
***** Running Evaluation *****
  Num examples = 10353
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




CPU times: user 3h 43min 11s, sys: 1h 28min 36s, total: 5h 11min 48s
Wall time: 2h 34min 4s


TrainOutput(global_step=14560, training_loss=0.17299925432755398, metrics={'train_runtime': 9243.9629, 'train_samples_per_second': 50.396, 'train_steps_per_second': 1.575, 'total_flos': 6.702161390226432e+16, 'train_loss': 0.17299925432755398, 'epoch': 5.0})

#### Save model 

In [20]:
# Write the final model weights and config to ./model.
trainer.save_model('./model')
# The tokenizer was extended with extra special tokens and the model's embeddings were
# resized to match, but the Trainer was built without the tokenizer, so save_model() only
# writes the config and weights. Store the tokenizer next to the model so that ./model can
# be reloaded on its own without re-creating the special tokens by hand.
# Bound to `_` so the cell does not echo the returned file paths.
_ = tokenizer.save_pretrained('./model')

Saving model checkpoint to ./model
Configuration saved in ./model/config.json
Model weights saved in ./model/pytorch_model.bin
