##### Prerequisites

In [None]:
%%capture

!pip install --upgrade jupyter
!pip install --upgrade ipywidgets
!jupyter nbextension enable --py widgetsnbextension

In [None]:
%%capture

!pip install transformers==4.18.0
!pip install datasets==2.9.0
!pip install pandas==1.4.1
!pip install numpy==1.22.2
!pip install torch==1.8.1

#### Imports 

In [None]:
from transformers import TrainingArguments
from transformers import GPT2LMHeadModel
from transformers import GPT2Tokenizer
from transformers import Trainer
import transformers 
import numpy as np
import datasets
import logging 
import torch

##### Setup logging

In [None]:
# Route DEBUG-and-above records of the 'sagemaker' logger to a stream handler.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same logger object on every call, so
# re-running this cell would otherwise stack one more handler each time and
# print every message multiple times.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies 

In [None]:
# Log the installed version of each core dependency, one line per package.
for package_name, package in (('transformers', transformers),
                              ('datasets', datasets),
                              ('torch', torch),
                              ('numpy', np)):
    logger.info(f'[Using {package_name} version: {package.__version__}]')

##### Setup essentials 

In [None]:
# Run-level settings. The names suggest training length, batch size and
# checkpoint cadence/retention -- NOTE(review): confirm where each is consumed.
TRAIN_EPOCHS = 10
BATCH_SIZE = 8
SAVE_STEPS = 10_000
SAVE_TOTAL_LIMIT = 2

#### Load tokenized dataset 

In [None]:
# Read the dataset back from the directory written by the tokenize step,
# then display it as the cell output.
tokenized_dir = '.././01-tokenize/data/tokenized'
reloaded_dataset = datasets.load_from_disk(tokenized_dir)
reloaded_dataset

In [None]:
# Display the 'train' split of the reloaded dataset.
reloaded_dataset['train']

In [None]:
# Number of training examples. Taking len() of the split itself avoids
# materialising the whole 'input_ids' column just to count its rows.
len(reloaded_dataset['train'])

#### Re-load custom tokenizer 

In [None]:
# Keyword overrides passed when loading the tokenizer from the local vocab.
tokenizer_options = {
    'bos_token': '<|startoftext|>',
    'eos_token': '<|endoftext|>',
    'pad_token': '<|pad|>',
    # NOTE(review): return_tensors is normally a call-time argument of the
    # tokenizer; confirm it has any effect when given at load time.
    'return_tensors': 'pt',
}
tokenizer = GPT2Tokenizer.from_pretrained('.././01-tokenize/vocab', **tokenizer_options)

# Pad on the left and cap sequences at 512 tokens.
tokenizer.padding_side = 'left'
tokenizer.model_max_length = 512

logger.info(f'Tokenizer: {tokenizer}')

#### Re-load custom model from the Hugging Face Hub

In [None]:
# Use the GPU when one is present; fall back to the CPU instead of crashing
# (an unconditional .cuda() raises on a machine without CUDA).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GPT2LMHeadModel.from_pretrained('arun-shankar/GPT-2-covid-news-articles').to(device)
# Resize the embedding matrix to the size of the tokenizer loaded above; the
# resulting embedding module is displayed as the cell output.
model.resize_token_embeddings(len(tokenizer))

In [None]:
def custom_data_collator(batch):
    """Stack per-example features into batched tensors.

    Args:
        batch: non-empty sequence of examples, each a mapping with
            'input_ids', 'attention_mask' and 'labels' entries whose shapes
            match across the batch. Entries may be tensors or anything
            ``torch.as_tensor`` accepts (e.g. lists, NumPy arrays).

    Returns:
        dict with keys 'input_ids', 'attention_mask' and 'labels'; each value
        is a tensor whose first dimension is ``len(batch)``.

    Raises:
        KeyError: if an example lacks one of the three keys.
        RuntimeError: if ``batch`` is empty or the entries differ in shape.
    """
    # batch size for data collation = per_device_train_batch_size * number of GPUs
    # torch.as_tensor leaves tensors untouched and converts list/array entries,
    # so the collator works whether or not the dataset yields torch tensors.
    return {
        key: torch.stack([torch.as_tensor(example[key]) for example in batch])
        for key in ('input_ids', 'attention_mask', 'labels')
    }

In [None]:
# Training configuration handed to the Trainer.
training_args = TrainingArguments(
    output_dir='./model/custom-finetuned',
    # NOTE(review): TRAIN_EPOCHS (10) and BATCH_SIZE (8) are declared in the
    # configuration cell but differ from the explicit values used here; the
    # explicit values are kept -- confirm which pair is intended.
    num_train_epochs=3,
    optim='adamw_torch',
    # Checkpoint every SAVE_STEPS optimisation steps and keep at most
    # SAVE_TOTAL_LIMIT checkpoints in output_dir, as declared in the
    # configuration cell.
    save_strategy='steps',
    save_steps=SAVE_STEPS,
    save_total_limit=SAVE_TOTAL_LIMIT,
    evaluation_strategy='epoch',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir='logs',
)


In [None]:
# Keep a handle on the Trainer so it can be reused after training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=reloaded_dataset['train'],
    eval_dataset=reloaded_dataset['validation'],
    data_collator=custom_data_collator,
)
train_output = trainer.train()

# Persist the final fine-tuned weights to training_args.output_dir; step-based
# checkpoints alone do not guarantee that the last training step is saved.
trainer.save_model()

# Show the training summary as the cell output.
train_output