##### Prerequisites

In [51]:
%%capture

!pip install transformers
!pip install datasets

#### Imports 

In [2]:
from transformers import GPT2ForSequenceClassification
from transformers import GPT2TokenizerFast
from transformers import GPT2Tokenizer
from transformers import GPT2Config
from transformers import GPT2Model
import transformers 
import datasets
import logging
import torch

In [3]:
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())

In [4]:
logger.info(f'[Using transformers version: {transformers.__version__}]')
logger.info(f'[Using datasets version: {datasets.__version__}]')
logger.info(f'[Using torch version: {torch.__version__}]')

[Using transformers version: 4.18.0]
[Using transformers version: 4.18.0]


2023-01-21 20:18:19,799 - sagemaker - INFO - [Using transformers version: 4.18.0]


[Using datasets version: 2.4.0]
[Using datasets version: 2.4.0]


2023-01-21 20:18:19,801 - sagemaker - INFO - [Using datasets version: 2.4.0]


[Using torch version: 1.8.1]
[Using torch version: 1.8.1]


2023-01-21 20:18:19,802 - sagemaker - INFO - [Using torch version: 1.8.1]


#### Explore the config 
https://huggingface.co/docs/transformers/model_doc/gpt2

In [None]:
configuration = GPT2Config()
# Initializing a model (with random weights) from the configuration
model = GPT2Model(configuration)
# Accessing the model configuration
configuration = model.config
configuration 

In [None]:
model

#### Explore the tokenizer 
Type of tokenization = byte-level byte pair encoding

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer

In [None]:
tokenizer('Happy friday')

In [None]:
tokenizer('I truly believe in eternal love!')

In [None]:
tokenizer.save_vocabulary('./data/')

#### Explore tokenizer fast

In [None]:
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
tokenizer

In [None]:
tokenizer('Happy friday')

In [None]:
tokenizer('I truly believe in eternal love!')

#### Explore the model

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')  # using the default tokenizer
model = GPT2Model.from_pretrained('gpt2')

In [None]:
inputs = tokenizer('Happy friday', return_tensors='pt')
inputs 

In [None]:
outputs = model(**inputs)
outputs

In [None]:
last_hidden_states = outputs.last_hidden_state
last_hidden_states

#### Get number of attention heads 

In [None]:
# Get the number of heads in the model
num_heads = model.config.num_attention_heads
num_heads

#### Text classification

The tokenizer from the "microsoft/DialogRPT-updown" checkpoint is fine-tuned by Microsoft on conversational data and it is trained to work with the DialogRPT-updown model which is a conversational language model.

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialogRPT-updown')
tokenizer

In [None]:
model = GPT2ForSequenceClassification.from_pretrained('microsoft/DialogRPT-updown')

In [None]:
inputs = tokenizer('Hello, my dog is cute', return_tensors='pt')
inputs

In this example, the model is evaluated with the inputs, but the gradients are not computed because we are inside the context of torch.no_grad().
This will save memory and computation time.

In [None]:
with torch.no_grad():
    logits = model(**inputs).logits
logits

In [None]:
predicted_class_id = logits.argmax().item()
predicted_class_id

In [None]:
model.config.id2label[predicted_class_id]

In [None]:
num_labels = len(model.config.id2label)
num_labels

#### Notes:

* It’s a causal (unidirectional) transformer pretrained using language modeling on a very large corpus of ~40 GB of text data.






### Code start here

In [None]:
from tokenizers import ByteLevelBPETokenizer
from transformers import GPT2TokenizerFast
from pathlib import Path
import transformers 
import tokenizers
import logging
import sys
import os


# Setup logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.getLevelName('INFO'), 
                    handlers=[logging.StreamHandler(sys.stdout)], 
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Log versions of dependencies
logger.info(f'[Using Transformers: {transformers.__version__}]')
logger.info(f'[Using Tokenizers: {tokenizers.__version__}]')

In [None]:
default_tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
default_tokenizer

In [None]:
vocab_size = default_tokenizer.vocab_size
model_max_length = default_tokenizer.model_max_length

In [None]:
LOCAL_INPUT_PATH = './corpus/' 
# LOCAL_OUTPUT_PATH is mapped to S3 output location where we want to save the custom vocabulary after training the tokenizer
LOCAL_OUTPUT_PATH = './data1/'


# Read input files from local input path 
logger.info(f'Reading input files from [{LOCAL_INPUT_PATH}/]')
paths = [str(x) for x in Path(LOCAL_INPUT_PATH).glob('*.txt')]
print(paths)

# Train custom BertWordPiece tokenizer
logger.info(f'Training BytePair custom tokenizer using files in {paths}')
tokenizer = ByteLevelBPETokenizer(lowercase=True)
print(tokenizer)

tokenizer.train(files=paths, 
                vocab_size=vocab_size, 
                min_frequency=1, 
                special_tokens=['<|endoftext|>'])
tokenizer.enable_truncation(max_length=1024)
tokenizer

In [None]:
tokenizer.save_model(LOCAL_OUTPUT_PATH)

In [None]:
# Re-create custom tokenizer using vocab from local output path
logger.info(f'Re-create GPT2Tokenizer custom tokenizer using extracted custom vocab in {LOCAL_OUTPUT_PATH}')
custom_tokenizer = GPT2TokenizerFast.from_pretrained('./data1', pad_token='<|endoftext|>')
custom_tokenizer.model_max_length = 1024
print(custom_tokenizer)


# Evaluate custom tokenizer 
logger.info('Evaluating custom tokenizer')
test_sentence = 'covid virus in usa'
logger.info(f'Test sentence: {test_sentence}')
tokens = tokenizer.encode(test_sentence).tokens
logger.info(f'Encoded sentence: {tokens}')
token_id = tokenizer.token_to_id('covid')
logger.info(f'Token ID for token (covid) = {token_id}')
vocab_size = tokenizer.get_vocab_size()
logger.info(f'Vocabulary size = {vocab_size}')

#### Tokenize 10k articles using the custom tokenizer we build now 

In [20]:
from transformers import GPT2TokenizerFast
from transformers import GPT2Config
from datasets import load_dataset
from datasets import DatasetDict
from pathlib import Path
import transformers 
import datasets
import logging
import sys
import os


# Setup logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.getLevelName('INFO'), 
                    handlers=[logging.StreamHandler(sys.stdout)], 
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Log versions of dependencies
logger.info(f'[Using Transformers: {transformers.__version__}]')
logger.info(f'[Using Datasets: {datasets.__version__}]')

# Essentials
# LOCAL_INPUT_PATH is mapped to S3 input location for covid news articles 
LOCAL_INPUT_PATH = './corpus' 
# LOCAL_OUTPUT_PATH is mapped to S3 output location where we want to save the processed input data (COVID articles)
LOCAL_OUTPUT_PATH = '/tokenized'
MAX_LENGTH = 512
CHUNK_SIZE = 128
N_GPUS = 1

2023-01-21 19:28:34,795 - __main__ - INFO - [Using Transformers: 4.18.0]
2023-01-21 19:28:34,796 - __main__ - INFO - [Using Datasets: 2.4.0]


In [21]:
configuration = GPT2Config()

In [22]:
tokenizer = GPT2TokenizerFast.from_pretrained('./data1', pad_token='<|endoftext|>')
tokenizer.model_max_length = 512

In [23]:
tokenizer

PreTrainedTokenizerFast(name_or_path='./data1', vocab_size=50257, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|endoftext|>'})

In [24]:
# Read dataset and collate to create mini batches for Causal Language Model (CLM) training
logger.info('Reading and collating input data to create mini batches for Causal Language Model (CLM) training')
dataset = load_dataset('text', data_files='./corpus/covid_articles.txt', split='train', cache_dir='/tmp/cache')
logger.info(f'Dataset: {dataset}')


2023-01-21 19:28:37,392 - __main__ - INFO - Reading and collating input data to create mini batches for Causal Language Model (CLM) training
2023-01-21 19:28:37,443 - __main__ - INFO - Dataset: Dataset({
    features: ['text'],
    num_rows: 10001
})


In [25]:
# Split dataset into train and validation splits 
logger.info('Splitting dataset into train and validation splits')
train_test_splits = dataset.train_test_split(shuffle=True, seed=123, test_size=0.1)
data_splits = DatasetDict({'train': train_test_splits['train'], 
                           'validation': train_test_splits['test']})
logger.info(f'Data splits: {data_splits}')

2023-01-21 19:28:39,239 - __main__ - INFO - Splitting dataset into train and validation splits
2023-01-21 19:28:39,244 - __main__ - INFO - Data splits: DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 9000
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 1001
    })
})


In [26]:
# Tokenize dataset
def tokenize(article):
    tokenized_article = tokenizer(article['text'])
    if tokenizer.is_fast:
        tokenized_article['word_ids'] = [tokenized_article.word_ids(i) for i in range(len(tokenized_article['input_ids']))]
    return tokenized_article


logger.info('Tokenizing dataset splits')
num_proc = int(os.cpu_count()/N_GPUS)
logger.info(f'Total number of processes = {num_proc}')
tokenized_datasets = data_splits.map(tokenize, batched=True, num_proc=num_proc, remove_columns=['text'])
logger.info(f'Tokenized datasets: {tokenized_datasets}')


# Concat and chunk dataset 
def concat_and_chunk(articles):
    # Concatenate all texts
    concatenated_examples = {key: sum(articles[key], []) for key in articles.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(articles.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length//CHUNK_SIZE) * CHUNK_SIZE
    # Split by chunks of max_len
    chunked_articles = {key: [text[i : i+CHUNK_SIZE] for i in range(0, total_length, CHUNK_SIZE)] for key, text in concatenated_examples.items()}
    # Create a new labels column
    chunked_articles['labels'] = chunked_articles['input_ids'].copy()
    return chunked_articles

2023-01-21 19:28:40,532 - __main__ - INFO - Tokenizing dataset splits
2023-01-21 19:28:40,534 - __main__ - INFO - Total number of processes = 32
                                      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

   

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

#12:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#13:   0%|          | 0/1 [00:00<?, ?ba/s]

#14:   0%|          | 0/1 [00:00<?, ?ba/s]

#15:   0%|          | 0/1 [00:00<?, ?ba/s]

#16:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#17:   0%|          | 0/1 [00:00<?, ?ba/s]

    

#18:   0%|          | 0/1 [00:00<?, ?ba/s]

       

#20:   0%|          | 0/1 [00:00<?, ?ba/s]

#19:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#24:   0%|          | 0/1 [00:00<?, ?ba/s]

#21:   0%|          | 0/1 [00:00<?, ?ba/s]

#25:   0%|          | 0/1 [00:00<?, ?ba/s]

#28:   0%|          | 0/1 [00:00<?, ?ba/s]

#22:   0%|          | 0/1 [00:00<?, ?ba/s]

#27:   0%|          | 0/1 [00:00<?, ?ba/s]

#26:   0%|          | 0/1 [00:00<?, ?ba/s]

#29:   0%|          | 0/1 [00:00<?, ?ba/s]

#23:   0%|          | 0/1 [00:00<?, ?ba/s]

#31:   0%|          | 0/1 [00:00<?, ?ba/s]

#30:   0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (676 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (833 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (3255 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1982 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1400 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for 

                                     

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

   

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

    

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#12:   0%|          | 0/1 [00:00<?, ?ba/s]

#13:   0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (885 > 512). Running this sequence through the model will result in indexing errors


#14:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#16:   0%|          | 0/1 [00:00<?, ?ba/s]

#17:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#15:   0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (558 > 512). Running this sequence through the model will result in indexing errors

 




 

Token indices sequence length is longer than the specified maximum sequence length for this model (1550 > 512). Running this sequence through the model will result in indexing errorsToken indices sequence length is longer than the specified maximum sequence length for this model (1198 > 512). Running this sequence through the model will result in indexing errors

Token indices sequence length is longer than the specified maximum sequence length for this model (1171 > 512). Running this sequence through the model will result in indexing errors

 




 

Token indices sequence length is longer than the specified maximum sequence length for this model (614 > 512). Running this sequence through the model will result in indexing errors


  

Token indices sequence length is longer than the specified maximum sequence length for this model (1226 > 512). Running this sequence through the model will result in indexing errors

 




#18:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#19:   0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (528 > 512). Running this sequence through the model will result in indexing errors


  

#20:   0%|          | 0/1 [00:00<?, ?ba/s]

#24:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#25:   0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (975 > 512). Running this sequence through the model will result in indexing errors

#26:   0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (753 > 512). Running this sequence through the model will result in indexing errors

#21:   0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (837 > 512). Running this sequence through the model will result in indexing errors

Token indices sequence length is longer than the specified maximum sequence length for this model (1499 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (619 > 512). Running this sequence through the model will result in indexing errorsToken indices sequence length is longer than the specified maximum sequence length for this model (617 > 512). Running this sequence through the model will result in indexing errors


Token indices sequence length is longer than the specified maximum sequence length for this model (1006 > 512). Running this sequence through the model will result in indexing errorsToken indices sequence length is longer than the specified maximum sequence length for 

#23:   0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (676 > 512). Running this sequence through the model will result in indexing errors


#27:   0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (744 > 512). Running this sequence through the model will result in indexing errors


#22:   0%|          | 0/1 [00:00<?, ?ba/s]

#28:   0%|          | 0/1 [00:00<?, ?ba/s]

#30:   0%|          | 0/1 [00:00<?, ?ba/s]

#31:   0%|          | 0/1 [00:00<?, ?ba/s]

#29:   0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1438 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1587 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1113 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (918 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (957 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for 

2023-01-21 19:28:45,108 - __main__ - INFO - Tokenized datasets: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 9000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 1001
    })
})


In [27]:
logger.info('Concatenating and chunking the datasets to a fixed length')
chunked_datasets = tokenized_datasets.map(concat_and_chunk, batched=True, num_proc=num_proc)
logger.info(f'Chunked datasets: {chunked_datasets}')

2023-01-21 19:33:35,430 - __main__ - INFO - Concatenating and chunking the datasets to a fixed length
                                     

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

   

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#12:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#13:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#14:   0%|          | 0/1 [00:00<?, ?ba/s]

#15:   0%|          | 0/1 [00:00<?, ?ba/s]

#16:   0%|          | 0/1 [00:00<?, ?ba/s]

#17:   0%|          | 0/1 [00:00<?, ?ba/s]

      

#19:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#18:   0%|          | 0/1 [00:00<?, ?ba/s]

    

#20:   0%|          | 0/1 [00:00<?, ?ba/s]

   

#21:   0%|          | 0/1 [00:00<?, ?ba/s]

#24:   0%|          | 0/1 [00:00<?, ?ba/s]

#26:   0%|          | 0/1 [00:00<?, ?ba/s]

#25:   0%|          | 0/1 [00:00<?, ?ba/s]

#22:   0%|          | 0/1 [00:00<?, ?ba/s]

#23:   0%|          | 0/1 [00:00<?, ?ba/s]

#28:   0%|          | 0/1 [00:00<?, ?ba/s]

#29:   0%|          | 0/1 [00:00<?, ?ba/s]

#30:   0%|          | 0/1 [00:00<?, ?ba/s]

#27:   0%|          | 0/1 [00:00<?, ?ba/s]

#31:   0%|          | 0/1 [00:00<?, ?ba/s]

                                     

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

#12:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#13:   0%|          | 0/1 [00:00<?, ?ba/s]

#14:   0%|          | 0/1 [00:00<?, ?ba/s]

#15:   0%|          | 0/1 [00:00<?, ?ba/s]

#17:   0%|          | 0/1 [00:00<?, ?ba/s]

#16:   0%|          | 0/1 [00:00<?, ?ba/s]

    

#19:   0%|          | 0/1 [00:00<?, ?ba/s]

   

#20:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#18:   0%|          | 0/1 [00:00<?, ?ba/s]

   

#24:   0%|          | 0/1 [00:00<?, ?ba/s]

#21:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#22:   0%|          | 0/1 [00:00<?, ?ba/s]

#25:   0%|          | 0/1 [00:00<?, ?ba/s]

#26:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#23:   0%|          | 0/1 [00:00<?, ?ba/s]

#27:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#28:   0%|          | 0/1 [00:00<?, ?ba/s]

#29:   0%|          | 0/1 [00:00<?, ?ba/s]

#30:   0%|          | 0/1 [00:00<?, ?ba/s]

#31:   0%|          | 0/1 [00:00<?, ?ba/s]

2023-01-21 19:33:42,067 - __main__ - INFO - Chunked datasets: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 76994
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 8127
    })
})


In [28]:
chunked_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 76994
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 8127
    })
})

In [29]:
# Save chunked datasets to local disk (EBS volume)
logger.info(f'Saving chunked datasets to local disk {LOCAL_OUTPUT_PATH}')
chunked_datasets.save_to_disk('./tokenized')

# Validate if datasets were saved correctly
logger.info('Validating if datasets were saved correctly')
reloaded_dataset = datasets.load_from_disk('./tokenized')
logger.info(f'Reloaded dataset: {reloaded_dataset}')

2023-01-21 19:43:02,373 - __main__ - INFO - Saving chunked datasets to local disk /tokenized
2023-01-21 19:43:03,848 - __main__ - INFO - Validating if datasets were saved correctly
2023-01-21 19:43:05,537 - __main__ - INFO - Reloaded dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 76994
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 8127
    })
})




#### alternate

In [34]:
context_length = 128

In [35]:
def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = data_splits.map(
    tokenize, batched=True, remove_columns=data_splits["train"].column_names
)
tokenized_datasets

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 72555
    })
    validation: Dataset({
        features: ['input_ids'],
        num_rows: 7630
    })
})

### train

In [36]:
len(tokenizer)

50257

In [37]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 0,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 128,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.18.0",
  "use_cache": true,
  "vocab_size": 50257
}

In [38]:
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 124.4M parameters


In [39]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [40]:
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])


In [43]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="./model",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=True
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using amp half precision backend


In [44]:
trainer.train()

***** Running training *****
  Num examples = 72555
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 8
  Total optimization steps = 283


[2023-01-21 19:55:38.393 pytorch-1-8-gpu-py36-ml-g5-8xlarge-59efe6ee5124ceafd0c624e661a4:1013 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2023-01-21 19:55:38.422 pytorch-1-8-gpu-py36-ml-g5-8xlarge-59efe6ee5124ceafd0c624e661a4:1013 INFO profiler_config_parser.py:102] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=283, training_loss=7.269319891508392, metrics={'train_runtime': 449.7699, 'train_samples_per_second': 161.316, 'train_steps_per_second': 0.629, 'total_flos': 4732521283584000.0, 'train_loss': 7.269319891508392, 'epoch': 1.0})

In [63]:
trainer.save_model('./model')

Saving model checkpoint to ./model
Configuration saved in ./model/config.json
Model weights saved in ./model/pytorch_model.bin
tokenizer config file saved in ./model/tokenizer_config.json
Special tokens file saved in ./model/special_tokens_map.json


#### Inference 

In [53]:
torch.cuda.is_available()

True

In [68]:
import torch
from transformers import pipeline


pipe = pipeline(
    "text-generation", model="gpt2")

loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {


In [69]:
pipe

<transformers.pipelines.text_generation.TextGenerationPipeline at 0x7fb6dbdb26a0>

In [70]:
txt = 'Tokyo is the capital of'
print(pipe(txt, num_return_sequences=1))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Tokyo is the capital of Japan, making them the second most densely populated nation in India, a top four destination for mobile phone users.\n\nAs to the problem of the phone's not working for them, according to some analysts, the problem"}]


### Creating a custom pipe 

In [71]:
pipe = pipeline(
    "text-generation", model="./model")

loading configuration file ./model/config.json
Model config GPT2Config {
  "_name_or_path": "./model",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 0,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 128,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "use_cache": true,
  "vocab_size": 50257
}

loading configuration file 

In [66]:
txt = 'Covid medicine'
print(pipe(txt, num_return_sequences=1))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[{'generated_text': 'Covid medicine, which has been more than 7.5.4% in the covid-19 cases and that the most of $ 300 million.4 million. the covid-19 has, and the federal ministry of the world to our global health'}]


##### How to fine-tune --- for downstream tasks 

#### Direct inference

In [73]:
# import
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# model name
model_name = "gpt2"

# load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name).cuda()

# create prompt
prompt = """
Below are some examples for sentiment detection of movie reviews.

Review: I am sad that the hero died.
Sentiment: Negative

Review: The ending was perfect.
Sentiment: Positive

Review: The plot was not so good!
Sentiment:"""

# generate tokens
generated = tokenizer(prompt, return_tensors="pt").input_ids.cuda()

# perform prediction 
sample_outputs = model.generate(generated, do_sample=False, top_k=50, max_length=512, top_p=0.90, 
        temperature=0, num_return_sequences=0)

# decode the predicted tokens into texts
predicted_text = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)
print(predicted_text)
"""Output --> 
Below are some examples for sentiment detection of movie reviews.

Review: I am sad that the hero died.
Sentiment: Negative

Review: The ending was perfect.
Sentiment: Positive

Review: The plot was not so good!
Sentiment: Negative
"""


loading file https://huggingface.co/gpt2/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/gpt2/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/gpt2/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb1993821


Below are some examples for sentiment detection of movie reviews.

Review: I am sad that the hero died.
Sentiment: Negative

Review: The ending was perfect.
Sentiment: Positive

Review: The plot was not so good!
Sentiment: Negative

Review: The ending was not so good!

Sentiment: Negative

Review: The ending was not so good!

Sentiment: Negative

Review: The ending was not so good!

Sentiment: Negative

Review: The ending was not so good!

Sentiment: Negative

Review: The ending was not so good!

Sentiment: Negative

Review: The ending was not so good!

Sentiment: Negative

Review: The ending was not so good!

Sentiment: Negative

Review: The ending was not so good!

Sentiment: Negative

Review: The ending was not so good!

Sentiment: Negative

Review: The ending was not so good!

Sentiment: Negative

Review: The ending was not so good!

Sentiment: Negative

Review: The ending was not so good!

Sentiment: Negative

Review: The ending was not so good!

Sentiment: Negative

Review: The 

'Output --> \nBelow are some examples for sentiment detection of movie reviews.\n\nReview: I am sad that the hero died.\nSentiment: Negative\n\nReview: The ending was perfect.\nSentiment: Positive\n\nReview: The plot was not so good!\nSentiment: Negative\n'