## Data Preparation - Tokenize the FAQ dataset using the out-of-the-box (default) GPT2 tokenizer

##### Prerequisites

In [None]:
%%capture

!pip install --upgrade jupyter
!pip install --upgrade ipywidgets
!jupyter nbextension enable --py widgetsnbextension

In [None]:
%%capture

!pip install transformers==4.18.0
!pip install datasets==2.9.0
!pip install pandas==1.4.1
!pip install numpy==1.22.2
!pip install torch==1.8.1

#### Imports 

In [2]:
from transformers import GPT2Tokenizer
from transformers import set_seed
from datasets import load_dataset
from datasets import DatasetDict
import transformers
import pandas as pd
import numpy as np
import datasets 
import logging
import torch
import os

##### Set up logging

In [3]:
# Configure the 'sagemaker' logger used by every cell below.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same logger object on every call, so re-running
# this cell would otherwise stack another StreamHandler and print each message
# once per handler. Attach the console handler only if none is present yet.
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies 

In [4]:
# Record the exact library versions used for this run, in a fixed order.
for _lib_name, _lib in (('transformers', transformers),
                        ('datasets', datasets),
                        ('torch', torch),
                        ('pandas', pd),
                        ('numpy', np)):
    logger.info(f'[Using {_lib_name} version: {_lib.__version__}]')

[Using transformers version: 4.18.0]
[Using datasets version: 2.9.0]
[Using torch version: 1.8.1+cu102]
[Using pandas version: 1.4.1]
[Using numpy version: 1.22.2]


#### Set up essentials

In [5]:
# Fix the random seeds up front so repeated runs of this notebook behave the same.
np.random.seed(123)  # NumPy's global RNG
set_seed(123)  # transformers helper; NOTE(review): presumably also seeds torch/random - confirm

In [6]:
MAX_LEN = 512  # fixed sequence length used for truncation and padding
N_GPUS = 1
# Worker processes for Dataset.map: CPU cores per GPU.
# os.cpu_count() may return None when the count cannot be determined, and the
# quotient would be 0 whenever there are fewer cores than GPUs, so fall back to
# a single core and clamp the result to at least one worker.
num_proc = max(1, (os.cpu_count() or 1) // N_GPUS)
num_proc

4

In [7]:
# Special tokens that frame every prompt built in the tokenize cell below.
BOS_TOKEN = '<|startoftext|>'  # prepended to each question/answer prompt
EOS_TOKEN = '<|endoftext|>'  # appended after the answer
PAD_TOKEN = '<|pad|>'  # padding token; also placed between question and answer as a separator

#### Load FAQ dataset 

In [8]:
# Read the FAQ CSV as a single 'train' split with explicitly named columns.
# To bypass the cached copy, additionally pass download_mode='force_redownload'.
data = load_dataset(
    'csv',
    data_files='./data/faq_train.csv',
    delimiter=',',
    column_names=['question', 'answer'],
    split='train',
    cache_dir='/tmp/cache',
)

Using custom data configuration default-8350c4ed807e1a04
Found cached dataset csv (/tmp/cache/csv/default-8350c4ed807e1a04/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


In [9]:
# Log the dataset summary (features and row count) as a quick sanity check.
logger.info(f'Loaded dataset: {data}')

Loaded dataset: Dataset({
    features: ['question', 'answer'],
    num_rows: 2161
})


#### Create data splits 

In [10]:
# Hold out 10% of the shuffled rows; the fixed seed keeps the split repeatable.
train_validation_test = data.train_test_split(test_size=0.1, shuffle=True, seed=123)
# train_test_split calls its hold-out part 'test'; expose it here as 'validation'.
data_splits = DatasetDict(
    train=train_validation_test['train'],
    validation=train_validation_test['test'],
)
logger.info(f'Data splits: {data_splits}')

Loading cached split indices for dataset at /tmp/cache/csv/default-8350c4ed807e1a04/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-e96185409d9f7b12.arrow and /tmp/cache/csv/default-8350c4ed807e1a04/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-e193bc8043edb0fa.arrow
Data splits: DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 1944
    })
    validation: Dataset({
        features: ['question', 'answer'],
        num_rows: 217
    })
})


#### Set up the out-of-the-box GPT2 tokenizer

Unlike the custom tokenizer built in the previous Medium article "[Easily Build Your Own GPT from Scratch using AWS: A Comprehensive Guide for Domain Adaptation](https://medium.com/@shankar.arunp/easily-build-your-own-gpt-from-scratch-using-aws-51811b6355d3)", this notebook loads the default pre-trained GPT2 tokenizer as-is and only registers the BOS, EOS and PAD special tokens.

In [11]:
# Load the stock pre-trained GPT2 tokenizer and register the prompt's special tokens.
# return_tensors is an argument of the tokenizer *call*, not of from_pretrained,
# where it had no effect on encoding, so it is intentionally not passed here.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2',
                                          bos_token=BOS_TOKEN,
                                          eos_token=EOS_TOKEN,
                                          pad_token=PAD_TOKEN)
tokenizer.padding_side = 'left'  # pad at the start of each sequence
tokenizer.model_max_length = MAX_LEN  # cap sequences at the notebook-wide length
logger.info(f'Tokenizer: {tokenizer}')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Tokenizer: PreTrainedTokenizer(name_or_path='gpt2', vocab_size=50257, model_max_len=512, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|pad|>'})


#### Tokenize data splits 

In [12]:
def tokenize(samples: dict) -> dict:
    """Encode a batch of FAQ rows into fixed-length language-model features.

    Each question/answer pair is rendered as
    ``{BOS_TOKEN}question: <question>{PAD_TOKEN}answer: <answer>{EOS_TOKEN}``
    and encoded with the notebook-level ``tokenizer``, truncated and padded to
    ``MAX_LEN`` tokens.

    Args:
        samples: Column-oriented batch (as passed by ``map(..., batched=True)``)
            holding parallel ``'question'`` and ``'answer'`` sequences.

    Returns:
        A dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``; each
        value is a list with one list of ints per pair. ``labels`` is a copy of
        ``input_ids``.
    """
    questions = samples['question']
    answers = samples['answer']
    logger.info(f'Tokenizing QA pairs of length = {len(questions)}')

    prompts = [f'{BOS_TOKEN}question: {question}{PAD_TOKEN}answer: {answer}{EOS_TOKEN}'
               for question, answer in zip(questions, answers)]
    if not prompts:
        # Nothing to encode; keep the output schema without calling the tokenizer.
        return {'input_ids': [], 'attention_mask': [], 'labels': []}

    # Encode the whole batch in one call. Plain Python lists are returned on
    # purpose: the caller applies set_format('pt') afterwards, so building torch
    # tensors here would only be converted away again when the batch is stored.
    # NOTE(review): the tokenizer truncates on the right, so pairs longer than
    # MAX_LEN lose their trailing EOS_TOKEN.
    encoded = tokenizer(prompts,
                        truncation=True,
                        max_length=MAX_LEN,
                        padding='max_length')

    # NOTE(review): labels mirror input_ids including padding positions; if the
    # training loop does not mask them, loss is also computed on padding - confirm.
    return {'input_ids': encoded['input_ids'],
            'attention_mask': encoded['attention_mask'],
            'labels': [list(ids) for ids in encoded['input_ids']]}

In [13]:
# Tokenize both splits in parallel and drop the raw text columns afterwards.
# (Add load_from_cache_file=False to force re-tokenization instead of reusing the cache.)
tokenized_data = data_splits.map(
    tokenize,
    batched=True,
    remove_columns=['question', 'answer'],
    num_proc=num_proc,
)
# Have the datasets hand back PyTorch tensors for the model inputs.
tokenized_data.set_format(
    'pt',
    columns=['input_ids', 'attention_mask', 'labels'],
    output_all_columns=True,
)
logger.info(f'Tokenized data = {tokenized_data}')

      

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

Tokenizing QA pairs of length = 486
Tokenizing QA pairs of length = 486


 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

Tokenizing QA pairs of length = 486


#3:   0%|          | 0/1 [00:00<?, ?ba/s]

Tokenizing QA pairs of length = 486


       

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

Tokenizing QA pairs of length = 55


#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

Tokenizing QA pairs of length = 54


 

Tokenizing QA pairs of length = 54


#1:   0%|          | 0/1 [00:00<?, ?ba/s]

Tokenizing QA pairs of length = 54
Tokenized data = DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1944
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 217
    })
})


#### Save tokenized data splits to local disk

In [14]:
# Write the tokenized train/validation splits to a local directory.
tokenized_data.save_to_disk('./data/tokenized-oob')

Saving the dataset (0/1 shards):   0%|          | 0/1944 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/217 [00:00<?, ? examples/s]