## Data Preparation

##### Prerequisites

In [None]:
%%capture

!pip install --upgrade jupyter
!pip install --upgrade ipywidgets
!jupyter nbextension enable --py widgetsnbextension

In [None]:
%%capture

!pip install transformers==4.18.0
!pip install datasets==2.9.0
!pip install pandas==1.4.1
!pip install numpy==1.22.2
!pip install torch==1.8.1

#### Imports 

In [4]:
from torch.utils.data import Dataset
from transformers import set_seed
from datasets import load_dataset
from datasets import DatasetDict

from transformers import GPT2Tokenizer
from transformers import GPT2TokenizerFast

import pandas as pd
import numpy as np
import transformers
import datasets 
import logging
import torch
import os

##### Set up logging

In [5]:
# Notebook-wide logger that emits DEBUG and above to the default stream.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same logger object on every call, so
# re-running this cell used to stack one more StreamHandler each time and
# every message was printed once per handler. Attach a stream handler only
# when a plain StreamHandler is not attached yet.
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies 

In [6]:
# Collect one message per dependency, then emit them in order so the
# installed versions are recorded next to the results of this run.
version_messages = [
    f'[Using transformers version: {transformers.__version__}]',
    f'[Using datasets version: {datasets.__version__}]',
    f'[Using torch version: {torch.__version__}]',
    f'[Using pandas version: {pd.__version__}]',
    f'[Using numpy version: {np.__version__}]',
]
for version_message in version_messages:
    logger.info(version_message)

[Using transformers version: 4.18.0]
[Using datasets version: 2.9.0]
[Using torch version: 1.8.1+cu102]
[Using pandas version: 1.4.1]
[Using numpy version: 1.22.2]


#### Set up essentials

In [7]:
# Fix the random seeds up front (NumPy directly, plus the set_seed helper
# imported from transformers) so repeated runs use the same seed.
np.random.seed(123)
set_seed(123)


# Divisor for the CPU cores below - presumably the number of GPUs that
# share this machine's cores; confirm against the training setup.
N_GPUS = 1
# Worker processes for dataset preprocessing: CPU cores split evenly by
# N_GPUS. os.cpu_count() returns None when the count cannot be determined,
# and the division rounds down to 0 when there are fewer cores than
# N_GPUS, so fall back to a single worker in both cases.
num_proc = max(1, (os.cpu_count() or 1) // N_GPUS)
num_proc

4

#### Load FAQ dataset 

In [8]:
# Read the FAQ CSV as a single 'train' split. The two columns are named
# explicitly via column_names; downloaded/prepared files are cached under
# /tmp/cache.
data = load_dataset(
    'csv',
    data_files='./data/faq_train.csv',
    column_names=['question', 'answer'],
    delimiter=',',
    split='train',
    cache_dir='/tmp/cache',
)

Using custom data configuration default-bf258d3d4adb68a9
Found cached dataset csv (/tmp/cache/csv/default-bf258d3d4adb68a9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


In [9]:
# Log the dataset summary (features and row count) as a sanity check.
loaded_dataset_message = f'Loaded dataset: {data}'
logger.info(loaded_dataset_message)

Loaded dataset: Dataset({
    features: ['question', 'answer'],
    num_rows: 6787
})


#### Create data splits 

In [10]:
# Hold out 10% of the rows with a fixed-seed shuffle. train_test_split
# names the held-out part 'test'; it is exposed as 'validation' below.
train_validation_test = data.train_test_split(test_size=0.1, shuffle=True, seed=123)
data_splits = DatasetDict(
    train=train_validation_test['train'],
    validation=train_validation_test['test'],
)
data_splits_message = f'Data splits: {data_splits}'
logger.info(data_splits_message)

Loading cached split indices for dataset at /tmp/cache/csv/default-bf258d3d4adb68a9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-d8a4dfe1e0f06e33.arrow and /tmp/cache/csv/default-bf258d3d4adb68a9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-4346dc9584e850e1.arrow
Data splits: DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 6108
    })
    validation: Dataset({
        features: ['question', 'answer'],
        num_rows: 679
    })
})


#### Set up the custom GPT-2 tokenizer

Re-create the custom tokenizer that was built in the previous Medium article, "[Easily Build Your Own GPT from Scratch using AWS: A Comprehensive Guide for Domain Adaptation](https://medium.com/@shankar.arunp/easily-build-your-own-gpt-from-scratch-using-aws-51811b6355d3)".

In [11]:
# Load the custom vocabulary from ./vocab and register the start, end and
# pad markers that the prompt template relies on.
tokenizer = GPT2Tokenizer.from_pretrained(
    './vocab',
    bos_token='<|startoftext|>',
    eos_token='<|endoftext|>',
    pad_token='<|pad|>',
)
# Cap sequences at 512 tokens and add padding on the left-hand side.
tokenizer.model_max_length = 512
tokenizer.padding_side = 'left'
logger.info(f'Tokenizer: {tokenizer}')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Tokenizer: PreTrainedTokenizer(name_or_path='./vocab', vocab_size=50257, model_max_len=512, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|pad|>'})


#### Tokenize data splits 

In [12]:
def tokenize(samples: dict) -> dict:
    """Turn a batch of question/answer pairs into language-model features.

    Each pair is rendered as
    ``<|startoftext|>question: {question}<|pad|>answer: {answer}<|endoftext|>``
    and encoded with the notebook-level ``tokenizer``, truncated and padded
    to exactly 512 tokens.

    Args:
        samples: Batch mapping with parallel ``'question'`` and ``'answer'``
            sequences (what ``map(..., batched=True)`` passes in).

    Returns:
        A dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``,
        each holding one list per pair. ``labels`` equals ``input_ids``
        except that positions the attention mask marks as padding are set
        to -100, so the loss skips them. The ``<|pad|>`` separator inside
        the prompt is attended to and therefore keeps its real label.

    NOTE(review): the tokenizer printout shows ``truncation_side='right'``,
    so pairs longer than 512 tokens presumably lose the trailing
    ``<|endoftext|>`` - confirm that this is acceptable.
    """
    ignore_index = -100  # label value the loss ignores

    questions = samples['question']
    answers = samples['answer']
    logger.info(f'Tokenizing QA pairs of length = {len(questions)}')

    input_ids = []
    attention_mask = []
    labels = []

    for question, answer in zip(questions, answers):
        prompted_input = f'<|startoftext|>question: {question}<|pad|>answer: {answer}<|endoftext|>'
        tokenized_input = tokenizer(prompted_input,
                                    truncation=True,
                                    max_length=512,
                                    padding='max_length')
        token_ids = tokenized_input['input_ids']
        token_mask = tokenized_input['attention_mask']
        input_ids.append(token_ids)
        attention_mask.append(token_mask)
        # Previously labels was a plain copy of input_ids, which made the
        # model learn to predict the left padding. Mask those positions.
        labels.append([token_id if attended else ignore_index
                       for token_id, attended in zip(token_ids, token_mask)])
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

In [13]:
# Tokenize both splits in batches across num_proc worker processes and
# drop the raw text columns so only the model features remain.
tokenized_data = data_splits.map(
    tokenize,
    batched=True,
    remove_columns=['question', 'answer'],
    num_proc=num_proc,
)
tokenized_summary = f'Tokenized data = {tokenized_data}'
logger.info(tokenized_summary)

 

Loading cached processed dataset at /tmp/cache/csv/default-bf258d3d4adb68a9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-68a378a2ec2bb870.arrow


 

Loading cached processed dataset at /tmp/cache/csv/default-bf258d3d4adb68a9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-4997047fdd268497.arrow


 

Loading cached processed dataset at /tmp/cache/csv/default-bf258d3d4adb68a9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-39ed3ccb0d8982c8.arrow


 

Loading cached processed dataset at /tmp/cache/csv/default-bf258d3d4adb68a9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-7cb657e394cef021.arrow


 

Loading cached processed dataset at /tmp/cache/csv/default-bf258d3d4adb68a9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-f6cccc1bba294e26.arrow


 

Loading cached processed dataset at /tmp/cache/csv/default-bf258d3d4adb68a9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-984de698eae1c444.arrow


 

Loading cached processed dataset at /tmp/cache/csv/default-bf258d3d4adb68a9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-24afff6aeb609281.arrow


 

Loading cached processed dataset at /tmp/cache/csv/default-bf258d3d4adb68a9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-68457a2c2b016864.arrow
Tokenized data = DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6108
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 679
    })
})


#### Save tokenized data splits to local disk

In [16]:
# Persist the tokenized splits under the local data directory.
tokenized_output_dir = './data'
tokenized_data.save_to_disk(tokenized_output_dir)

Saving the dataset (0/1 shards):   0%|          | 0/6108 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/679 [00:00<?, ? examples/s]