##### Prerequisites

In [None]:
%%capture

!pip install torch==1.12.1+cu113
!pip install transformers==4.26.1
!pip install datasets==2.9.0
!pip install pandas==1.5.3

#### Imports 

In [3]:
from transformers import AutoTokenizer
from datasets import DatasetDict
from itertools import chain
from tqdm import tqdm
import pandas as pd
import transformers 
import numpy as np
import datasets
import logging
import pandas
import torch

In [4]:
pd.set_option('display.max_colwidth', None)

##### Setup logging

In [5]:
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies 

In [6]:
logger.info(f'[Using transformers version: {transformers.__version__}]')
logger.info(f'[Using datasets version: {datasets.__version__}]')
logger.info(f'[Using pandas version: {pandas.__version__}]')
logger.info(f'[Using torch version: {torch.__version__}]')

[Using transformers version: 4.26.1]
[Using datasets version: 2.9.0]
[Using pandas version: 1.5.3]
[Using torch version: 1.12.1+cu113]


#### Load GPT-Neo tokenizer 

In [7]:
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-125M')
logger.info(tokenizer)

GPT2TokenizerFast(name_or_path='EleutherAI/gpt-neo-125M', vocab_size=50257, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True)})


In [8]:
vocab = tokenizer.get_vocab()
len(vocab)

50257

##### Add special tokens 

In [9]:
special_tokens = {
    'bos_token': '<|startoftext|>',
    'additional_special_tokens': ['<|speaker-1|>', '<|speaker-2|>', '<|pad|>', '<|mask|>']
}

In [10]:
_ = tokenizer.add_special_tokens(special_tokens)
vocab = tokenizer.get_vocab()
len(vocab)

50262

In [11]:
logger.info(tokenizer)

GPT2TokenizerFast(name_or_path='EleutherAI/gpt-neo-125M', vocab_size=50257, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|startoftext|>', 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'additional_special_tokens': ['<|speaker-1|>', '<|speaker-2|>', '<|pad|>', '<|mask|>']})


#### Load dialogues dataset

In [12]:
df = pd.read_csv('./data/dialogues.csv')
df.head()

Unnamed: 0,dialogue
0,i got high once i was home alone. i smoked a lot of weed and i was really paranoid<>i was feeling rather anxious the other day because i smoked a ton of pot
1,i accidentally stepped on someone's toe at the grocery store and they really looked in pain<>of course! i probably apologized 20 times. i felt so guilty.
2,i am so elated. i am going on a vacation tomorrow.<>i am going to north korea.. i can't wait!
3,"i am nina and i have an apartment in new york<>hi nina , my name is rob . i fix roofs for a living .<>i am twenty one years of age and i love roses as my fave flower .<>that is cool you are twenty one . i enjoy drinking beer when i get home from work<>my small black and white cat is just so playful and loves to mess around<>when i was in highschool i was quarterback for the football team .<>i bet you did well in that sport<>yes . i love eating well done steaks so i am big and muscular .<>wow ! steak and good looking i bet . lol<>and i drive a really nice chevy truck ! i have it lifted since i go off roading<>what else do you do for fun ?<>i enjoy running . i compete in a marathon at least once a year .<>oh how far do you run<>most days i run 5 miles in the morning<>some days i walk from 6 7 miles a day<>where do you like to walk ?<>"
4,"hi i am tom and i am eating my favorite thing , pizza<>hi . pizza sounds good . i am nervous , proposing to my girlfriend tonight .<>oh awesome you should take her to an italian restaurant who does not love that<>so true . we already bought a house and she is 6 months pregnant so i think she will accept .<>wow ! ! ! that is so cool , i better keep trying to watch what i eat maybe i will find someone<>you will . we went to school together i got my college diploma just last week .<>congrats i got mines a few years after moving here i am from east asia<>i would love to visit there . never really traveled , guess we wo not now .<>well its never too late to explore<>our house is near her parents , that is what makes me most nervous about things .<>well that means you will have help when the baby arrives<>so true . anything else interesting you would like to share ?<>"


In [13]:
df.count()

dialogue    121703
dtype: int64

In [14]:
dialogues = df['dialogue'].tolist()
SEP = '<>'

In [15]:
dialogue_lengths = []

for dialogue in dialogues:
    dialogue = dialogue.strip()
    dialogue_lengths.append(len(dialogue))

mean_dialogue_len = np.mean(dialogue_lengths)
logger.info(f'Average dialogue length = {int(mean_dialogue_len)}')

Average dialogue length = 252


In [16]:
cleaned_dialogues = []
MAX_LEN = (mean_dialogue_len * 4)
MIN_TURNS = 2

for dialogue in dialogues:
    dialogue = dialogue.strip()
    turns = dialogue.split(SEP)
    if len(dialogue) <= MAX_LEN and len(turns) > MIN_TURNS:
        cleaned_dialogues.append(turns)

#### Generate Token IDs 

In [17]:
token_ids = []

In [18]:
%%time

for dialogue in tqdm(cleaned_dialogues):
    dialogue_ids = []
    for utterance in dialogue:
        tokens = tokenizer.tokenize(utterance)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        dialogue_ids.append(ids)
    token_ids.append(dialogue_ids)

100%|██████████| 19591/19591 [00:15<00:00, 1231.11it/s]

CPU times: user 15.9 s, sys: 109 ms, total: 16 s
Wall time: 15.9 s





#### Generate Token Type IDs and Labels 

In [19]:
bos_id = vocab['<|startoftext|>']
eos_id = vocab['<|endoftext|>']
speaker_1_id = vocab['<|speaker-1|>']
speaker_2_id = vocab['<|speaker-2|>']
mask = vocab['<|mask|>']

In [20]:
dialogues_with_speaker_ids = []

for dialogue in tqdm(token_ids):
    utterances_with_speaker_ids = []
    for i, utterance in enumerate(dialogue):
        if i%2 == 0:
            utterances_with_speaker_ids.append([speaker_1_id] + utterance)  # Speaker 1: User
        else:
            utterances_with_speaker_ids.append([speaker_2_id] + utterance)  # Speaker 2: Bot
            
    dialogues_with_speaker_ids.append(utterances_with_speaker_ids)

100%|██████████| 19591/19591 [00:00<00:00, 44763.62it/s]


In [21]:
input_ids = []
for dialogue in tqdm(dialogues_with_speaker_ids):
    n = len(dialogue)
    for i in range(2, n+1, 2):
        turn = dialogue[:i]
        input_ids.append([bos_id] + list(chain.from_iterable(turn)) + [eos_id])

100%|██████████| 19591/19591 [00:00<00:00, 51444.44it/s]


##### Generate Token Type IDs

In [22]:
token_type_ids = []

for turn in tqdm(input_ids):
    turn_token_type_ids = []
    type_id = speaker_1_id
    for token in turn:
        if token == speaker_1_id:
            type_id = speaker_1_id
            turn_token_type_ids.append(type_id)
        elif token == speaker_2_id:
            type_id = speaker_2_id
            turn_token_type_ids.append(type_id)
        else:
            turn_token_type_ids.append(type_id) 
            
    token_type_ids.append(turn_token_type_ids)

100%|██████████| 103524/103524 [00:02<00:00, 48638.77it/s]


##### Generate Labels 

In [23]:
def mask_except_reply(turn, speaker_2_id):
    last_index = -1
    for i in range(len(turn) - 1, -1, -1):
        if turn[i] == speaker_2_id:
            last_index = i
            break
    for i in range(last_index):
        turn[i] = mask
    return turn

In [24]:
labels = []

for turn in tqdm(input_ids):
    turn_labels = mask_except_reply(turn, speaker_2_id)
    labels.append(turn_labels)

100%|██████████| 103524/103524 [00:00<00:00, 246525.83it/s]


#### Pad data

In [25]:
input_ids_tensor = []

for input_id_turn in input_ids:
    input_ids_tensor.append(torch.LongTensor(input_id_turn))
input_ids_tensor = torch.nn.utils.rnn.pad_sequence(input_ids_tensor, 
                                                   batch_first=True, 
                                                   padding_value=eos_id)

In [26]:
token_type_ids_tensor = []

for token_type_id_turn in token_type_ids:
    token_type_ids_tensor.append(torch.LongTensor(token_type_id_turn))
token_type_ids_tensor = torch.nn.utils.rnn.pad_sequence(token_type_ids_tensor, 
                                                        batch_first=True, 
                                                        padding_value=eos_id)

In [27]:
labels_tensor = []

for label_turn in labels:
    labels_tensor.append(torch.LongTensor(label_turn))
labels_tensor = torch.nn.utils.rnn.pad_sequence(labels_tensor, 
                                                batch_first=True, 
                                                padding_value=mask)

In [28]:
logger.info(f'Input IDs shape: {input_ids_tensor.shape}')
logger.info(f'Token Type IDs shape: {token_type_ids_tensor.shape}')
logger.info(f'Labels shape: {labels_tensor.shape}')

Input IDs shape: torch.Size([103524, 282])
Token Type IDs shape: torch.Size([103524, 282])
Labels shape: torch.Size([103524, 282])


#### Create HuggingFace dataset and split data

In [29]:
%%time

data_dict = {'input_ids': input_ids_tensor, 
             'token_type_ids': token_type_ids_tensor, 
             'labels': labels_tensor}
hf_dataset = datasets.Dataset.from_dict(data_dict)
logger.info(f'HF dataset: {hf_dataset}')

HF dataset: Dataset({
    features: ['input_ids', 'token_type_ids', 'labels'],
    num_rows: 103524
})


CPU times: user 3.01 s, sys: 1.35 s, total: 4.36 s
Wall time: 4.36 s


##### Split dataset 

In [30]:
train_validation_test = hf_dataset.train_test_split(shuffle=True, 
                                                    seed=123, 
                                                    test_size=0.1)
data_splits = DatasetDict({'train': train_validation_test['train'],  
                           'validation': train_validation_test['test']})
logger.info(f'Data splits: {data_splits}')

Data splits: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'labels'],
        num_rows: 93171
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'labels'],
        num_rows: 10353
    })
})


##### Save data splits to disk

In [31]:
data_splits.save_to_disk('./data/tokenized')

Flattening the indices: 100%|██████████| 94/94 [00:05<00:00, 16.11ba/s]
Flattening the indices: 100%|██████████| 11/11 [00:00<00:00, 19.04ba/s]                         
                                                                                                