##### Prerequisites

In [None]:
%%capture 

!pip install torch==1.12.1+cu113
!pip install transformers==4.26.1
!pip install datasets==2.9.0
!pip install pandas==1.5.3

#### Imports 

In [3]:
from transformers import AutoTokenizer
from datasets import DatasetDict
from itertools import chain
from tqdm import tqdm
import pandas as pd
import transformers 
import datasets
import logging
import pandas
import torch

In [4]:
# Disable column-width truncation so long dialogue strings are displayed in full.
pd.set_option('display.max_colwidth', None)

##### Setup logging

In [5]:
# Notebook-wide logger that emits DEBUG and above to the cell output.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same object on every call, so re-running this
# cell would otherwise stack another StreamHandler and print each message twice.
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies 

In [6]:
# Record the library versions this run actually used, one log line per package.
version_messages = (
    f'[Using transformers version: {transformers.__version__}]',
    f'[Using datasets version: {datasets.__version__}]',
    f'[Using pandas version: {pandas.__version__}]',
    f'[Using torch version: {torch.__version__}]',
)
for version_message in version_messages:
    logger.info(version_message)

[Using transformers version: 4.26.1]
[Using datasets version: 2.9.0]
[Using pandas version: 1.5.3]
[Using torch version: 1.12.1+cu113]


#### Load GPT-Neo tokenizer 

In [7]:
# Fetch the tokenizer that ships with the GPT-Neo 125M checkpoint and log its configuration.
pretrained_name = 'EleutherAI/gpt-neo-125M'
tokenizer = AutoTokenizer.from_pretrained(pretrained_name)
logger.info(tokenizer)

Downloading (…)okenizer_config.json: 100%|██████████| 560/560 [00:00<00:00, 78.9kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.01k/1.01k [00:00<00:00, 143kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 83.0MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 52.8MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 357/357 [00:00<00:00, 159kB/s]
GPT2TokenizerFast(name_or_path='EleutherAI/gpt-neo-125M', vocab_size=50257, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True)})


In [8]:
# Snapshot the token -> id mapping and show its size before any tokens are added.
vocab = tokenizer.get_vocab()
len(vocab)

50257

##### Add special tokens 

In [9]:
# Extra markers used by the dialogue format: two speaker tags plus pad and mask symbols.
additional_markers = ['<|speaker-1|>', '<|speaker-2|>', '<|pad|>', '<|mask|>']

# A dedicated start-of-text token is supplied as the tokenizer's BOS token.
special_tokens = {
    'bos_token': '<|startoftext|>',
    'additional_special_tokens': additional_markers,
}

In [10]:
# Register the new special tokens (the returned count of added tokens is discarded),
# then refresh the vocab snapshot so the ids of the new tokens can be looked up later.
_ = tokenizer.add_special_tokens(special_tokens)
vocab = tokenizer.get_vocab()
len(vocab)

50262

In [11]:
# Log the tokenizer again to confirm that the special tokens were registered.
logger.info(tokenizer)

GPT2TokenizerFast(name_or_path='EleutherAI/gpt-neo-125M', vocab_size=50257, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|startoftext|>', 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'additional_special_tokens': ['<|speaker-1|>', '<|speaker-2|>', '<|pad|>', '<|mask|>']})


#### Load dialogues dataset

In [12]:
# Load the raw dialogues; the 'dialogue' column is the one used downstream.
df = pd.read_csv('./data/dialogues.csv')
df.head()

Unnamed: 0,dialogue
0,"Can we make this quick? Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad. Again.<>Well, I thought we'd start with pronunciation, if that's okay with you.<>Not the hacking and gagging and spitting part. Please.<>Okay... then how 'bout we try out some French cuisine. Saturday? Night?"
1,You're asking me out. That's so cute. What's your name again?<>Forget it.
2,"No, no, it's my fault -- we didn't have a proper introduction ---<>Cameron.<>The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser. My sister. I can't date until she does.<>Seems like she could get a date easy enough..."
3,"Why?<>Unsolved mystery. She used to be really popular when she started high school, then it was just like she got sick of it or something.<>That's a shame."
4,"Gosh, if only we could find Kat a boyfriend...<>Let me see what I can do."


In [13]:
# Number of non-null values per column.
df.count()

dialogue    83097
dtype: int64

In [14]:
# Plain Python list of the raw dialogue strings.
dialogues = df['dialogue'].tolist()
SEP = '<>'  # delimiter between consecutive turns inside one dialogue string

In [15]:
# Trim surrounding whitespace from each dialogue string, then break it into its
# turns on the SEP delimiter. Result: one list of turn strings per dialogue.
cleaned_dialogues = [raw_dialogue.strip().split(SEP) for raw_dialogue in dialogues]

#### Generate Token IDs 

In [16]:
# Accumulator: one entry per dialogue, each a list of per-utterance token-id lists.
token_ids = []

In [17]:
%%time

for dialogue in tqdm(cleaned_dialogues):
    dialogue_ids = []
    for utterance in dialogue:
        tokens = tokenizer.tokenize(utterance)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        dialogue_ids.append(ids)
    token_ids.append(dialogue_ids)

100%|██████████| 83097/83097 [00:25<00:00, 3304.30it/s]

CPU times: user 25 s, sys: 168 ms, total: 25.2 s
Wall time: 25.2 s





#### Generate Token Type IDs and Labels 

In [18]:
# Look up the ids of the special tokens in the refreshed vocab.
bos_id = vocab['<|startoftext|>']
eos_id = vocab['<|endoftext|>']
speaker_1_id = vocab['<|speaker-1|>']
speaker_2_id = vocab['<|speaker-2|>']
mask = vocab['<|mask|>']  # id written over masked label positions and used as label padding

In [19]:
dialogues_with_speaker_ids = []

for dialogue in tqdm(token_ids):
    # Turns alternate between the two speakers: even positions are speaker 1 (user),
    # odd positions are speaker 2 (bot). The speaker id is prepended to each utterance.
    tagged_utterances = [
        [speaker_1_id if position % 2 == 0 else speaker_2_id] + utterance
        for position, utterance in enumerate(dialogue)
    ]
    dialogues_with_speaker_ids.append(tagged_utterances)

100%|██████████| 83097/83097 [00:00<00:00, 129592.49it/s]


In [20]:
# Build one training example per even-length prefix of a dialogue (2, 4, ... utterances),
# flattened into a single id sequence wrapped in BOS ... EOS.
input_ids = []
for dialogue in tqdm(dialogues_with_speaker_ids):
    for prefix_length in range(2, len(dialogue) + 1, 2):
        flattened_prefix = list(chain.from_iterable(dialogue[:prefix_length]))
        input_ids.append([bos_id, *flattened_prefix, eos_id])

100%|██████████| 83097/83097 [00:00<00:00, 121147.62it/s]


##### Generate Token Type IDs

In [21]:
token_type_ids = []

for turn in tqdm(input_ids):
    # Every position carries the id of the most recently seen speaker marker;
    # positions before the first marker default to speaker 1.
    current_speaker = speaker_1_id
    per_token_types = []
    for token in turn:
        if token in (speaker_1_id, speaker_2_id):
            current_speaker = token
        per_token_types.append(current_speaker)

    token_type_ids.append(per_token_types)

100%|██████████| 138135/138135 [00:01<00:00, 90155.47it/s]


##### Generate Labels 

In [22]:
def mask_except_reply(turn, speaker_2_id, mask_id=None):
    """Return a copy of ``turn`` in which everything before the final reply is masked.

    The final reply starts at the last occurrence of ``speaker_2_id``. All positions
    before that marker are replaced by the mask id; the marker and everything after
    it are kept unchanged.

    The input sequence is NOT modified. Masking in place would also overwrite the
    caller's list, so input ids and labels would end up being the same masked data.

    Args:
        turn: Sequence of token ids for one example.
        speaker_2_id: Token id that marks the start of a speaker-2 utterance.
        mask_id: Id written over masked positions. Defaults to the notebook-level
            ``mask`` variable when omitted.

    Returns:
        A new list of token ids. If ``speaker_2_id`` does not occur, or occurs at
        position 0, the result is an unmasked copy of ``turn``.
    """
    if mask_id is None:
        mask_id = mask  # notebook-level id of the '<|mask|>' token

    masked_turn = list(turn)  # work on a copy so the caller's sequence stays intact

    # Scan from the end to find where the last speaker-2 utterance begins.
    reply_start = -1
    for index in range(len(masked_turn) - 1, -1, -1):
        if masked_turn[index] == speaker_2_id:
            reply_start = index
            break

    if reply_start > 0:
        masked_turn[:reply_start] = [mask_id] * reply_start
    return masked_turn

In [23]:
labels = []

for turn in tqdm(input_ids):
    # Pass a copy so that masking can never alter input_ids: the unmasked
    # sequences are still needed when the input tensor is built afterwards.
    turn_labels = mask_except_reply(list(turn), speaker_2_id)
    labels.append(turn_labels)

100%|██████████| 138135/138135 [00:00<00:00, 318535.03it/s]


#### Pad data

In [24]:
# Convert each example to a LongTensor and right-pad all of them to the longest
# example, using the EOS id as the filler value.
input_ids_tensor = torch.nn.utils.rnn.pad_sequence(
    [torch.LongTensor(sequence) for sequence in input_ids],
    batch_first=True,
    padding_value=eos_id,
)

In [25]:
# Same padding scheme for the token type ids: one LongTensor per example,
# right-padded to a common length with the EOS id.
token_type_ids_tensor = torch.nn.utils.rnn.pad_sequence(
    [torch.LongTensor(sequence) for sequence in token_type_ids],
    batch_first=True,
    padding_value=eos_id,
)

In [26]:
# Labels are right-padded with the mask id, so padded positions carry the same
# value as the masked context positions.
labels_tensor = torch.nn.utils.rnn.pad_sequence(
    [torch.LongTensor(sequence) for sequence in labels],
    batch_first=True,
    padding_value=mask,
)

In [27]:
# Sanity check: the three padded tensors are expected to share one shape.
shape_messages = (
    f'Input IDs shape: {input_ids_tensor.shape}',
    f'Token Type IDs shape: {token_type_ids_tensor.shape}',
    f'Labels shape: {labels_tensor.shape}',
)
for shape_message in shape_messages:
    logger.info(shape_message)

Input IDs shape: torch.Size([138135, 1579])
Token Type IDs shape: torch.Size([138135, 1579])
Labels shape: torch.Size([138135, 1579])


#### Create HuggingFace dataset and split data

In [28]:
# Bundle the three padded tensors into a HuggingFace Dataset, one column per tensor.
data_dict = {'input_ids': input_ids_tensor, 
             'token_type_ids': token_type_ids_tensor, 
             'labels': labels_tensor}
hf_dataset = datasets.Dataset.from_dict(data_dict)
logger.info(f'HF dataset: {hf_dataset}')

HF dataset: Dataset({
    features: ['input_ids', 'token_type_ids', 'labels'],
    num_rows: 138135
})


##### Split dataset 

In [29]:
# Shuffle with a fixed seed and hold out 10% of the examples. The held-out 'test'
# part is exposed under the name 'validation'; no separate test split is created here.
train_validation_test = hf_dataset.train_test_split(shuffle=True, seed=123, test_size=0.1)
data_splits = DatasetDict({'train': train_validation_test['train'],  
                           'validation': train_validation_test['test']})
logger.info(f'Data splits: {data_splits}')

Data splits: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'labels'],
        num_rows: 124321
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'labels'],
        num_rows: 13814
    })
})


##### Save data splits to disk

In [30]:
# Write both splits to ./data/tokenized.
data_splits.save_to_disk('./data/tokenized')

Flattening the indices: 100%|██████████| 125/125 [00:12<00:00, 10.32ba/s]
Flattening the indices: 100%|██████████| 14/14 [00:01<00:00,  9.95ba/s]                          
                                                                                                