##### Prerequisites

In [None]:
%%capture 

!pip install torch==1.12.1+cu113
!pip install transformers==4.26.1
!pip install datasets==2.9.0
!pip install pandas==1.5.3

#### Imports 

In [3]:
from transformers import AutoTokenizer
from datasets import DatasetDict
from itertools import chain
from tqdm import tqdm
import pandas as pd
import transformers 
import datasets
import logging
import pandas
import torch

In [4]:
pd.set_option('display.max_colwidth', None)

##### Setup logging

In [5]:
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies 

In [6]:
logger.info(f'[Using transformers version: {transformers.__version__}]')
logger.info(f'[Using datasets version: {datasets.__version__}]')
logger.info(f'[Using pandas version: {pandas.__version__}]')
logger.info(f'[Using torch version: {torch.__version__}]')

[Using transformers version: 4.26.1]
[Using datasets version: 2.9.0]
[Using pandas version: 1.5.3]
[Using torch version: 1.12.1+cu113]


#### Load GPT-Neo tokenizer 

In [7]:
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-125M')
logger.info(tokenizer)

GPT2TokenizerFast(name_or_path='EleutherAI/gpt-neo-125M', vocab_size=50257, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True)})


In [8]:
vocab = tokenizer.get_vocab()
len(vocab)

50257

##### Add special tokens 

In [9]:
special_tokens = {
    'bos_token': '<|startoftext|>',
    'additional_special_tokens': ['<|speaker-1|>', '<|speaker-2|>', '<|pad|>', '<|mask|>']
}

In [10]:
_ = tokenizer.add_special_tokens(special_tokens)
vocab = tokenizer.get_vocab()
len(vocab)

50262

In [11]:
logger.info(tokenizer)

GPT2TokenizerFast(name_or_path='EleutherAI/gpt-neo-125M', vocab_size=50257, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|startoftext|>', 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'additional_special_tokens': ['<|speaker-1|>', '<|speaker-2|>', '<|pad|>', '<|mask|>']})


#### Load dialogues dataset

In [12]:
df = pd.read_csv('./data/dialogues.csv')
df.head()

Unnamed: 0,dialogue
0,"hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .<>you must be very fast . hunting is one of my favorite hobbies .<>i am ! for my hobby i like to do canning or some whittling .<>i also remodel homes when i am not out bow hunting .<>that is neat . when i was in high school i placed 6th in 100m dash !<>that is awesome . do you have a favorite season or time of year ?<>i do not . but i do have a favorite meat since that is all i eat exclusively .<>what is your favorite meat to eat ?<>i would have to say its prime rib . do you have any favorite foods ?<>i like chicken or macaroni and cheese .<>do you have anything planned for today ? i think i am going to do some canning .<>i am going to watch football . what are you canning ?<>i think i will can some jam . do you also play footfall for fun ?<>if i have time outside of hunting and remodeling homes . which is not much !<>"
1,"hi , how are you doing today ?<>i am spending time with my 4 sisters what are you up to<>wow , four sisters . just watching game of thrones .<>that is a good show i watch that while drinking iced tea<>i agree . what do you do for a living ?<>i am a researcher i am researching the fact that mermaids are real<>interesting . i am a website designer . pretty much spend all my time on the computer .<>that is cool my mom does the same thing<>that is awesome . i have always had a love for technology .<>tell me more about yourself<>i really enjoy free diving , how about you , have any hobbies ?<>i enjoy hanging with my mother she is my best friend<>that is nice . moms are pretty cool too .<>i am also fascinated with mermaids<>"
2,"we all live in a yellow submarine , a yellow submarine . morning !<>hi ! that is a great line for my next stand up .<>lol . i am shy , anything to break the ice , and i am a beatles fan .<>i can tell . i am not , you can see me in some tv shows<>really ? what shows ? i like tv , it makes me forget i do not like my family<>wow , i wish i had a big family . i grew up in a very small town .<>i did too . i do not get along with mine . they have no class .<>just drink some cola with rum and you will forget about them !<>put the lime in the coconut as well . . .<>nah , plain cuba libre , that is what we drank yesterday at the theater .<>i prefer mojitos . watermelon or cucumber .<>those are really yummy too , but not my favorite .<>"
3,"hi ! i work as a gourmet cook .<>i do not like carrots . i throw them away .<>really . but , i can sing pitch perfect .<>i also cook , and i ride my bike to work .<>great ! i had won an award for spelling bee .<>my contacts can see through what you are trying to sell me .<>okay but i was published in new yorker once<>you better not make any spelling mistakes .<>i have not . i can cook any word you want me to<>what is your ethnicity ? i am white , and my hair is brown .<>i am asian and have no hair .<>i love hairless asians . do you like carrots ?<>i love carrots . i eat carrots like a horse .<>are you male or female ?<>i work as a gourmet cook who also has a pitch perfect voice .<>i doubt that very much . you probably like to scream alone .<>"
4,"how are you doing today<>what do you do for career ? i have a ton of hobbies if you are interested !<>i like to watch kids<>i actually play guitar and do a lot of manly things , like welding .<>what do you weld ? houses ?<>everything ! i am actually manly . but i have a secret i am hiding .<>what is your secret that you have<>my parents do not know that i am . . . homosexual .<>how does that feel for you<>makes me secure with my manly hobby skills .<>i bet that it does<>anyway . what do you do ?<>i watch kids for a living<>that is awesome . do you like it ?<>"


In [13]:
df.count()

dialogue    8939
dtype: int64

In [14]:
dialogues = df['dialogue'].tolist()
SEP = '<>'

In [15]:
cleaned_dialogues = []

for dialogue in dialogues:
    dialogue = dialogue.strip()
    turns = dialogue.split(SEP)
    cleaned_dialogues.append(turns)

#### Generate Token IDs 

In [16]:
token_ids = []

In [17]:
%%time

for dialogue in tqdm(cleaned_dialogues):
    dialogue_ids = []
    for utterance in dialogue:
        tokens = tokenizer.tokenize(utterance)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        dialogue_ids.append(ids)
    token_ids.append(dialogue_ids)

100%|██████████| 8939/8939 [00:10<00:00, 858.97it/s]

CPU times: user 10.4 s, sys: 88.7 ms, total: 10.4 s
Wall time: 10.4 s





#### Generate Token Type IDs and Labels 

In [18]:
bos_id = vocab['<|startoftext|>']
eos_id = vocab['<|endoftext|>']
speaker_1_id = vocab['<|speaker-1|>']
speaker_2_id = vocab['<|speaker-2|>']
mask = vocab['<|mask|>']

In [19]:
dialogues_with_speaker_ids = []

for dialogue in tqdm(token_ids):
    utterances_with_speaker_ids = []
    for i, utterance in enumerate(dialogue):
        if i%2 == 0:
            utterances_with_speaker_ids.append([speaker_1_id] + utterance)  # Speaker 1: User
        else:
            utterances_with_speaker_ids.append([speaker_2_id] + utterance)  # Speaker 2: Bot
            
    dialogues_with_speaker_ids.append(utterances_with_speaker_ids)

100%|██████████| 8939/8939 [00:00<00:00, 41975.96it/s]


In [20]:
input_ids = []
for dialogue in tqdm(dialogues_with_speaker_ids):
    n = len(dialogue)
    for i in range(2, n+1, 2):
        turn = dialogue[:i]
        input_ids.append([bos_id] + list(chain.from_iterable(turn)) + [eos_id])

100%|██████████| 8939/8939 [00:00<00:00, 19940.22it/s]


##### Generate Token Type IDs

In [21]:
token_type_ids = []

for turn in tqdm(input_ids):
    turn_token_type_ids = []
    type_id = speaker_1_id
    for token in turn:
        if token == speaker_1_id:
            type_id = speaker_1_id
            turn_token_type_ids.append(type_id)
        elif token == speaker_2_id:
            type_id = speaker_2_id
            turn_token_type_ids.append(type_id)
        else:
            turn_token_type_ids.append(type_id) 
            
    token_type_ids.append(turn_token_type_ids)

100%|██████████| 65719/65719 [00:01<00:00, 44884.18it/s]


##### Generate Labels 

In [22]:
def mask_except_reply(turn, speaker_2_id):
    last_index = -1
    for i in range(len(turn) - 1, -1, -1):
        if turn[i] == speaker_2_id:
            last_index = i
            break
    for i in range(last_index):
        turn[i] = mask
    return turn

In [23]:
labels = []

for turn in tqdm(input_ids):
    turn_labels = mask_except_reply(turn, speaker_2_id)
    labels.append(turn_labels)

100%|██████████| 65719/65719 [00:00<00:00, 206130.45it/s]


#### Pad data

In [24]:
input_ids_tensor = []

for input_id_turn in input_ids:
    input_ids_tensor.append(torch.LongTensor(input_id_turn))
input_ids_tensor = torch.nn.utils.rnn.pad_sequence(input_ids_tensor, batch_first=True, padding_value=eos_id)

In [25]:
token_type_ids_tensor = []

for token_type_id_turn in token_type_ids:
    token_type_ids_tensor.append(torch.LongTensor(token_type_id_turn))
token_type_ids_tensor = torch.nn.utils.rnn.pad_sequence(token_type_ids_tensor, batch_first=True, padding_value=eos_id)

In [26]:
labels_tensor = []

for label_turn in labels:
    labels_tensor.append(torch.LongTensor(label_turn))
labels_tensor = torch.nn.utils.rnn.pad_sequence(labels_tensor, batch_first=True, padding_value=mask)

In [27]:
logger.info(f'Input IDs shape: {input_ids_tensor.shape}')
logger.info(f'Token Type IDs shape: {token_type_ids_tensor.shape}')
logger.info(f'Labels shape: {labels_tensor.shape}')

Input IDs shape: torch.Size([65719, 557])
Token Type IDs shape: torch.Size([65719, 557])
Labels shape: torch.Size([65719, 557])


#### Create HuggingFace dataset and split data

In [28]:
data_dict = {'input_ids': input_ids_tensor, 
             'token_type_ids': token_type_ids_tensor, 
             'labels': labels_tensor}
hf_dataset = datasets.Dataset.from_dict(data_dict)
logger.info(f'HF dataset: {hf_dataset}')

HF dataset: Dataset({
    features: ['input_ids', 'token_type_ids', 'labels'],
    num_rows: 65719
})


##### Split dataset 

In [29]:
train_validation_test = hf_dataset.train_test_split(shuffle=True, seed=123, test_size=0.1)
data_splits = DatasetDict({'train': train_validation_test['train'],  
                           'validation': train_validation_test['test']})
logger.info(f'Data splits: {data_splits}')

Data splits: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'labels'],
        num_rows: 59147
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'labels'],
        num_rows: 6572
    })
})


##### Save data splits to disk

In [30]:
data_splits.save_to_disk('./data/tokenized')

Flattening the indices: 100%|██████████| 60/60 [00:04<00:00, 13.16ba/s]
Flattening the indices: 100%|██████████| 7/7 [00:00<00:00, 13.95ba/s]                           
                                                                                              