## Tokenizer with bert

In [1]:
import numpy as np
import random
import torch
import torch.nn  as nn
from torchtext import data

from transformers import BertTokenizer

### set random seed

In [2]:
# random config
SEED = 2021

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

### load transformer pretrained model

In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
len(tokenizer.vocab)

30522

### tokenize test

tokenize and lower case the data in a way that is consistent with the pre-trained transformer model

In [5]:
demo_text = 'Hello WORLD ARE yoU OK?'
tokens = tokenizer.tokenize(demo_text)
print(tokens)

['hello', 'world', 'are', 'you', 'ok', '?']


###  numericalize tokens

In [6]:
token_indices = tokenizer.convert_tokens_to_ids(tokens)
print(token_indices)

[7592, 2088, 2024, 2017, 7929, 1029]


**Note** the tokenizer does have a beginning of sequence and end of sequence attributes (bos_token and eos_token) but these are not set and should not be used for this transformer.

In [7]:
bos_token = tokenizer.cls_token # classifier token which is used when doing sequence classification
eos_token = tokenizer.sep_token # The separator token, which is used when building a sequence from multiple sequences
pad_token = tokenizer.pad_token # The token used for padding
unk_token = tokenizer.unk_token # The unknown token

print(bos_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


### get the indices of the special tokens 

In [8]:
# by converting them using the vocabulary
bos_token_idx = tokenizer.convert_tokens_to_ids(bos_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)
print(bos_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

# explicit getting inddex with from tokenizer
bos_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id
print(bos_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100
101 102 0 100


### defined maximum length

In [9]:
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']
print(max_input_length)

512


### tokenizer and cut

*Note* that our maximum length is 2 less than the actual maximum length. This is because we need to append two tokens to each sequence, one to the start and one to the end.

In [10]:
def tokenize_with_cut(sentence):
    tokens = tokenizer.tokenize(sentence) 
    tokens = tokens[:max_input_length-2]
    
    return tokens

### Define fields

In [11]:
TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = tokenize_with_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = bos_token_idx,
                  eos_token = eos_token_idx,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)

LABEL = data.LabelField(dtype = torch.float)



### tokenized 

In [12]:
from torchtext import data, datasets

# -----------------get train, val and test data--------------------
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL, root='../Dataset/IMDB')

print(train_data.fileds)
print(train_data.examples[0])

train_data, val_data = train_data.split(random_state = random.seed(SEED))

print('Number of train data {}'.format(len(train_data)))
print('Number of val data {}'.format(len(val_data)))
print('Number of val data {}'.format(len(test_data)))



<generator object Dataset.__getattr__ at 0x7feb6d6a37d8>
<torchtext.data.example.Example object at 0x7feb6cfc00f0>
Number of train data 17500
Number of val data 7500
Number of val data 25000


### observing data

check an example and ensure that the text has already been numericalized

In [13]:
print(vars(train_data.examples[0]))

{'text': [2025, 2172, 2000, 2360, 2006, 2023, 2028, 1012, 1037, 5436, 2017, 2064, 3492, 2172, 25039, 1010, 1999, 1996, 2034, 2184, 2781, 1012, 2498, 15241, 3308, 2007, 2023, 2143, 1010, 2200, 2210, 2895, 2005, 2019, 2895, 2143, 1012, 2045, 2001, 1037, 3382, 2000, 8849, 1996, 3494, 6699, 5681, 1012, 3251, 2019, 2895, 2143, 2003, 1996, 2157, 6907, 2000, 2079, 2008, 2007, 1010, 1045, 1005, 1049, 2145, 6151, 8586, 14097, 1012, 17515, 2001, 2028, 1997, 1996, 25551, 3152, 2000, 3422, 2302, 3228, 2440, 3086, 2000, 1010, 2004, 2009, 2018, 2210, 21438, 1998, 1037, 19647, 5436, 1012, 1045, 2001, 2763, 5905, 1997, 2008, 1010, 2061, 2007, 1037, 2117, 3422, 2030, 2007, 6151, 12848, 14097, 3086, 2009, 2089, 2022, 2488, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1018, 1013, 2184, 1006, 2021, 1996, 2190, 1997, 2026, 1018, 2041, 1997, 2184, 1005, 1055, 1007], 'label': 'neg'}


 transform these indexes back into readable tokens

In [14]:
print(tokenizer.convert_ids_to_tokens(vars(train_data.examples[6])['text']))

['eddie', 'murphy', 'really', 'made', 'me', 'laugh', 'my', 'ass', 'off', 'on', 'this', 'hbo', 'stand', 'up', 'comedy', 'show', '.', 'i', 'love', 'his', 'impressions', 'of', 'mr', '.', 't', ',', 'ed', 'norton', 'and', 'ralph', 'cr', '##am', '##den', 'of', '"', 'the', 'honeymoon', '##ers', '"', ',', 'elvis', 'presley', ',', 'and', 'michael', 'jackson', 'too', '.', 'the', 'ice', 'cream', 'man', ',', 'goo', '##ny', 'goo', 'goo', ',', 'is', 'also', 'funny', '.', 'i', 'saw', 'this', 'for', 'the', 'first', 'time', 'when', 'it', 'came', 'out', 'in', '1984', '.', 'i', 'laughed', 'so', 'hard', ',', 'i', 'almost', 'fell', 'off', 'my', 'chair', '.', 'i', 'still', 'think', 'this', 'is', 'very', 'funny', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'eddie', 'murphy', ',', 'when', 'he', 'was', 'on', '"', 'saturday', 'night', 'live', '"', ',', 'made', 'me', 'laugh', 'so', 'hard', ',', 'he', 'is', 'one', 'of', 'the', 'best', 'people', 'to', 'come', 'out', 'of', '"', 'saturday', 'night', 'live', '"', 

In [15]:
### build the vocabulary for the labels

In [16]:
LABEL.build_vocab(train_data)

print(LABEL.vocab.stoi)

defaultdict(None, {'pos': 0, 'neg': 1})
