## Tokenizer with BERT

In [1]:
import numpy as np
import random
import torch
import torch.nn  as nn
from torchtext import data

from transformers import BertTokenizer

### set random seed

In [2]:
# Reproducibility: a single fixed seed shared by every random number generator in use.
SEED = 2021

# Seed Python's `random`, NumPy and PyTorch in turn with the same value.
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(SEED)

# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

### load transformer pretrained model

In [3]:
# Load the pretrained tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
# Number of entries in the tokenizer's vocabulary.
len(tokenizer.vocab)

30522

### tokenize test

Tokenize and lower-case the data in a way that is consistent with the pre-trained transformer model.

In [5]:
# Mixed-case sample sentence to show how the tokenizer normalises and splits text.
demo_text = 'Hello WORLD ARE yoU OK?'
tokens = tokenizer.tokenize(demo_text)
print(tokens)

['hello', 'world', 'are', 'you', 'ok', '?']


###  numericalize tokens

In [6]:
# Map each token string to its integer id in the tokenizer's vocabulary.
token_indices = tokenizer.convert_tokens_to_ids(tokens)
print(token_indices)

[7592, 2088, 2024, 2017, 7929, 1029]


**Note:** the tokenizer does have beginning-of-sequence and end-of-sequence attributes (`bos_token` and `eos_token`), but these are not set and should not be used for this transformer. The `[CLS]` and `[SEP]` tokens are used in their place below.

In [7]:
# The tokenizer's own bos/eos tokens are not set, so [CLS] and [SEP] stand in for them here.
bos_token = tokenizer.cls_token # classifier token, used when doing sequence classification
eos_token = tokenizer.sep_token # separator token, used when building a sequence from multiple sequences
pad_token = tokenizer.pad_token # token used for padding
unk_token = tokenizer.unk_token # token used for unknown (out-of-vocabulary) input

print(bos_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


### get the indices of the special tokens 

In [8]:
# Option 1: look the special tokens up in the vocabulary.
bos_token_idx = tokenizer.convert_tokens_to_ids(bos_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)
print(bos_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

# Option 2: read the ids directly from the tokenizer's attributes (yields the same values).
bos_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id
print(bos_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100
101 102 0 100


### define maximum length

In [9]:
# Maximum number of tokens the model accepts in one sequence.
# Read it from the loaded tokenizer instance rather than from the class-level
# `max_model_input_sizes` table: that table needs the checkpoint name repeated
# by hand and is deprecated/removed in newer `transformers` releases, whereas
# `model_max_length` is the documented per-instance attribute.
# NOTE(review): for 'bert-base-uncased' this is expected to be 512 - confirm
# against the printed value.
max_input_length = tokenizer.model_max_length
print(max_input_length)

512


### tokenize and cut

*Note* that our maximum length is 2 less than the actual maximum length. This is because we need to append two tokens to each sequence, one to the start and one to the end.

In [10]:
def tokenize_with_cut(sentence):
    """Tokenize ``sentence`` and truncate the result to fit the model.

    Uses the module-level ``tokenizer`` to split the text and the
    module-level ``max_input_length`` as the upper bound.

    Args:
        sentence: Raw text to tokenize.

    Returns:
        The token list, cut to at most ``max_input_length - 2`` entries.
    """
    pieces = tokenizer.tokenize(sentence)
    # Keep two slots free: one token is added at the start of each sequence
    # and one at the end.
    keep = max_input_length - 2
    return pieces[:keep]

### Define fields

In [11]:
# Text field. No torchtext vocabulary is built (use_vocab=False): the text is
# split by `tokenize_with_cut` and the tokens are turned into ids by the BERT
# tokenizer in the `preprocessing` step. The special tokens are therefore
# passed as integer ids rather than as strings.
TEXT = data.Field(
    batch_first=True,
    use_vocab=False,
    tokenize=tokenize_with_cut,
    preprocessing=tokenizer.convert_tokens_to_ids,
    init_token=bos_token_idx,
    eos_token=eos_token_idx,
    pad_token=pad_token_idx,
    unk_token=unk_token_idx,
)

# Label field stored as float values.
LABEL = data.LabelField(dtype=torch.float)

