## Tokenize Examples

In [1]:
import nlp
import numpy as np
from transformers import BertTokenizer

def extract_sentences():
    dataset = nlp.load_dataset('glue', "sst2")
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    results = []
    for sent in dataset['train']['sentence']:
        results.append(tokenizer.tokenize(sent))
    return results

In [2]:
sentences = extract_sentences()

In [3]:
sentences = sentences * 4
len(sentences)

269396

## Adding Special Tokens

In [4]:
from copy import deepcopy

In [5]:
def add_special_tokens(sentence):
    sentence.insert(0, "[CLS]")
    sentence.append("[SEP]")

In [6]:
%%timeit
tmp = deepcopy(sentences)

325 ms ± 5.75 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
%%timeit
tmp = deepcopy(sentences)
for sentence in tmp:
    add_special_tokens(sentence)

372 ms ± 9.85 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
tmp = deepcopy(sentences)

In [9]:
%%time
for sentence in tmp:
    add_special_tokens(sentence)

CPU times: user 38.9 ms, sys: 3.96 ms, total: 42.8 ms
Wall time: 42.3 ms


## Mark First Pieces

In [10]:
def is_first_piece(tokens):
    return [not token.startswith("##") for token in tokens]

In [11]:
is_first_piece(["1", "w", "##e"])

[True, True, False]

In [12]:
%%time
tmp = deepcopy(sentences)

CPU times: user 475 ms, sys: 3.95 ms, total: 479 ms
Wall time: 477 ms


In [13]:
%%timeit
results = [is_first_piece(sent) for sent in tmp]

326 ms ± 10.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
first_piece_masks = [is_first_piece(sent) for sent in tmp]
first_piece_masks[0]

[True, True, True, False, True, True, True, True]

## Sampling

In [15]:
def sample(first_piece_masks, n=1):
    results = []
    for mask in first_piece_masks:
        if sum(mask) <= n:
            results.append([])
            continue
        probabilities = np.asarray(mask) / float(sum(mask))
        results.append(np.random.choice(np.arange(len(mask)), size=n, p=probabilities))
    return results

In [16]:
%%timeit
sample(first_piece_masks)

8.2 s ± 30.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
%%time
masking_points =  sample(first_piece_masks)
masking_points[:10]

CPU times: user 8.24 s, sys: 36 ms, total: 8.28 s
Wall time: 8.27 s


[array([7]),
 array([3]),
 array([2]),
 array([4]),
 array([6]),
 array([7]),
 array([0]),
 array([1]),
 array([1]),
 array([13])]

## Masking

In [18]:
def masking(rows, first_piece_masks, masking_points):
    augmented_rows = deepcopy(rows)
    for idx in range(len(masking_points)):
        assert len(first_piece_masks[idx]) == len(rows[idx])
        for pos in masking_points[idx]:
            assert pos <= len(first_piece_masks[idx]), f"{pos}, {len(first_piece_masks[idx])}"
            augmented_rows[idx][pos] = "[MASK]"
            while pos +1 < len(first_piece_masks[idx]) and first_piece_masks[idx][pos + 1] == 0:
                pos += 1
                augmented_rows[idx][pos] = "[MASK]"
    return augmented_rows        

In [19]:
%%timeit
masking(sentences, first_piece_masks, masking_points)

725 ms ± 4.04 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
%%timeit
augmented_rows = deepcopy(sentences)

328 ms ± 2.89 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
%%time 
augmented_sentences = masking(sentences, first_piece_masks, masking_points)
augmented_sentences[:10]

CPU times: user 724 ms, sys: 4 ms, total: 728 ms
Wall time: 727 ms


[['hide', 'new', 'secret', '##ions', '[MASK]', 'the', '[MASK]', '[MASK]'],
 ['contains',
  '[MASK]',
  'wit',
  '[MASK]',
  '[MASK]',
  '[MASK]',
  '[MASK]',
  'gag',
  '##s'],
 ['that',
  'loves',
  '[MASK]',
  'characters',
  'and',
  'communicate',
  '##s',
  'something',
  'rather',
  'beautiful',
  '[MASK]',
  'human',
  '[MASK]'],
 ['remains',
  'utterly',
  'satisfied',
  'to',
  '[MASK]',
  '[MASK]',
  '[MASK]',
  'throughout'],
 ['on',
  '[MASK]',
  'worst',
  'revenge',
  '-',
  'of',
  '[MASK]',
  'the',
  '-',
  'ne',
  '##rds',
  '[MASK]',
  '[MASK]',
  '[MASK]',
  'the',
  'filmmakers',
  'could',
  '[MASK]',
  '[MASK]',
  'up'],
 ['that',
  "'",
  '[MASK]',
  'far',
  '[MASK]',
  'tragic',
  'to',
  '[MASK]',
  'such',
  '[MASK]',
  'treatment'],
 ['[MASK]',
  'that',
  'the',
  'director',
  'of',
  'such',
  'hollywood',
  '[MASK]',
  '[MASK]',
  'as',
  'patriot',
  'games',
  'can',
  '[MASK]',
  'turn',
  'out',
  'a',
  'small',
  ',',
  'personal',
  'film',
  'wi