In [1]:
using Pkg
pkg"activate ."

[32m[1m Activating[22m[39m environment at `/mnt/SSD_Data/active_projects/transformer_to_lstm/notebooks/Project.toml`


In [2]:
pkg"add PyCall" # to run Python code from Julia
pkg"add BenchmarkTools" # to run benchmarks

[32m[1m   Updating[22m[39m registry at `~/.julia/registries/General`


[?25l[2K

[32m[1m   Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`


[?25h

[32m[1m  Resolving[22m[39m package versions...
[32m[1m   Updating[22m[39m `/mnt/SSD_Data/active_projects/transformer_to_lstm/notebooks/Project.toml`
[90m [no changes][39m
[32m[1m   Updating[22m[39m `/mnt/SSD_Data/active_projects/transformer_to_lstm/notebooks/Manifest.toml`
[90m [no changes][39m
[32m[1m  Resolving[22m[39m package versions...
[32m[1m   Updating[22m[39m `/mnt/SSD_Data/active_projects/transformer_to_lstm/notebooks/Project.toml`
[90m [no changes][39m
[32m[1m   Updating[22m[39m `/mnt/SSD_Data/active_projects/transformer_to_lstm/notebooks/Manifest.toml`
[90m [no changes][39m


## Getting Toeknized Examples from Python

In [3]:
using PyCall
py"""
import nlp
import numpy as np
from transformers import BertTokenizer

def extract_sentences():
    dataset = nlp.load_dataset('glue', "sst2")
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    results = []
    for sent in dataset['train']['sentence']:
        results.append(tokenizer.tokenize(sent))
    return results
"""

In [4]:
@time sentences = py"extract_sentences"();

 13.991118 seconds (3.77 M allocations: 140.210 MiB, 0.72% gc time)


In [5]:
sentences = vcat(sentences, deepcopy(sentences), deepcopy(sentences), deepcopy(sentences)) # x4 sizes

269396-element Array{Array{String,1},1}:
 ["hide", "new", "secret", "##ions", "from", "the", "parental", "units"]
 ["contains", "no", "wit", ",", "only", "labor", "##ed", "gag", "##s"]
 ["that", "loves", "its", "characters", "and", "communicate", "##s", "something", "rather", "beautiful", "about", "human", "nature"]
 ["remains", "utterly", "satisfied", "to", "remain", "the", "same", "throughout"]
 ["on", "the", "worst", "revenge", "-", "of", "-", "the", "-", "ne", "##rds", "cl", "##iche", "##s", "the", "filmmakers", "could", "dr", "##edge", "up"]
 ["that", "'", "s", "far", "too", "tragic", "to", "merit", "such", "superficial", "treatment"]
 ["demonstrates", "that", "the", "director", "of", "such", "hollywood", "blockbuster", "##s", "as"  …  "small", ",", "personal", "film", "with", "an", "emotional", "wall", "##op", "."]
 ["of", "sa", "##uc", "##y"]
 ["a", "depressed", "fifteen", "-", "year", "-", "old", "'", "s", "suicidal", "poetry"]
 ["are", "more", "deeply", "thought", "through", "

## Add Special Tokens

In [6]:
function add_special_tokens!(sentence)
    pushfirst!(sentence, "[CLS]")
    push!(sentence, "[SEP]")
end

add_special_tokens! (generic function with 1 method)

In [7]:
using BenchmarkTools
@benchmark results = add_special_tokens!.(tmp) setup=(tmp=deepcopy(sentences))

BenchmarkTools.Trial: 
  memory estimate:  53.50 MiB
  allocs estimate:  269398
  --------------
  minimum time:     17.453 ms (0.00% GC)
  median time:      24.665 ms (0.00% GC)
  mean time:        40.665 ms (46.40% GC)
  maximum time:     140.432 ms (85.04% GC)
  --------------
  samples:          11
  evals/sample:     1

## Mark First Pieces

In [8]:
function is_first_piece(arr::Array{String,1})
    return .!startswith.(arr, "##")
end

is_first_piece (generic function with 1 method)

In [9]:
@benchmark results = is_first_piece.(tmp) setup=(tmp=deepcopy(sentences)) # doesn't really need to deepcopy, but just to be safe

BenchmarkTools.Trial: 
  memory estimate:  34.94 MiB
  allocs estimate:  538796
  --------------
  minimum time:     38.018 ms (0.00% GC)
  median time:      42.560 ms (0.00% GC)
  mean time:        47.731 ms (11.38% GC)
  maximum time:     75.288 ms (38.02% GC)
  --------------
  samples:          10
  evals/sample:     1

In [10]:
results = is_first_piece.(sentences)

269396-element Array{BitArray{1},1}:
 [1, 1, 1, 0, 1, 1, 1, 1]
 [1, 1, 1, 1, 1, 1, 0, 1, 0]
 [1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]
 [1, 1, 1, 1, 1, 1, 1, 1]
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1]
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
 [1, 1, 1, 1, 1, 1, 1, 1, 0, 1  …  1, 1, 1, 1, 1, 1, 1, 1, 0, 1]
 [1, 1, 0, 0]
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
 [1, 1, 1, 1]
 [1, 1, 1, 0, 0, 1, 1, 1, 1, 1  …  1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
 [1, 1, 1, 1, 1, 1, 1, 1]
 ⋮
 [1, 1, 1, 0, 1, 1, 0, 0]
 [1, 1]
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
 [1, 1, 1, 0]
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
 [1, 1, 1, 1]
 [1, 1, 1]
 [1, 1, 1, 1, 1]
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
 [1, 1, 1]
 [1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

### Multi-thread

Remember to set the environment variable `JULIA_NUM_THREADS` to the number of physical CPU cores you have, e.g., `JULIA_NUM_THREADS=4`.

In [11]:
Threads.nthreads()

4

In [12]:
@benchmark begin
    results = [Bool[] for _ in 1:length(sentences)]
    Threads.@threads for i in 1:length(sentences)
       results[i] = is_first_piece(sentences[i])
    end
end

BenchmarkTools.Trial: 
  memory estimate:  89.64 MiB
  allocs estimate:  1346499
  --------------
  minimum time:     23.574 ms (0.00% GC)
  median time:      31.006 ms (0.00% GC)
  mean time:        39.748 ms (31.85% GC)
  maximum time:     67.788 ms (53.85% GC)
  --------------
  samples:          126
  evals/sample:     1

In [13]:
# results = [BitArray(undef, length(sentences[i])) for i in 1:length(sentences)]
first_piece_masks = [Bool[] for _ in 1:length(sentences)]
Threads.@threads for i in 1:length(sentences)
   first_piece_masks[i] = is_first_piece(sentences[i])
end
first_piece_masks[1]

8-element Array{Bool,1}:
 1
 1
 1
 0
 1
 1
 1
 1

## Sampling

In [14]:
using StatsBase
function sample_mask_position(first_piece_mask, n=1)
    if sum(first_piece_mask) <= n
        return Int64[]
    end
    return sample(1:length(first_piece_mask), Weights(first_piece_mask), n, replace=false)
end

sample_mask_position (generic function with 2 methods)

In [15]:
@benchmark sample_mask_position.(first_piece_masks)

BenchmarkTools.Trial: 
  memory estimate:  57.52 MiB
  allocs estimate:  777905
  --------------
  minimum time:     60.029 ms (0.00% GC)
  median time:      71.588 ms (12.66% GC)
  mean time:        69.559 ms (9.71% GC)
  maximum time:     88.289 ms (21.28% GC)
  --------------
  samples:          72
  evals/sample:     1

In [16]:
@time masking_points = sample_mask_position.(first_piece_masks)

  0.087545 seconds (788.24 k allocations: 57.962 MiB, 14.62% gc time)


269396-element Array{Array{Int64,1},1}:
 [6]
 [6]
 [4]
 [2]
 [3]
 [7]
 [24]
 [2]
 [7]
 [4]
 [4]
 [15]
 [7]
 ⋮
 [6]
 [1]
 [2]
 [2]
 [4]
 [9]
 [2]
 [2]
 [2]
 [11]
 [3]
 [13]

## Masking

In [17]:
sentences

269396-element Array{Array{String,1},1}:
 ["hide", "new", "secret", "##ions", "from", "the", "parental", "units"]
 ["contains", "no", "wit", ",", "only", "labor", "##ed", "gag", "##s"]
 ["that", "loves", "its", "characters", "and", "communicate", "##s", "something", "rather", "beautiful", "about", "human", "nature"]
 ["remains", "utterly", "satisfied", "to", "remain", "the", "same", "throughout"]
 ["on", "the", "worst", "revenge", "-", "of", "-", "the", "-", "ne", "##rds", "cl", "##iche", "##s", "the", "filmmakers", "could", "dr", "##edge", "up"]
 ["that", "'", "s", "far", "too", "tragic", "to", "merit", "such", "superficial", "treatment"]
 ["demonstrates", "that", "the", "director", "of", "such", "hollywood", "blockbuster", "##s", "as"  …  "small", ",", "personal", "film", "with", "an", "emotional", "wall", "##op", "."]
 ["of", "sa", "##uc", "##y"]
 ["a", "depressed", "fifteen", "-", "year", "-", "old", "'", "s", "suicidal", "poetry"]
 ["are", "more", "deeply", "thought", "through", "

In [18]:
masking_points[1]

1-element Array{Int64,1}:
 6

In [19]:
sentences[1][6]

"the"

In [20]:
function masking(rows::Vector{Vector{String}}, first_piece_masks::Vector{Vector{Bool}}, masking_points::Vector{Vector{Int64}})
    augmented_rows = deepcopy(rows)
    for idx in 1:length(masking_points)
        for pos in masking_points[idx]
            # @assert first_piece_masks[idx][pos] == 1
            augmented_rows[idx][pos] = "[MASK]"
            while pos + 1 <= length(first_piece_masks[idx]) && first_piece_masks[idx][pos + 1] == 0 
                pos += 1
                augmented_rows[idx][pos] = "[MASK]"
            end
        end
    end    
    return augmented_rows
end

masking (generic function with 1 method)

In [21]:
@benchmark deepcopy(sentences)

BenchmarkTools.Trial: 
  memory estimate:  60.78 MiB
  allocs estimate:  280986
  --------------
  minimum time:     407.024 ms (0.00% GC)
  median time:      418.605 ms (1.43% GC)
  mean time:        421.605 ms (2.63% GC)
  maximum time:     445.708 ms (7.95% GC)
  --------------
  samples:          12
  evals/sample:     1

In [22]:
@benchmark masking(sentences, first_piece_masks, masking_points)

BenchmarkTools.Trial: 
  memory estimate:  60.78 MiB
  allocs estimate:  280986
  --------------
  minimum time:     411.065 ms (0.00% GC)
  median time:      423.288 ms (1.39% GC)
  mean time:        426.405 ms (2.58% GC)
  maximum time:     465.249 ms (8.15% GC)
  --------------
  samples:          12
  evals/sample:     1

In [23]:
function masking_wo_copy(rows::Vector{Vector{String}}, first_piece_masks::Vector{Vector{Bool}}, masking_points::Vector{Vector{Int64}})
    for idx in 1:length(masking_points)
        for pos in masking_points[idx]
            # @assert first_piece_masks[idx][pos] == 1
            rows[idx][pos] = "[MASK]"
            while pos + 1 <= length(first_piece_masks[idx]) && first_piece_masks[idx][pos + 1] == 0 
                pos += 1
                rows[idx][pos] = "[MASK]"
            end
        end
    end    
    return rows
end

masking_wo_copy (generic function with 1 method)

In [24]:
@benchmark masking_wo_copy(rows, first_piece_masks, masking_points) setup=(rows=deepcopy(sentences))

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     9.629 ms (0.00% GC)
  median time:      9.746 ms (0.00% GC)
  mean time:        9.754 ms (0.00% GC)
  maximum time:     9.912 ms (0.00% GC)
  --------------
  samples:          12
  evals/sample:     1

In [25]:
masking(sentences, first_piece_masks, masking_points)

269396-element Array{Array{String,1},1}:
 ["hide", "new", "secret", "##ions", "from", "[MASK]", "parental", "units"]
 ["contains", "no", "wit", ",", "only", "[MASK]", "[MASK]", "gag", "##s"]
 ["that", "loves", "its", "[MASK]", "and", "communicate", "##s", "something", "rather", "beautiful", "about", "human", "nature"]
 ["remains", "[MASK]", "satisfied", "to", "remain", "the", "same", "throughout"]
 ["on", "the", "[MASK]", "revenge", "-", "of", "-", "the", "-", "ne", "##rds", "cl", "##iche", "##s", "the", "filmmakers", "could", "dr", "##edge", "up"]
 ["that", "'", "s", "far", "too", "tragic", "[MASK]", "merit", "such", "superficial", "treatment"]
 ["demonstrates", "that", "the", "director", "of", "such", "hollywood", "blockbuster", "##s", "as"  …  "small", ",", "personal", "film", "with", "an", "[MASK]", "wall", "##op", "."]
 ["of", "[MASK]", "[MASK]", "[MASK]"]
 ["a", "depressed", "fifteen", "-", "year", "-", "[MASK]", "'", "s", "suicidal", "poetry"]
 ["are", "more", "deeply", "[MASK]"