In [1]:
using Pkg
pkg"activate ."

[32m[1m Activating[22m[39m environment at `/mnt/SSD_Data/active_projects/transformer_to_lstm/notebooks/Project.toml`


In [2]:
pkg"add PyCall" # to run Python code from Julia
pkg"add BenchmarkTools" # to run benchmarks

[32m[1m   Updating[22m[39m registry at `~/.julia/registries/General`


[?25l[2K

[32m[1m   Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`


[?25h

[32m[1m  Resolving[22m[39m package versions...
[32m[1m   Updating[22m[39m `/mnt/SSD_Data/active_projects/transformer_to_lstm/notebooks/Project.toml`
[90m [no changes][39m
[32m[1m   Updating[22m[39m `/mnt/SSD_Data/active_projects/transformer_to_lstm/notebooks/Manifest.toml`
[90m [no changes][39m
[32m[1m  Resolving[22m[39m package versions...
[32m[1m   Updating[22m[39m `/mnt/SSD_Data/active_projects/transformer_to_lstm/notebooks/Project.toml`
[90m [no changes][39m
[32m[1m   Updating[22m[39m `/mnt/SSD_Data/active_projects/transformer_to_lstm/notebooks/Manifest.toml`
[90m [no changes][39m


## Getting Toeknized Examples from Python

In [3]:
using PyCall
py"""
import nlp
import numpy as np
from transformers import BertTokenizer

def extract_sentences():
    dataset = nlp.load_dataset('glue', "sst2")
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    results = []
    for sent in dataset['train']['sentence']:
        results.append(tokenizer.tokenize(sent))
    return results
"""

In [4]:
@time sentences = py"extract_sentences"();

 14.568810 seconds (3.77 M allocations: 140.210 MiB, 0.79% gc time)


In [5]:
sentences = vcat(sentences, deepcopy(sentences), deepcopy(sentences), deepcopy(sentences)) # x4 sizes

269396-element Array{Array{String,1},1}:
 ["hide", "new", "secret", "##ions", "from", "the", "parental", "units"]
 ["contains", "no", "wit", ",", "only", "labor", "##ed", "gag", "##s"]
 ["that", "loves", "its", "characters", "and", "communicate", "##s", "something", "rather", "beautiful", "about", "human", "nature"]
 ["remains", "utterly", "satisfied", "to", "remain", "the", "same", "throughout"]
 ["on", "the", "worst", "revenge", "-", "of", "-", "the", "-", "ne", "##rds", "cl", "##iche", "##s", "the", "filmmakers", "could", "dr", "##edge", "up"]
 ["that", "'", "s", "far", "too", "tragic", "to", "merit", "such", "superficial", "treatment"]
 ["demonstrates", "that", "the", "director", "of", "such", "hollywood", "blockbuster", "##s", "as"  …  "small", ",", "personal", "film", "with", "an", "emotional", "wall", "##op", "."]
 ["of", "sa", "##uc", "##y"]
 ["a", "depressed", "fifteen", "-", "year", "-", "old", "'", "s", "suicidal", "poetry"]
 ["are", "more", "deeply", "thought", "through", "

## Add Special Tokens

In [6]:
function add_special_tokens!(sentence)
    pushfirst!(sentence, "[CLS]")
    push!(sentence, "[SEP]")
end

add_special_tokens! (generic function with 1 method)

In [7]:
using BenchmarkTools
@benchmark results = add_special_tokens!.(tmp) setup=(tmp=deepcopy(sentences))

BenchmarkTools.Trial: 
  memory estimate:  53.50 MiB
  allocs estimate:  269398
  --------------
  minimum time:     17.617 ms (0.00% GC)
  median time:      24.211 ms (0.00% GC)
  mean time:        43.134 ms (48.77% GC)
  maximum time:     142.761 ms (84.46% GC)
  --------------
  samples:          11
  evals/sample:     1

## Mark First Pieces

In [8]:
function is_first_piece(arr::Array{String,1})
    return .!startswith.(arr, "##")
end

is_first_piece (generic function with 1 method)

In [9]:
@benchmark results = is_first_piece.(tmp) setup=(tmp=deepcopy(sentences)) # doesn't really need to deepcopy, but just to be safe

BenchmarkTools.Trial: 
  memory estimate:  34.94 MiB
  allocs estimate:  538796
  --------------
  minimum time:     31.381 ms (0.00% GC)
  median time:      31.952 ms (0.00% GC)
  mean time:        39.708 ms (11.95% GC)
  maximum time:     67.098 ms (40.11% GC)
  --------------
  samples:          11
  evals/sample:     1

In [10]:
results = is_first_piece.(sentences)

269396-element Array{BitArray{1},1}:
 [1, 1, 1, 0, 1, 1, 1, 1]
 [1, 1, 1, 1, 1, 1, 0, 1, 0]
 [1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]
 [1, 1, 1, 1, 1, 1, 1, 1]
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1]
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
 [1, 1, 1, 1, 1, 1, 1, 1, 0, 1  …  1, 1, 1, 1, 1, 1, 1, 1, 0, 1]
 [1, 1, 0, 0]
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
 [1, 1, 1, 1]
 [1, 1, 1, 0, 0, 1, 1, 1, 1, 1  …  1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
 [1, 1, 1, 1, 1, 1, 1, 1]
 ⋮
 [1, 1, 1, 0, 1, 1, 0, 0]
 [1, 1]
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
 [1, 1, 1, 0]
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
 [1, 1, 1, 1]
 [1, 1, 1]
 [1, 1, 1, 1, 1]
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
 [1, 1, 1]
 [1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

### Multi-thread

In [11]:
Threads.nthreads()

1

In [12]:
@benchmark begin
    results = [Bool[] for _ in 1:length(sentences)]
    Threads.@threads for i in 1:length(sentences)
       results[i] = is_first_piece(sentences[i])
    end
end

BenchmarkTools.Trial: 
  memory estimate:  89.64 MiB
  allocs estimate:  1346484
  --------------
  minimum time:     63.998 ms (11.63% GC)
  median time:      68.953 ms (15.24% GC)
  mean time:        70.306 ms (16.99% GC)
  maximum time:     79.396 ms (24.27% GC)
  --------------
  samples:          72
  evals/sample:     1

In [13]:
# results = [BitArray(undef, length(sentences[i])) for i in 1:length(sentences)]
first_piece_masks = [Bool[] for _ in 1:length(sentences)]
Threads.@threads for i in 1:length(sentences)
   first_piece_masks[i] = is_first_piece(sentences[i])
end
first_piece_masks[1]

8-element Array{Bool,1}:
 1
 1
 1
 0
 1
 1
 1
 1

## Sampling

In [14]:
using StatsBase
function sample_mask_position(first_piece_mask, n=1)
    if sum(first_piece_mask) <= n
        return Int64[]
    end
    return sample(1:length(first_piece_mask), Weights(first_piece_mask), n, replace=false)
end

sample_mask_position (generic function with 2 methods)

In [15]:
@benchmark sample_mask_position.(first_piece_masks)

BenchmarkTools.Trial: 
  memory estimate:  57.52 MiB
  allocs estimate:  777905
  --------------
  minimum time:     61.041 ms (0.00% GC)
  median time:      75.644 ms (14.70% GC)
  mean time:        72.452 ms (11.12% GC)
  maximum time:     93.966 ms (24.14% GC)
  --------------
  samples:          69
  evals/sample:     1

In [16]:
@time masking_points = sample_mask_position.(first_piece_masks)

  0.072058 seconds (788.24 k allocations: 57.962 MiB)


269396-element Array{Array{Int64,1},1}:
 [5]
 [1]
 [13]
 [8]
 [8]
 [6]
 [11]
 [1]
 [1]
 [8]
 [4]
 [20]
 [2]
 ⋮
 [3]
 [1]
 [2]
 [3]
 [8]
 [6]
 [4]
 [1]
 [4]
 [8]
 [2]
 [19]

## Masking

In [17]:
sentences

269396-element Array{Array{String,1},1}:
 ["hide", "new", "secret", "##ions", "from", "the", "parental", "units"]
 ["contains", "no", "wit", ",", "only", "labor", "##ed", "gag", "##s"]
 ["that", "loves", "its", "characters", "and", "communicate", "##s", "something", "rather", "beautiful", "about", "human", "nature"]
 ["remains", "utterly", "satisfied", "to", "remain", "the", "same", "throughout"]
 ["on", "the", "worst", "revenge", "-", "of", "-", "the", "-", "ne", "##rds", "cl", "##iche", "##s", "the", "filmmakers", "could", "dr", "##edge", "up"]
 ["that", "'", "s", "far", "too", "tragic", "to", "merit", "such", "superficial", "treatment"]
 ["demonstrates", "that", "the", "director", "of", "such", "hollywood", "blockbuster", "##s", "as"  …  "small", ",", "personal", "film", "with", "an", "emotional", "wall", "##op", "."]
 ["of", "sa", "##uc", "##y"]
 ["a", "depressed", "fifteen", "-", "year", "-", "old", "'", "s", "suicidal", "poetry"]
 ["are", "more", "deeply", "thought", "through", "

In [18]:
masking_points[1]

1-element Array{Int64,1}:
 5

In [19]:
sentences[1][6]

"the"

In [20]:
function masking(rows::Vector{Vector{String}}, first_piece_masks::Vector{Vector{Bool}}, masking_points::Vector{Vector{Int64}})
    augmented_rows = deepcopy(rows)
    for idx in 1:length(masking_points)
        for pos in masking_points[idx]
            # @assert first_piece_masks[idx][pos] == 1
            augmented_rows[idx][pos] = "[MASK]"
            while pos + 1 <= length(first_piece_masks[idx]) && first_piece_masks[idx][pos + 1] == 0 
                pos += 1
                augmented_rows[idx][pos] = "[MASK]"
            end
        end
    end    
    return augmented_rows
end

masking (generic function with 1 method)

In [21]:
@benchmark deepcopy(sentences)

BenchmarkTools.Trial: 
  memory estimate:  60.78 MiB
  allocs estimate:  280986
  --------------
  minimum time:     404.801 ms (0.00% GC)
  median time:      417.494 ms (1.17% GC)
  mean time:        419.381 ms (2.12% GC)
  maximum time:     436.365 ms (6.30% GC)
  --------------
  samples:          12
  evals/sample:     1

In [22]:
@benchmark masking(sentences, first_piece_masks, masking_points)

BenchmarkTools.Trial: 
  memory estimate:  60.78 MiB
  allocs estimate:  280986
  --------------
  minimum time:     411.296 ms (0.00% GC)
  median time:      427.221 ms (1.14% GC)
  mean time:        426.564 ms (2.17% GC)
  maximum time:     451.092 ms (6.58% GC)
  --------------
  samples:          12
  evals/sample:     1

In [23]:
function masking_wo_copy(rows::Vector{Vector{String}}, first_piece_masks::Vector{Vector{Bool}}, masking_points::Vector{Vector{Int64}})
    for idx in 1:length(masking_points)
        for pos in masking_points[idx]
            # @assert first_piece_masks[idx][pos] == 1
            rows[idx][pos] = "[MASK]"
            while pos + 1 <= length(first_piece_masks[idx]) && first_piece_masks[idx][pos + 1] == 0 
                pos += 1
                rows[idx][pos] = "[MASK]"
            end
        end
    end    
    return rows
end

masking_wo_copy (generic function with 1 method)

In [24]:
@benchmark masking_wo_copy(rows, first_piece_masks, masking_points) setup=(rows=deepcopy(sentences))

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     9.040 ms (0.00% GC)
  median time:      9.216 ms (0.00% GC)
  mean time:        9.230 ms (0.00% GC)
  maximum time:     9.436 ms (0.00% GC)
  --------------
  samples:          12
  evals/sample:     1

In [25]:
masking(sentences, first_piece_masks, masking_points)

269396-element Array{Array{String,1},1}:
 ["hide", "new", "secret", "##ions", "[MASK]", "the", "parental", "units"]
 ["[MASK]", "no", "wit", ",", "only", "labor", "##ed", "gag", "##s"]
 ["that", "loves", "its", "characters", "and", "communicate", "##s", "something", "rather", "beautiful", "about", "human", "[MASK]"]
 ["remains", "utterly", "satisfied", "to", "remain", "the", "same", "[MASK]"]
 ["on", "the", "worst", "revenge", "-", "of", "-", "[MASK]", "-", "ne", "##rds", "cl", "##iche", "##s", "the", "filmmakers", "could", "dr", "##edge", "up"]
 ["that", "'", "s", "far", "too", "[MASK]", "to", "merit", "such", "superficial", "treatment"]
 ["demonstrates", "that", "the", "director", "of", "such", "hollywood", "blockbuster", "##s", "as"  …  "small", ",", "personal", "film", "with", "an", "emotional", "wall", "##op", "."]
 ["[MASK]", "sa", "##uc", "##y"]
 ["[MASK]", "depressed", "fifteen", "-", "year", "-", "old", "'", "s", "suicidal", "poetry"]
 ["are", "more", "deeply", "thought", "thr