# Attention-based Neural Machine Translation

**Reference:** Luong, Thang, Hieu Pham and Christopher D. Manning. "Effective Approaches to Attention-based Neural Machine Translation." In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing, pp. 1412-1421. 2015.

* https://www.aclweb.org/anthology/D15-1166/ (main paper reference)
* https://arxiv.org/abs/1508.04025 (alternative paper url)
* https://github.com/tensorflow/nmt (main code reference)
* https://www.tensorflow.org/beta/tutorials/text/nmt_with_attention (alternative code reference)
* https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py:2449,2103 (attention implementation)

# Neural Machine Translation

**Reference:** Sutskever, Ilya, Oriol Vinyals, and Quoc V. Le. "Sequence to sequence learning with neural networks." In Advances in neural information processing systems, pp. 3104-3112. 2014. ([Paper](https://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks), [Sample code](https://github.com/tensorflow/nmt))

This Assignment has been done by

<h2> Abdurrahman Beyaz</h2>
00684383 

[Email](mailto:aalabrash18@ku.edu.tr)

<h2> Ahmed Masry </h2>
0061868

[Email](mailto:amasry17@ku.edu.tr)


[To Download model attn-1515942959669160.jld2](https://drive.google.com/uc?id=1dW8LTvcoXc2vtLU6YogAe3goJYn0L_gk&export=download) with batch_size=16 and maxlength=50 Normal Distribution BLEU = 14.44 

[To Download model attn-1920283433023258.jld2](https://drive.google.com/uc?id=1Qz-6HDnxVIGpQEPJSfcS5C3DjdQU5b3U&export=download) with batch_size=64 and maxlength=50 Xavier Distribution BLEU = 16 

In [1]:
# using Pkg; for p in ("Knet", "Test", "Printf", "LinearAlgebra", "Random", "CuArrays", "IterTools"); haskey(Pkg.installed(),p) || Pkg.add(p); end

In [1]:
using Knet, Test, Base.Iterators, Printf, LinearAlgebra, Random, CuArrays, IterTools

## Code and data from previous projects

Please copy or include the following types and related functions from previous projects:
`Vocab`, `TextReader`, `MTData`, `Embed`, `Linear`, `mask!`, `loss`, `int2str`,
`bleu`.

In [2]:
# Vocabulary: a two-way mapping between words and integer ids.
struct Vocab
    w2i::Dict{String,Int}   # word -> id
    i2w::Vector{String}     # id -> word (i2w[w2i[word]] == word)
    unk::Int                # id of the unknown-word token
    eos::Int                # id of the end-of-sentence token
    tokenizer               # callable that splits a line into words
end

# Build a vocabulary from the text in `file`.
#
# Keyword arguments:
#   tokenizer - splits each line into words (default `split`)
#   vocabsize - upper bound on the number of entries, special tokens included
#   mincount  - words seen fewer than `mincount` times are left out
#   unk, eos  - spellings of the unknown-word and end-of-sentence tokens
#
# `unk` and `eos` receive the first ids; remaining words are numbered in order of
# decreasing frequency.
function Vocab(file::String; tokenizer=split, vocabsize=Inf, mincount=1, unk="<unk>", eos="<s>")
    counts = Dict{String,Int}()
    for line in eachline(file), word in tokenizer(line)
        counts[word] = get(counts, word, 0) + 1
    end

    w2i = Dict{String,Int}()
    i2w = String[]
    # Assign the next free id to `word` unless it already has one; returns its id.
    function register!(word)
        return get!(w2i, word) do
            push!(i2w, word)
            length(i2w)
        end
    end
    int_unk = register!(unk)
    int_eos = register!(eos)

    ranked = sort!(collect(counts), rev=true, by=last)
    for (word, n) in ranked
        # Sorted by descending count: nothing after this entry can reach mincount.
        n < mincount && break
        # Test the limit before inserting so that vocabsize is never exceeded.
        length(w2i) >= vocabsize && break
        register!(word)
    end
    return Vocab(w2i, i2w, int_unk, int_eos, tokenizer)
end

Vocab

In [3]:
# Lazily reads `file` line by line, yielding each line as a vector of word ids.
struct TextReader
    file::String    # path of the text file
    vocab::Vocab    # vocabulary used to tokenize and to map words to ids
end

# Iteration protocol: the state is the open file handle (`nothing` on the first call).
# Returns `(ids, handle)` for the next line, or closes the handle and returns
# `nothing` at end of file.
function Base.iterate(r::TextReader, s=nothing)
    io = s === nothing ? open(r.file) : s
    if eof(io)
        close(io)
        return nothing
    end
    words = r.vocab.tokenizer(readline(io))
    # Out-of-vocabulary words map to the vocabulary's own unk id, whatever its spelling.
    ids = Int[get(r.vocab.w2i, w, r.vocab.unk) for w in words]
    return (ids, io)
end

Base.IteratorSize(::Type{TextReader}) = Base.SizeUnknown()
Base.IteratorEltype(::Type{TextReader}) = Base.HasEltype()
Base.eltype(::Type{TextReader}) = Vector{Int}

# Embedding layer: column j of `w` is the embedding vector of word id j.
struct Embed
    w
end

# Create an embedding matrix of size (embedsize, vocabsize) stored as KnetArray{Float32}.
Embed(vocabsize::Int, embedsize::Int) =
    Embed(param(embedsize, vocabsize, atype = KnetArray{Float32}))

# Look up the embedding columns for the ids in `x`.
(l::Embed)(x) = l.w[:, x]

# Affine layer computing `w*x .+ b`.
struct Linear
    w
    b
end

# Create a layer with an (outputsize, inputsize) weight and an outputsize bias created
# with `param0`, both stored as KnetArray{Float32}.
Linear(inputsize::Int, outputsize::Int) =
    Linear(param(outputsize, inputsize, atype = KnetArray{Float32}),
           param0(outputsize, atype = KnetArray{Float32}))

# Apply the affine map to the columns of `x`.
(l::Linear)(x) = l.w * x .+ l.b

# Replace the trailing run of `pad` entries in every row of `a` with 0, keeping the
# first `pad` of that run. Rows that do not end in `pad` are left untouched.
# Works in place and returns `a`.
#
# Example with pad = 1:  [5 1 1 1]  ->  [5 1 0 0]
function mask!(a, pad)
    lastcol = size(a, 2)
    for row in 1:size(a, 1)
        col = lastcol
        # Walk backwards while both this entry and its left neighbour are pads; the
        # left neighbour test is what preserves the first pad of the run.
        while col > 1 && a[row, col] == pad && a[row, col-1] == pad
            a[row, col] = 0
            col -= 1
        end
    end
    return a
end

mask! (generic function with 1 method)

In [4]:
# Parallel-corpus iterator that groups sentence pairs of similar source length
# into minibatches.
struct MTData
    src::TextReader        # source-language reader
    tgt::TextReader        # target-language reader
    batchsize::Int         # number of sentence pairs per full batch
    maxlength::Int         # source sentences longer than this are skipped
    batchmajor::Bool       # requested batch layout flag (false by default)
    bucketwidth::Int       # range of source lengths that share one bucket
    buckets::Vector        # one growing list of (src, tgt) pairs per length range
    batchmaker::Function   # called as batchmaker(d, bucket) to build a batch
end

# Keyword constructor. By default there are `maxlength ÷ bucketwidth` buckets,
# capped at 128, and all of them start empty.
function MTData(src::TextReader, tgt::TextReader; batchmaker = arraybatch, batchsize = 128, maxlength = typemax(Int),
                batchmajor = false, bucketwidth = 10, numbuckets = min(128, maxlength ÷ bucketwidth))
    emptybuckets = [Any[] for _ in 1:numbuckets]
    return MTData(src, tgt, batchsize, maxlength, batchmajor, bucketwidth, emptybuckets, batchmaker)
end

Base.IteratorSize(::Type{MTData}) = Base.SizeUnknown()
Base.IteratorEltype(::Type{MTData}) = Base.HasEltype()
Base.eltype(::Type{MTData}) = NTuple{2}

In [5]:
# Iteration protocol for MTData.
#
# State values:
#   nothing              - first call; all buckets are reset
#   (srcstate, tgtstate) - reader states while sentence pairs are still being read
#   :drain               - the readers are exhausted; leftover buckets are emitted
#
# Sentence pairs are read in lockstep and pushed into the bucket matching the source
# length. A batch is returned as soon as a bucket holds `d.batchsize` pairs. Once the
# input ends, each non-empty bucket is returned as a final (possibly smaller) batch.
# Source sentences that are empty or longer than `d.maxlength` are skipped.
function Base.iterate(d::MTData, state=nothing)
    if state === nothing
        for i in eachindex(d.buckets)
            d.buckets[i] = []
        end
    end

    while state !== :drain
        srcstate, tgtstate = state === nothing ? (nothing, nothing) : state
        srcnext = iterate(d.src, srcstate)
        tgtnext = iterate(d.tgt, tgtstate)
        if srcnext === nothing || tgtnext === nothing
            # One side ended. If the other still holds an open handle, release it.
            for nxt in (srcnext, tgtnext)
                nxt !== nothing && nxt[2] isa IO && close(nxt[2])
            end
            state = :drain
            break
        end
        src, srcstate = srcnext
        tgt, tgtstate = tgtnext
        state = (srcstate, tgtstate)

        n = length(src)
        (n == 0 || n > d.maxlength) && continue

        # Bucket i holds lengths (i-1)*bucketwidth+1 : i*bucketwidth; anything longer
        # than the last range goes into the last bucket.
        idx = min(cld(n, d.bucketwidth), length(d.buckets))
        push!(d.buckets[idx], (src, tgt))
        if length(d.buckets[idx]) == d.batchsize
            batch = d.batchmaker(d, d.buckets[idx])
            d.buckets[idx] = []
            return (batch, state)
        end
    end

    # Input exhausted: hand out whatever is left, one bucket per call.
    for i in eachindex(d.buckets)
        if !isempty(d.buckets[i])
            batch = d.batchmaker(d, d.buckets[i])
            d.buckets[i] = []
            return (batch, :drain)
        end
    end
    return nothing
end


In [6]:
# Turn a bucket of (src, tgt) id-vector pairs into a pair of integer matrices (x, y)
# with one row per sentence.
#
#   x - size (B, Tx), Tx = longest source sentence; shorter sources are padded on the
#       LEFT with the source eos id.
#   y - size (B, Ty+2), Ty = longest target sentence; every row starts with the target
#       eos id, is followed by the sentence, and is padded on the right with eos, so
#       each row ends with at least one eos.
function arraybatch(d::MTData, bucket)
    srceos = d.src.vocab.eos
    tgteos = d.tgt.vocab.eos
    srclength = mapreduce(pair -> length(pair[1]), max, bucket; init=0)
    tgtlength = mapreduce(pair -> length(pair[2]), max, bucket; init=0)

    # Start from all-eos matrices and overwrite the slots that hold real words.
    x = fill(srceos, length(bucket), srclength)
    y = fill(tgteos, length(bucket), tgtlength + 2)
    for (row, (src, tgt)) in enumerate(bucket)
        x[row, srclength-length(src)+1:srclength] = src
        y[row, 2:length(tgt)+1] = tgt
    end
    return x, y
end

arraybatch (generic function with 1 method)

In [7]:
# Accumulate the loss of `model` over every `(x, y)` batch in `data`.
# The model is called as `model(x, y, average=false)` and must return
# `(summed_loss, instance_count)`. Returns the ratio of the two totals when
# `average=true`, otherwise the tuple `(total_loss, total_count)`.
function loss(model, data; average=true)
    total, count = 0, 0
    for (x, y) in data
        batchloss, batchcount = model(x, y, average=false)
        total += batchloss
        count += batchcount
    end
    return average ? total / count : (total, count)
end

loss (generic function with 1 method)

In [8]:
# Utility to convert int arrays to sentence strings
# Convert an array of word ids into a sentence string.
# Leading eos ids are skipped, and the sentence stops just before the next eos
# (or at the end of the array). Returns "" when the array holds nothing but eos.
function int2str(y,vocab)
    ids = vec(y)
    start = findfirst(w -> !isequal(w, vocab.eos), ids)
    start === nothing && return ""
    stop = findnext(isequal(vocab.eos), ids, start)
    finish = stop === nothing ? length(ids) : stop - 1
    return join(vocab.i2w[ids[start:finish]], " ")
end

int2str (generic function with 1 method)

In [9]:
# Translate the source side of `d` with `s2s`, write the hypotheses to a temporary
# file, and score them against the target file with multi-bleu.perl (downloaded into
# the working directory if missing). Returns the path of the hypothesis file.
function bleu(s2s,d::MTData)
    # Re-wrap the readers with batchsize=1 so sentences are translated one at a time.
    single = MTData(d.src, d.tgt, batchsize=1)
    tgtvocab = single.tgt.vocab
    reffile = single.tgt.file
    hypfile, hypio = mktemp()
    for (x, y) in progress(collect(single))
        translated = s2s(x)
        for row in 1:size(y, 1)
            println(hypio, int2str(translated[row, :], tgtvocab))
        end
    end
    close(hypio)
    if !isfile("multi-bleu.perl")
        download("https://github.com/moses-smt/mosesdecoder/raw/master/scripts/generic/multi-bleu.perl", "multi-bleu.perl")
    end
    run(pipeline(`cat $hypfile`, `perl multi-bleu.perl $reffile`))
    return hypfile
end

bleu (generic function with 1 method)

## S2S: Sequence to sequence model with attention

In this project we will define, train and evaluate a sequence to sequence encoder-decoder
model with attention for Turkish-English machine translation. The model has two extra
fields compared to `S2S_v1`: the `memory` layer computes keys and values from the encoder,
the `attention` layer computes the attention vector for the decoder.

In [10]:
# Memory layer: `w` maps encoder outputs to attention keys (the scalar 1 leaves
# keys identical to values).
struct Memory
    w
end

# Attention layer parameters:
#   wquery - transform applied to the decoder output to form the query
#   wattn  - transform applied to [decoder output; context] to form the attention vector
#   scale  - one-element parameter that scales the scores before softmax
struct Attention
    wquery
    wattn
    scale
end

# Sequence-to-sequence model with attention.
# Shape legend: B batch, Tx/Ty source/target length, Ex/Ey embedding sizes,
# H hidden size, Dx encoder directions, Vy target vocabulary size.
struct S2S
    srcembed::Embed       # encinput(B,Tx) -> srcembed(Ex,B,Tx)
    encoder::RNN          # srcembed(Ex,B,Tx) -> enccell(Dx*H,B,Tx)
    memory::Memory        # enccell(Dx*H,B,Tx) -> keys(H,Tx,B), vals(Dx*H,Tx,B)
    tgtembed::Embed       # decinput(B,Ty) -> tgtembed(Ey,B,Ty)
    decoder::RNN          # tgtembed(Ey,B,Ty) . attnvec(H,B,Ty)[t-1] = (Ey+H,B,Ty) -> deccell(H,B,Ty)
    attention::Attention  # deccell(H,B,Ty), keys(H,Tx,B), vals(Dx*H,Tx,B) -> attnvec(H,B,Ty)
    projection::Linear    # attnvec(H,B,Ty) -> proj(Vy,B,Ty)
    dropout::Real         # dropout probability
    srcvocab::Vocab       # source language vocabulary
    tgtvocab::Vocab       # target language vocabulary
end

## Load pretrained model and data

We will load a pretrained model (16.20 bleu) for code testing.  The data should be loaded
with the vocabulary from the pretrained model for word id consistency.

In [11]:
# Load the pretrained reference model once per session; the file is downloaded on
# first use and cached in the working directory.
if !isdefined(Main, :pretrained) || pretrained === nothing
    @info "Loading reference model"
    isfile("s2smodel.jld2") || download("http://people.csail.mit.edu/deniz/comp542/s2smodel.jld2","s2smodel.jld2")
    pretrained = Knet.load("s2smodel.jld2","model")
end
# Fetch and unpack the dataset archive if the data directory is missing.
datadir = "datasets/tr_to_en"
if !isdir(datadir)
    @info "Downloading data"
    download("http://www.phontron.com/data/qi18naacl-dataset.tar.gz", "qi18naacl-dataset.tar.gz")
    run(`tar xzf qi18naacl-dataset.tar.gz`)
end
# Build readers and batch iterators once per session. The vocabularies come from the
# pretrained model so that word ids match its parameters.
if !isdefined(Main, :tr_vocab)
    BATCHSIZE, MAXLENGTH = 64, 50
    @info "Reading data"
    tr_vocab = pretrained.srcvocab # alternative: Vocab("$datadir/tr.train", mincount=5)
    en_vocab = pretrained.tgtvocab # alternative: Vocab("$datadir/en.train", mincount=5)
    tr_train = TextReader("$datadir/tr.train", tr_vocab)
    en_train = TextReader("$datadir/en.train", en_vocab)
    tr_dev = TextReader("$datadir/tr.dev", tr_vocab)
    en_dev = TextReader("$datadir/en.dev", en_vocab)
    tr_test = TextReader("$datadir/tr.test", tr_vocab)
    en_test = TextReader("$datadir/en.test", en_vocab)
    # Only the training set is length-limited; dev and test keep every sentence.
    dtrn = MTData(tr_train, en_train, batchsize=BATCHSIZE, maxlength=MAXLENGTH)
    ddev = MTData(tr_dev, en_dev, batchsize=BATCHSIZE)
    dtst = MTData(tr_test, en_test, batchsize=BATCHSIZE)
end

┌ Info: Loading reference model
└ @ Main In[11]:2
┌ Info: Reading data
└ @ Main In[11]:14


MTData(TextReader("datasets/tr_to_en/tr.test", Vocab(Dict("dev" => 1277,"komuta" => 13566,"ellisi" => 25239,"adresini" => 22820,"yüzeyi" => 4051,"paris'te" => 9494,"kafamdaki" => 18790,"yüzeyinde" => 5042,"geçerlidir" => 6612,"kökten" => 7774…), ["<s>", "<unk>", ".", ",", "bir", "ve", "bu", "''", "``", "için"  …  "seçmemiz", "destekleyip", "karşılaştırılabilir", "ördeğin", "gününüzü", "bağışçı", "istismara", "yaşça", "tedci", "fakültesi'nde"], 2, 1, split)), TextReader("datasets/tr_to_en/en.test", Vocab(Dict("middle-income" => 13398,"photosynthesis" => 7689,"polarizing" => 17881,"henry" => 4248,"abducted" => 15691,"rises" => 6225,"hampshire" => 13888,"whiz" => 16835,"cost-benefit" => 13137,"progression" => 5549…), ["<s>", "<unk>", ",", ".", "the", "and", "to", "of", "a", "that"  …  "archaea", "handshake", "brit", "wiper", "heroines", "coca", "exceptionally", "gallbladder", "autopsies", "linguistics"], 2, 1, split)), 64, 9223372036854775807, false, 10, Array{Any,1}[[], [], [], [], [], [

## Part 1. Model constructor

The `S2S` constructor takes the following arguments:
* `hidden`: size of the hidden vectors for both the encoder and the decoder
* `srcembsz`, `tgtembsz`: size of the source/target language embedding vectors
* `srcvocab`, `tgtvocab`: the source/target language vocabulary
* `layers=1`: number of layers
* `bidirectional=false`: whether the encoder is bidirectional
* `dropout=0`: dropout probability

Hints:
* You can find the vocabulary size with `length(vocab.i2w)`.
* If the encoder is bidirectional `layers` must be even and the encoder should have `layers÷2` layers.
* The decoder will use "input feeding", i.e. it will concatenate its previous output to its input. Therefore the input size for the decoder should be `tgtembsz+hidden`.
* Only `numLayers`, `dropout`, and `bidirectional` keyword arguments should be used for RNNs, leave everything else default.
* The memory parameter `w` is used to convert encoder states to keys. If the encoder is bidirectional initialize it to a `(hidden,2*hidden)` parameter, otherwise set it to the constant 1.
* The attention parameter `wquery` is used to transform the query, set it to the constant 1 for this project.
* The attention parameter `scale` is used to scale the attention scores before softmax, set it to a parameter of size 1.
* The attention parameter `wattn` is used to transform the concatenation of the decoder output and the context vector to the attention vector. It should be a parameter of size `(hidden,2*hidden)` if unidirectional, `(hidden,3*hidden)` if bidirectional.

In [12]:
# Construct an attention-based sequence-to-sequence model.
#
# Arguments:
#   hidden             - hidden size shared by the encoder and the decoder
#   srcembsz, tgtembsz - source / target embedding sizes
#   srcvocab, tgtvocab - source / target vocabularies
# Keyword arguments:
#   layers        - number of decoder layers; a bidirectional encoder uses layers÷2
#                   layers, so `layers` must then be even
#   bidirectional - whether the encoder is bidirectional
#   dropout       - dropout probability passed to both RNNs and stored in the model
#
# Throws ArgumentError when `bidirectional` is set and `layers` is odd.
function S2S(hidden::Int, srcembsz::Int, tgtembsz::Int, srcvocab::Vocab, tgtvocab::Vocab;
             layers=1, bidirectional=false, dropout=0)
    if bidirectional && layers % 2 != 0
        throw(ArgumentError("layers must be even for a bidirectional encoder, got $layers"))
    end

    srcembed = Embed(length(srcvocab.i2w), srcembsz)
    tgtembed = Embed(length(tgtvocab.i2w), tgtembsz)
    projection = Linear(hidden, length(tgtvocab.i2w))

    if bidirectional
        # Integer division keeps the layer count an Int.
        encoderlayers = layers ÷ 2
        # Encoder outputs have 2*hidden rows: project them to hidden-sized keys, and
        # size wattn for [decoder output (hidden); context (2*hidden)].
        memory = Memory(param(hidden, 2*hidden))
        wattn = param(hidden, 3*hidden)
    else
        encoderlayers = layers
        memory = Memory(1)          # keys are the values themselves
        wattn = param(hidden, 2*hidden)
    end

    # wquery is the constant 1 (queries are used as they are); scale is a 1-element parameter.
    attention = Attention(1, wattn, param(1))
    encoder = RNN(srcembsz, hidden, numLayers = encoderlayers, bidirectional = bidirectional, dropout = dropout)
    # Input feeding: the decoder input is the target embedding joined with the
    # previous attention vector.
    decoder = RNN(tgtembsz+hidden, hidden, numLayers = layers, dropout = dropout)
    return S2S(srcembed, encoder, memory, tgtembed, decoder, attention, projection, dropout, srcvocab, tgtvocab)
end

S2S

In [13]:
# Builds a small bidirectional model (Dx=2, L=2) on the training vocabularies and checks
# every layer's dimensions, the RNN settings, and that the vocabularies are stored by identity.
@testset "Testing S2S constructor" begin
    H,Ex,Ey,Vx,Vy,L,Dx,Pdrop = 8,9,10,length(dtrn.src.vocab.i2w),length(dtrn.tgt.vocab.i2w),2,2,0.2
    m = S2S(H,Ex,Ey,dtrn.src.vocab,dtrn.tgt.vocab;layers=L,bidirectional=(Dx==2),dropout=Pdrop)
    @test size(m.srcembed.w) == (Ex,Vx)
    @test size(m.tgtembed.w) == (Ey,Vy)
    @test m.encoder.inputSize == Ex
    @test m.decoder.inputSize == Ey + H
    @test m.encoder.hiddenSize == m.decoder.hiddenSize == H
    @test m.encoder.direction == Dx-1
    @test m.encoder.numLayers == (Dx == 2 ? L÷2 : L)
    @test m.decoder.numLayers == L
    @test m.encoder.dropout == m.decoder.dropout == Pdrop
    @test size(m.projection.w) == (Vy,H)
    @test size(m.memory.w) == (Dx == 2 ? (H,2H) : ())
    @test m.attention.wquery == 1
    @test size(m.attention.wattn) == (Dx == 2 ? (H,3H) : (H,2H))
    @test size(m.attention.scale) == (1,)
    @test m.srcvocab === dtrn.src.vocab
    @test m.tgtvocab === dtrn.tgt.vocab
end

[37m[1mTest Summary:           | [22m[39m[32m[1mPass  [22m[39m[36m[1mTotal[22m[39m
Testing S2S constructor | [32m  16  [39m[36m   16[39m


Test.DefaultTestSet("Testing S2S constructor", Any[], 16, false)

## Part 2. Memory

The memory layer turns the output of the encoder to a pair of tensors that will be used as
keys and values for the attention mechanism. Remember that the encoder RNN output has size
`(H*D,B,Tx)` where `H` is the hidden size, `D` is 1 for unidirectional, 2 for
bidirectional, `B` is the batchsize, and `Tx` is the sequence length. It will be
convenient to store these values in batch major form for the attention mechanism, so
*values* in memory will be a permuted copy of the encoder output with size `(H*D,Tx,B)`
(see `@doc permutedims`). The *keys* in the memory need to have the same first dimension
as the *queries* (i.e. the decoder hidden states). So *values* will be transformed into
*keys* of size `(H,Tx,B)` with `keys = m.w * values` where `m::Memory` is the memory
layer. Note that you will have to do some reshaping to 2-D and back to 3-D for matrix
multiplications. Also note that `m.w` may be a scalar such as `1` e.g. when `D=1` and we
want keys and values to be identical.

In [14]:
# Multiply an N-D tensor `x` by `w` along its first dimension.
# `w == 1` returns `x` itself and `w == 0` returns the scalar 0. Otherwise `x` is
# flattened to a matrix, multiplied by `w`, and reshaped so that all trailing
# dimensions of `x` are kept.
function mmul(w, x)
    w == 1 && return x
    w == 0 && return 0
    flat = reshape(x, size(x, 1), :)
    return reshape(w * flat, (:, size(x)[2:end]...))
end

mmul (generic function with 1 method)

In [15]:
# Convert encoder output `x` into the `(keys, vals)` pair used by attention.
# `vals` is `x` with its 2nd and 3rd dimensions swapped; `keys` is `vals` transformed
# by `m.w` along the first dimension and has the same trailing dimensions as `vals`.
function (m::Memory)(x)
    vals = permutedims(x, [1, 3, 2])
    _, ntime, nbatch = size(vals)
    projected = mmul(m.w, vals)
    keys = reshape(projected, size(projected)[1], ntime, nbatch)
    return keys, vals
end

You can use the `mmul` helper function defined above (In [14]) for scaling and linear transformations of 3-D tensors.

In [16]:
# Feeds a random encoder-shaped tensor through the pretrained memory layer and checks
# that values are the permuted input and keys equal mmul(w, values).
@testset "Testing memory" begin
    H,D,B,Tx = pretrained.encoder.hiddenSize, pretrained.encoder.direction+1, 4, 5
    x = KnetArray(randn(Float32,H*D,B,Tx))
    k,v = pretrained.memory(x)
    @test v == permutedims(x,(1,3,2))
    @test k == mmul(pretrained.memory.w, v)
end

[37m[1mTest Summary:  | [22m[39m[32m[1mPass  [22m[39m[36m[1mTotal[22m[39m
Testing memory | [32m   2  [39m[36m    2[39m


Test.DefaultTestSet("Testing memory", Any[], 2, false)

## Part 3. Encoder

`encode()` takes a model `s` and a source language minibatch `src`. It passes the input
through `s.srcembed` and `s.encoder` layers with the `s.encoder` RNN hidden states
initialized to `0` in the beginning, and copied to the `s.decoder` RNN at the end. The
steps so far are identical to `S2S_v1` but there is an extra step: The encoder output is
passed to the `s.memory` layer which returns a `(keys,values)` pair. `encode()` returns
this pair to be used later by the attention mechanism.

In [17]:
# Run the source minibatch through the embedding and the encoder, hand the encoder's
# final hidden and cell states to the decoder, and return the `(keys, values)` memory
# computed from the encoder output.
function encode(s::S2S, src)
    # The encoder's hidden and cell states are set to 0 before every batch.
    s.encoder.h, s.encoder.c = 0, 0
    encoded = s.encoder(s.srcembed(src))
    # The decoder starts from the states the encoder ended with.
    s.decoder.h, s.decoder.c = s.encoder.h, s.encoder.c
    return s.memory(encoded)
end

encode (generic function with 1 method)

In [18]:
# Encodes the first training batch with the pretrained model and checks memory shapes,
# that the decoder received the encoder states, and the norms against fixed expected values.
@testset "Testing encoder" begin
    src1,tgt1 = first(dtrn)
    key1,val1 = encode(pretrained, src1)
    H,D,B,Tx = pretrained.encoder.hiddenSize, pretrained.encoder.direction+1, size(src1,1), size(src1,2)
    @test size(key1) == (H,Tx,B)
    @test size(val1) == (H*D,Tx,B)
    @test (pretrained.decoder.h,pretrained.decoder.c) === (pretrained.encoder.h,pretrained.encoder.c)
    @test norm(key1) ≈ 1214.4755f0
    @test norm(val1) ≈ 191.10411f0
    @test norm(pretrained.decoder.h) ≈ 48.536964f0
    @test norm(pretrained.decoder.c) ≈ 391.69028f0
end

[37m[1mTest Summary:   | [22m[39m[32m[1mPass  [22m[39m[36m[1mTotal[22m[39m
Testing encoder | [32m   7  [39m[36m    7[39m


Test.DefaultTestSet("Testing encoder", Any[], 7, false)

## Part 4. Attention

The attention layer takes `cell`: the decoder output, and `mem`: a pair of (keys,vals)
from the encoder, and computes and returns the attention vector. First `a.wquery` is used
to linearly transform the cell to the query tensor. The query tensor is reshaped and/or
permuted as appropriate and multiplied with the keys tensor to compute the attention
scores. Please see `@doc bmm` for the batched matrix multiply operation used for this
step. The attention scores are scaled using `a.scale` and normalized along the time
dimension using `softmax`. After the appropriate reshape and/or permutation, the scores
are multiplied with the `vals` tensor (using `bmm` again) to compute the context
tensor. After the appropriate reshape and/or permutation the context vector is
concatenated with the cell and linearly transformed to the attention vector using
`a.wattn`. Please see the paper and code examples for details.

Note: the paper mentions a final `tanh` transform, however the final version of the
reference code does not use `tanh` and gets better results. Therefore we will skip `tanh`.

In [19]:
# Compute the attention vector for decoder output `cell` given `mem = (keys, vals)`.
#
# Steps (shapes follow the S2S field comments: cell (H,B,Ty), keys (H,Tx,B),
# vals (Dx*H,Tx,B)):
#   1. query   = wquery applied to cell
#   2. scores  = query · keys for every batch element -> (Ty,Tx,B)
#   3. weights = softmax over the source-time dimension of the scaled scores
#   4. context = weights · vals                        -> (Ty,Dx*H,B)
#   5. result  = wattn applied to [cell; context]      -> same trailing dims as cell
function (a::Attention)(cell, mem)
    keys, vals = mem
    # mmul handles both the scalar 1 and a matrix-valued wquery on a 3-D tensor.
    query = mmul(a.wquery, cell)
    scores = bmm(permutedims(query, [3, 1, 2]), keys)
    weights = softmax(scores * a.scale[1], dims = 2)
    context = bmm(weights, permutedims(vals, [2, 1, 3]))
    # Bring the context back to the (features, B, Ty) layout before stacking it under cell.
    stacked = vcat(cell, permutedims(context, [2, 3, 1]))
    return mmul(a.wattn, stacked)
end

In [20]:
# Applies the pretrained attention layer to a seeded random query over the memory of the
# first training batch; the output must keep the query's shape and match a fixed norm.
@testset "Testing attention" begin
    src1,tgt1 = first(dtrn)
    key1,val1 = encode(pretrained, src1)
    H,B = pretrained.encoder.hiddenSize, size(src1,1)
    Knet.seed!(1)
    x = KnetArray(randn(Float32,H,B,5))
    y = pretrained.attention(x, (key1, val1))
    @test size(y) == size(x)
    @test norm(y) ≈ 808.381f0
end

[37m[1mTest Summary:     | [22m[39m[32m[1mPass  [22m[39m[36m[1mTotal[22m[39m
Testing attention | [32m   2  [39m[36m    2[39m


Test.DefaultTestSet("Testing attention", Any[], 2, false)

## Part 5. Decoder

`decode()` takes a model `s`, a target language minibatch `tgt`, the memory from the
encoder `mem` and the decoder output from the previous time step `prev`. After the input
is passed through the embedding layer, it is concatenated with `prev` (this is called
input feeding). The resulting tensor is passed through `s.decoder`. Finally the
`s.attention` layer takes the decoder output and the encoder memory to compute the
"attention vector" which is returned by `decode()`.

In [21]:
# One decoder pass: embed the target tokens, join the embeddings with the previous
# attention vector `prev` (input feeding), run the decoder RNN, and return the attention
# vector computed from the RNN output and the encoder memory `mem`.
function decode(s::S2S, tgt, mem, prev)
    fed = vcat(s.tgtembed(tgt), prev)
    return s.attention(s.decoder(fed), mem)
end

decode (generic function with 1 method)

In [22]:
# Runs a single decoder step on the first target column, using a seeded random tensor as
# the previous attention vector, and checks the output shape and a fixed expected norm.
@testset "Testing decoder" begin
    src1,tgt1 = first(dtrn)
    key1,val1 = encode(pretrained, src1)
    H,B = pretrained.encoder.hiddenSize, size(src1,1)
    Knet.seed!(1)
    cell = randn!(similar(key1, size(key1,1), size(key1,3), 1))
    cell = decode(pretrained, tgt1[:,1:1], (key1,val1), cell)
    @test size(cell) == (H,B,1)
    @test norm(cell) ≈ 131.21631f0
end

[37m[1mTest Summary:   | [22m[39m[32m[1mPass  [22m[39m[36m[1mTotal[22m[39m
Testing decoder | [32m   2  [39m[36m    2[39m


Test.DefaultTestSet("Testing decoder", Any[], 2, false)

## Part 6. Loss

The loss function takes source language minibatch `src`, and a target language minibatch
`tgt` and returns `sumloss/numwords` if `average=true` or `(sumloss,numwords)` if
`average=false` where `sumloss` is the total negative log likelihood loss and `numwords` is
the number of words predicted (including a final eos for each sentence). The source is first
encoded using `encode` yielding a `(keys,vals)` pair (memory). Then the decoder is called to
predict each word of `tgt` given the previous word, `(keys,vals)` pair, and the previous
decoder output. The previous decoder output is initialized with zeros for the first
step. The output of the decoder at each step is passed through the projection layer giving
word scores. Losses can be computed from word scores and masked/shifted `tgt`.

In [23]:
# Negative log likelihood of `tgt` given `src`.
# The source is encoded once. The decoder is then run one target column at a time,
# each step receiving the previous attention vector (zeros for the first step).
# Word scores from all steps are compared with the target shifted by one column,
# after masking it with `mask!`. `average` is forwarded to `nll`.
function (s::S2S)(src, tgt; average=true)
    mem = encode(s, src)
    memkeys = first(mem)
    prev = KnetArray(zeros(Float32, size(memkeys, 1), size(memkeys, 3), 1))
    stepscores = []
    for t in 1:(size(tgt, 2) - 1)
        prev = decode(s, tgt[:, t:t], mem, prev)
        flat = reshape(prev, size(prev, 1), size(prev, 2) * size(prev, 3))
        push!(stepscores, s.projection(flat))
    end
    # One column of scores per (batch element, time step), time steps laid side by side.
    scores = hcat(stepscores...)
    gold = mask!(tgt[:, 2:end], s.tgtvocab.eos)
    return nll(scores, reshape(gold, :), average=average)
end

In [24]:
# Checks the pretrained model's loss on the first training batch against fixed expected
# values, both averaged and as a (sum, count) pair.
@testset "Testing loss" begin
    src1,tgt1 = first(dtrn)
    @test pretrained(src1,tgt1) ≈ 1.4666592f0
    @test all(pretrained(src1,tgt1,average=false) .≈ (1949.1901f0, 1329))
end

[37m[1mTest Summary: | [22m[39m[32m[1mPass  [22m[39m[36m[1mTotal[22m[39m
Testing loss  | [32m   2  [39m[36m    2[39m


Test.DefaultTestSet("Testing loss", Any[], 2, false)

## Part 7. Greedy translator

An `S2S` object can be called with a single argument (source language minibatch `src`, with
size `B,Tx`) to generate translations (target language minibatch with size `B,Ty`). The
keyword argument `stopfactor` determines how much longer the output can be compared to the
input. Similar to the loss function, the source minibatch is encoded, yielding a `(keys,vals)`
pair (memory). We generate the output one time step at a time by calling the decoder with
the last output, the memory, and the last decoder state. The last output is initialized to
an array of `eos` tokens and the last decoder state is initialized to an array of
zeros. After computing the scores for the next word using the projection layer, the highest
scoring words are selected and appended to the output. The generation stops when all outputs
in the batch have generated `eos` or when the length of the output is `stopfactor` times the
input.

In [25]:
# Greedy translation of a source minibatch `src` (one sentence per row).
# Starting from a column of eos tokens and a zero attention vector, the decoder is
# run step by step and the highest-scoring word of each row becomes the next input.
# Generation stops once every row has produced eos at least once, or after
# `stopfactor * size(src, 2)` steps. Returns a matrix with one row per sentence
# and one column per generated step.
function (s::S2S)(src; stopfactor = 3)
    nbatch = size(src, 1)
    mem = encode(s, src)
    eos = s.tgtvocab.eos

    input = fill(eos, nbatch, 1)
    prev = KnetArray{Float32}(zeros(size(first(mem), 1), nbatch, 1))
    finished = fill(false, nbatch)
    maxsteps = stopfactor * size(src, 2)

    steps = []
    while !all(finished)
        prev = decode(s, input, mem, prev)
        flat = reshape(prev, size(prev, 1), size(prev, 2) * size(prev, 3))
        scores = s.projection(flat)
        # Row index of the best-scoring word in every column.
        tokens = vec(map(x -> x[1], argmax(scores, dims=1)))
        input = reshape(tokens, size(tokens, 1), 1)
        push!(steps, tokens)
        finished .|= tokens .== eos
        length(steps) >= maxsteps && break
    end
    return hcat(steps...)
end

In [26]:
# Translates the first training batch with the pretrained model and checks the output
# size and the first few generated word ids against fixed expected values.
@testset "Testing translator" begin
    src1,tgt1 = first(dtrn)
    tgt2 = pretrained(src1)
    @test size(tgt2) == (64, 41)
    @test tgt2[1:3,1:3] == [14 25 10647; 37 25 1426; 27 5 349]
end

[37m[1mTest Summary:      | [22m[39m[32m[1mPass  [22m[39m[36m[1mTotal[22m[39m
Testing translator | [32m   2  [39m[36m    2[39m


Test.DefaultTestSet("Testing translator", Any[], 2, false)

## Part 8. Training

`trainmodel` creates, trains and returns an `S2S` model. The arguments are described in
comments.

In [27]:
"""
    trainmodel(trn, dev, tst...; kwargs...)

Create an `S2S` model from the vocabularies of `trn`, train it with Adam and return it.

With `epochs == iters == 0` the freshly constructed, untrained model is returned
immediately. Otherwise the training batches are collected and shuffled, and every time
the progress callback runs the loss on `dev` (and on each dataset in `tst`) is computed.
A copy of the model is kept whenever the `dev` loss improves, and that copy is what is
returned. If `save` is true the best model is written to `attn-<time_ns>.jld2`; if
`bleu` is true `Main.bleu` is called on the best model and `dev`.
"""
function trainmodel(trn,                  # Training data
                    dev,                  # Validation data, used to determine the best model
                    tst...;               # Zero or more test datasets, their loss will be periodically reported
                    bidirectional = true, # Whether to use a bidirectional encoder
                    layers = 2,           # Number of layers (use `layers÷2` for a bidirectional encoder)
                    hidden = 512,         # Size of the hidden vectors
                    srcembed = 512,       # Size of the source language embedding vectors
                    tgtembed = 512,       # Size of the target language embedding vectors
                    dropout = 0.2,        # Dropout probability
                    epochs = 0,           # Number of epochs (one of epochs or iters should be nonzero for training)
                    iters = 0,            # Number of iterations (one of epochs or iters should be nonzero for training)
                    bleu = false,         # Whether to calculate the BLEU score for the final model
                    save = false,         # Whether to save the final model
                    seconds = 60,         # Frequency of progress reporting
                    )
    @show bidirectional, layers, hidden, srcembed, tgtembed, dropout, epochs, iters, bleu, save
    flush(stdout)

    model = S2S(hidden, srcembed, tgtembed, trn.src.vocab, trn.tgt.vocab;
                layers=layers, dropout=dropout, bidirectional=bidirectional)

    # No training requested: hand back the untrained model.
    if epochs == 0 && iters == 0
        return model
    end

    # Materialize the datasets once so that they can be shuffled and reused.
    trnbatches = collect(trn)
    devbatches = collect(dev)
    tstbatches = collect.(tst)

    # Either `epochs` freshly shuffled passes over the data, or exactly `iters` batches.
    if epochs > 0
        schedule = collect(flatten(shuffle!(trnbatches) for _ in 1:epochs))
    else
        schedule = shuffle!(collect(take(cycle(trnbatches), iters)))
    end

    bestloss = loss(model, devbatches)
    bestmodel = deepcopy(model)
    progress!(adam(model, schedule), seconds=seconds) do y
        devloss = loss(model, devbatches)
        tstloss = map(d -> loss(model, d), tstbatches)
        # Snapshot the model whenever the validation loss improves.
        if devloss < bestloss
            bestloss = devloss
            bestmodel = deepcopy(model)
        end
        println(stderr)
        return (dev=devloss, tst=tstloss, mem=Float32(CuArrays.usage[]))
    end

    if save
        Knet.save("attn-$(Int(time_ns())).jld2", "model", bestmodel)
    end
    if bleu
        # The `bleu` keyword shadows the function, hence the qualified call.
        Main.bleu(bestmodel, dev)
    end
    return bestmodel
end

trainmodel (generic function with 1 method)

Train a model: If your implementation is correct, the first epoch should take about 24
minutes on a v100 and bring the loss from 9.83 to under 4.0. 10 epochs would take about 4
hours on a v100. With other GPUs you may have to use a smaller batch size (if memory is
lower) and longer time (if gpu speed is lower).

In [31]:
# Choose how `model` is obtained by leaving exactly one of the options below uncommented:
#model = pretrained  # Use the reference model
 model = Knet.load("attn-1920283433023258.jld2", "model")  # Load our saved model (relative path; download link at the top of the notebook)
#model = trainmodel(dtrn,ddev,take(dtrn,20); epochs=10, save=true, bleu=true)  # Train a new model from scratch

S2S(Embed(P(KnetArray{Float32,2}(512,38126))), LSTM(input=512,hidden=512,bidirectional,dropout=0.2), Memory(P(KnetArray{Float32,2}(512,1024))), Embed(P(KnetArray{Float32,2}(512,18857))), LSTM(input=1024,hidden=512,layers=2,dropout=0.2), Attention(1, P(KnetArray{Float32,2}(512,1536)), P(KnetArray{Float32,1}(1))), Linear(P(KnetArray{Float32,2}(18857,512)), P(KnetArray{Float32,1}(18857))), 0.2, Vocab(Dict("dev" => 1277,"komuta" => 13566,"ellisi" => 25239,"adresini" => 22820,"yüzeyi" => 4051,"paris'te" => 9494,"kafamdaki" => 18790,"yüzeyinde" => 5042,"geçerlidir" => 6612,"kökten" => 7774…), ["<s>", "<unk>", ".", ",", "bir", "ve", "bu", "''", "``", "için"  …  "seçmemiz", "destekleyip", "karşılaştırılabilir", "ördeğin", "gününüzü", "bağışçı", "istismara", "yaşça", "tedci", "fakültesi'nde"], 2, 1, split), Vocab(Dict("middle-income" => 13398,"photosynthesis" => 7689,"polarizing" => 17881,"henry" => 4248,"abducted" => 15691,"rises" => 6225,"hampshire" => 13888,"whiz" => 16835,"cost-benefit" => 

Code to sample translations from a dataset

In [32]:
# Dev set as single-sentence batches, materialized so that `rand` can pick one.
data1 = collect(MTData(tr_dev, en_dev, batchsize=1));
"""
    translate_sample(model, data)

Pick a random `(source, reference)` pair from `data`, translate the source with `model`
and print the source (`SRC`), the reference (`REF`) and the model output (`OUT`).
"""
function translate_sample(model, data)
    source, reference = rand(data)
    hypothesis = model(source)
    rows = (("SRC: ", source, model.srcvocab),
            ("REF: ", reference, model.tgtvocab),
            ("OUT: ", hypothesis, model.tgtvocab))
    for (label, ids, vocab) in rows
        println(label, int2str(ids, vocab))
    end
end

translate_sample (generic function with 1 method)

Generate translations for random instances from the dev set

In [33]:
# Print the source, reference and model translation of one randomly chosen entry of `data1`.
translate_sample(model, data1)

SRC: sistemi işletiyor .
REF: and he works this system .
OUT: it runs system .


Code to generate translations from user input

In [32]:
"""
    translate_input(model)

Read one line from standard input, tokenize it with the source vocabulary's tokenizer,
map words missing from the vocabulary to its `unk` id, translate the resulting
single-row batch with `model` and print the source (`SRC`) and the output (`OUT`).
"""
function translate_input(model)
    vocab = model.srcvocab
    words = vocab.tokenizer(readline())
    ids = map(word -> get(vocab.w2i, word, vocab.unk), words)
    source = adjoint(ids)    # row vector, i.e. a batch holding one sentence
    hypothesis = model(source)
    println("SRC: ", int2str(source, model.srcvocab))
    println("OUT: ", int2str(hypothesis, model.tgtvocab))
end

translate_input (generic function with 1 method)

Generate translations for user input

In [33]:
# translate_input(model)

## Competition

The reference model `pretrained` achieves a BLEU score of 16.2. By playing with the
optimization algorithm and hyperparameters, using per-sentence loss, and (most importantly)
splitting the Turkish words, I was able to push the performance to 21.0 BLEU. I will give
extra credit to groups that can exceed 21.0 BLEU on this dataset.

*This notebook was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*