# Text-GAN Turkish word generation

In [1]:
# run(`curl -o asafaya.omutlu.jld2 "https://doc-0g-70-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/qc1fmd13iog7eno06jm2f54mtssrc19l/1574712000000/14279118924456930669/*/1GNdxziPk0mK4tcWiCecPnvSH-jBEX9tk?e=download"`)

using Knet, Test, Base.Iterators, Printf, LinearAlgebra, CuArrays, Random, IterTools

struct Vocab
    w2i::Dict{String,Int}
    i2w::Vector{String}
    unk::Int
    eos::Int
    tokenizer
end

function Vocab(file::String; tokenizer=split, vocabsize=Inf, mincount=1, unk="<unk>", eos="<s>")
    # set unk and eos tokens frequency to inf because
    # we don't want them to be removed from the vocab set
    cdict = Dict(eos => Inf, unk=>Inf) 
    
    # create vocab set and count occurrences
    for l in eachline(file)
        tokens = tokenizer(l)
        map(w -> cdict[w] = get!(cdict, w, 0) + 1, tokens)
    end
    
    # select words with frequency higher than mincount
    # sort by frequency and delete if vocabsize is determined
    fsorted = sort([ (w, c) for (w, c) in cdict if c >= mincount ], by = x -> x[2], rev = true)
    
    vocabsize == Inf || (fsorted = fsorted[1:vocabsize])

    i2w = [ eos; unk; [ x[1] for x in fsorted[3:end] ] ]
    w2i = Dict( w => i for (i, w) in enumerate(i2w))                
    
    return Vocab(w2i, i2w, w2i[unk], w2i[eos], tokenizer)
end
                
struct TextReader
    file::String
    vocab::Vocab
end
                
function Base.iterate(r::TextReader, s=nothing)
    s === nothing && (s = open(r.file))
    eof(s) && return close(s)
    return [ get(r.vocab.w2i, w, r.vocab.unk) for w in r.vocab.tokenizer(readline(s))], s
end
                
Base.IteratorSize(::Type{TextReader}) = Base.SizeUnknown()
Base.IteratorEltype(::Type{TextReader}) = Base.HasEltype()
Base.eltype(::Type{TextReader}) = Vector{Int}

struct WordsData
    src::TextReader        
    batchsize::Int         
    maxlength::Int         
    batchmajor::Bool       
    bucketwidth::Int    
    buckets::Vector        
end

function WordsData(src::TextReader; batchsize = 128, maxlength = typemax(Int),
                batchmajor = false, bucketwidth = 2, numbuckets = min(128, maxlength ÷ bucketwidth))
    buckets = [ [] for i in 1:numbuckets ] # buckets[i] is an array of sentence pairs with similar length
    WordsData(src, batchsize, maxlength, batchmajor, bucketwidth, buckets)
end

Base.IteratorSize(::Type{WordsData}) = Base.SizeUnknown()
Base.IteratorEltype(::Type{WordsData}) = Base.HasEltype()
Base.eltype(::Type{WordsData}) = NTuple{2}

function Base.iterate(d::WordsData, state=nothing)
    if state == 0 # When file is finished but buckets are partially full 
        for i in 1:length(d.buckets)
            if length(d.buckets[i]) > 0
                buc = d.buckets[i]
                d.buckets[i] = []
                return buc, state
            end
        end
        return nothing # Finish iteration
    end

    while true
        src_next = iterate(d.src, state)
        
        if src_next === nothing
            state = 0
            return iterate(d, state)
        end
        
        (src_word, src_state) = src_next
        state = src_state
        src_length = length(src_word)
        
        (src_length > d.maxlength) && continue

        i = Int(ceil(src_length / d.bucketwidth))
        i > length(d.buckets) && (i = length(d.buckets))

        push!(d.buckets[i], src_word)
        if length(d.buckets[i]) == d.batchsize
            buc = d.buckets[i]
            d.buckets[i] = []
            return buc, state
        end
    end
end

function readwordset(fname)
    words = []
    fi = open(fname)
    while !eof(fi)
        push!(words, readline(fi))
    end
    close(fi)
    words
end

readwordset (generic function with 1 method)

### G/D Common Parts

In [2]:
struct Embed; w; end

function Embed(shape...)
    Embed(param(shape...))
end

# per-word loss (in this case per-batch loss)
function loss(model, data; average=true)
    l = 0
    n = 0
    a = 0
    for (x, y) in data
        v = model(x, y; average=false)
        l += v[1]
        n += v[2]
        a += (v[1] / v[2])
    end
    average && return a
    return l, n
end

loss (generic function with 1 method)

## Recurrent Discriminator

In [3]:
function (l::Embed)(x)
    dims = size(x)
    em = l.w * reshape(x, dims[1], dims[2] * dims[3]) # reshape for multiplication 
    em = reshape(em, size(em, 1), dims[2], dims[3]) # reshape to original size
end

struct Dense; w; b; f; p; end
(d::Dense)(x) = d.f.(d.w * mat(dropout(x,d.p)) .+ d.b) # mat reshapes 3-D tensor to 2-D matrix so we can use matmul
Dense(i::Int,o::Int,f=relu;pdrop=0) = Dense(param(o,i), param0(o), f, pdrop)

mutable struct DisModel
    vocab::Vocab
    embed::Embed
    rnn::RNN
    denselayers
end

# This discriminator uses separate weights for its embedding layer
function DisModel(vocab, embeddingSize::Int, hidden, denselayers; layers=1, dropout=0)
    Em = Embed(embeddingSize, length(vocab.w2i))
    rnn = RNN(embeddingSize, hidden; numLayers=layers, dropout=dropout)
    DisModel(vocab, Em, rnn, denselayers)
end

function (c::DisModel)(x) # the input here is weights of the characters with shape (C, B, T)
    c.rnn.h, c.rnn.c = 0, 0
    em = c.embed(x)
    rnn_out = permutedims(c.rnn(em), [1, 3, 2])
    for l in c.denselayers
        rnn_out = l(rnn_out)
    end
    rnn_out
end

(c::DisModel)(x,y; average=true) = nll(c(x), y; average=average)

## Generator

In [4]:
get_z(shape...) = KnetArray(randn(Float32, shape...))

### Not used 
# concatenate z with embedding vectors, z -> (z_size, B), returns (E+z_size, B, T)
# this will be used to feed Z to generator at each timestep
# function (l::Embed)(x, z)
#     em = l.w[:, x]
#     z_array = cat((z for i in 1:size(em, 3))...; dims=(3))
#     cat(em, z_array; dims=(1))
# end

# Generator model
struct GenModel
    projection::Embed
    rnn::RNN        
    dropout::Real
    vocab::Vocab
    disModel::DisModel
    maxlength::Int
end

function GenModel(inputsize::Int, hidden::Int, vocab::Vocab, disModel::DisModel, maxlength::Int; layers=2, dropout=0)
    rnn = RNN(inputsize, hidden; numLayers=layers, dropout=dropout)
    projection = Embed(hidden, length(vocab.i2w))
    GenModel(projection, rnn, dropout, vocab, disModel, maxlength)
end

# This generator shares the projection layers weights of the discriminator for its projection layer
function GenModel(inputsize::Int, vocab::Vocab, disModel::DisModel, maxlength::Int; layers=2, dropout=0)
    rnn = RNN(inputsize, size(disModel.embed.w, 1); numLayers=layers, dropout=dropout)
    GenModel(disModel.embed, rnn, dropout, vocab, disModel, maxlength)
end

function Z(s::GenModel, batchsize, timesteps)
    z = get_z(s.rnn.inputSize, batchsize, 1) # according to get_z(H, B, layers)
    return cat([ z for i in 1:timesteps]...;dims=3)
end

# Generator forward pass, size(Z) -> inputsize, batchsize, sequencelength
function (s::GenModel)(Z)
    s.rnn.h, s.rnn.c = 0, 0
    rnn_out = s.rnn(Z) 
    dims = size(rnn_out)
    output = s.projection.w' * dropout(reshape(rnn_out, dims[1], dims[2] * dims[3]), s.dropout)
    reshape(softmax(output), size(output, 1), dims[2], dims[3])
end

# Generator loss
function (s::GenModel)(Z, calculateloss::Int; average=true)
    y = Array(ones(Int, size(Z, 2))) # create labels 1 -> real, 2-> not-real
    x = s(Z)
    pads = KnetArray(zeros(Float32, size(x, 1), size(x, 2), s.maxlength - size(x, 3)))
    pads[s.vocab.eos, :, :] .= 1
    x = cat(x, pads; dims=3) # padding
    return s.disModel(x, y;average=average) 
end

function generate(s::GenModel, maxlength, batchsize)
    out = s(Z(s, batchsize, maxlength))
    words = []
    for i in 1:batchsize
        push!(words, join([s.vocab.i2w[x[1]] for x in argmax(out[:, i, :]; dims=1)], " "))
    end
    words
end

generate (generic function with 1 method)

## Word Sampler

In [5]:
struct Sampler
    wordsdata::WordsData
    vocab::Vocab
    genModel::GenModel
    maxBatchsize::Int
end

# this function is similar to gumble softmax, it is used to soften the one-hot-vector of the real samples
# tau -> normalization factor; the bigger the softer
function soften(A; dims=1, tau=2.0) 
    A = A ./ tau
    softmax(A; dims=dims)
end

Base.IteratorSize(::Type{Sampler}) = Base.SizeUnknown()
Base.IteratorEltype(::Type{Sampler}) = Base.HasEltype()
Base.eltype(::Type{Sampler}) = Tuple{KnetArray{Float32,3},Array{Int64,1}}

function Base.iterate(s::Sampler, state=nothing)
    wdatastate = iterate(s.wordsdata, state)
    wdatastate === nothing && (return nothing)
    
    (bucket, state) = wdatastate
    bsize = length(bucket)
    src_eow = s.vocab.eos
    src_lengths = map(x -> length(x), bucket)
    max_length = max(src_lengths...)
    gsize = 1 + rand(bsize:s.maxBatchsize) - bsize # count of words to be generated
    generated = s.genModel(Z(s.genModel, gsize, max_length))

    to_be_cat = [generated, ]
    for (i, v) in enumerate(bucket)
        tindex = [i for i in 1:length(v)]
        onehot = KnetArray(zeros(Float32, length(s.vocab.w2i), 1, max_length))
        onehot[v, :, tindex] .= 1
        onehot = soften(onehot) # soften one hot vectors elements value
        push!(to_be_cat, onehot)
    end
    x = cat(to_be_cat...;dims=2) # concatenate both generated and sampled words
    pads = KnetArray(zeros(Float32, size(x, 1), size(x, 2), s.wordsdata.maxlength - size(x, 3)))
    pads[src_eow, :, :] .= 1
    
#     pads = soften(pads) # getting error 

    x = cat(x, pads; dims=3) # padding
    y = Array(ones(Int, gsize+bsize)) # create labels 1 -> real, 2-> not-real
    y[1:gsize] = y[1:gsize] .+ 1
    
    ind = shuffle(1:gsize+bsize) # used to shuffle the batch
    x, y = x[:, ind, :], y[ind]
    return (x,y), state
end

In [6]:
function train!(model, parameters, trn, dev, tst...)
    bestmodel, bestloss = deepcopy(model), loss(model, dev)
    progress!(adam(model, trn; params=parameters), seconds=30) do y
        devloss = loss(model, dev)
        tstloss = map(d->loss(model,d), tst)
        if devloss < bestloss
            bestloss, bestmodel = devloss, deepcopy(model)
        end
        println(stderr)
        (dev=devloss, tst=tstloss, mem=Float32(CuArrays.usage[]))
    end
    return bestmodel
end

train! (generic function with 1 method)

In [7]:
# per-word loss (in this case per-batch loss)
function loss(model, data; average=true)
    l = 0
    n = 0
    a = 0
    for (x, y) in data
        v = model(x, y; average=false)
        l += v[1]
        n += v[2]
        a += (v[1] / v[2])
    end
    average && return a
    return l, n
end

loss (generic function with 1 method)

In [8]:
datadir = "turkish_text"

BATCHSIZE = 32
MAXLENGTH = 10

tr_vocab = Vocab("$datadir/tr.train", mincount=30)
println("Vocab size ", length(tr_vocab.i2w))
tr_trn = TextReader("$datadir/tr.train", tr_vocab)
tr_dev = TextReader("$datadir/tr.dev", tr_vocab)
dtrn = WordsData(tr_trn, batchsize=BATCHSIZE, maxlength=MAXLENGTH, bucketwidth = 1)
ddev = WordsData(tr_dev, batchsize=BATCHSIZE, maxlength=MAXLENGTH, bucketwidth = 1)

Vocab size 7700


WordsData(TextReader("turkish_text/tr.dev", Vocab(Dict("dev" => 1277,"metan" => 5735,"yüzeyi" => 4051,"görüntüleri" => 3122,"yaşından" => 4777,"yüzeyinde" => 5042,"birçoğu" => 2867,"geçerlidir" => 6612,"2009'da" => 6885,"kenar" => 5186…), ["<s>", "<unk>", ".", ",", "bir", "ve", "bu", "''", "``", "için"  …  "tamamının", "enstrüman", "yapmamızı", "hayranlık", "koruyacak", "sebebiyet", "izleyiciler", "köye", "ilişkilerini", "silahların"], 2, 1, split)), 32, 10, false, 1, Array{Any,1}[[], [], [], [], [], [], [], [], [], []])

In [9]:
EMBEDDING_SIZE = 128
DHIDDEN_SIZE = 256
GDROPOUT = 0.2
DDROPOUT = 0.2

dismodel = DisModel(tr_vocab, EMBEDDING_SIZE, DHIDDEN_SIZE,(
        Dense(DHIDDEN_SIZE * MAXLENGTH, 16, pdrop=DDROPOUT),
        Dense(16, 2, sigm, pdrop=0.2)
        ); dropout=DDROPOUT)

GE_SIZE = 256
Z_SIZE = 128

genmodel = GenModel(Z_SIZE, GE_SIZE, tr_vocab, dismodel, MAXLENGTH; dropout=GDROPOUT, layers=2)
trnsampler = Sampler(dtrn, tr_vocab, genmodel, BATCHSIZE * 2)
devsampler = Sampler(ddev, tr_vocab, genmodel, BATCHSIZE * 2)

Sampler(WordsData(TextReader("turkish_text/tr.dev", Vocab(Dict("dev" => 1277,"metan" => 5735,"yüzeyi" => 4051,"görüntüleri" => 3122,"yaşından" => 4777,"yüzeyinde" => 5042,"birçoğu" => 2867,"geçerlidir" => 6612,"2009'da" => 6885,"kenar" => 5186…), ["<s>", "<unk>", ".", ",", "bir", "ve", "bu", "''", "``", "için"  …  "tamamının", "enstrüman", "yapmamızı", "hayranlık", "koruyacak", "sebebiyet", "izleyiciler", "köye", "ilişkilerini", "silahların"], 2, 1, split)), 32, 10, false, 1, Array{Any,1}[[], [], [], [], [], [], [], [], [], []]), Vocab(Dict("dev" => 1277,"metan" => 5735,"yüzeyi" => 4051,"görüntüleri" => 3122,"yaşından" => 4777,"yüzeyinde" => 5042,"birçoğu" => 2867,"geçerlidir" => 6612,"2009'da" => 6885,"kenar" => 5186…), ["<s>", "<unk>", ".", ",", "bir", "ve", "bu", "''", "``", "için"  …  "tamamının", "enstrüman", "yapmamızı", "hayranlık", "koruyacak", "sebebiyet", "izleyiciler", "köye", "ilişkilerini", "silahların"], 2, 1, split), GenModel(Embed(P(KnetArray{Float32,2}(256,7700))), LST

In [10]:
function gmodel(epochs)
    global genmodel
    global BATCHSIZE
    global MAXLENGTH
    
    ctrn = [ (Z(genmodel, BATCHSIZE, MAXLENGTH), 1) for i in 1:500 ]
    trnxepoch = collect(flatten(shuffle!(ctrn) for i in 1:epochs))
    trnmini = ctrn[1:20]
    dev = [ (Z(genmodel, BATCHSIZE, MAXLENGTH), 1) for i in 1:100 ]
    genmodel = train!(genmodel, params(genmodel)[1:2], trnxepoch, dev, trnmini)
end

function dmodel(batches)
    global trnsampler
    global devsampler
    global dismodel
    
    ctrn = collect(trnsampler)
    trnmini = ctrn[1:20]
    ctrn = shuffle!(ctrn)[1:batches]
    dev = collect(devsampler)
    dismodel = train!(dismodel, params(dismodel), ctrn, dev, trnmini) 
end

@info "Started training..."
for k in 1:20
    println("Turn no:", k)
    println("Ex.Generated words: ", join(generate(genmodel, MAXLENGTH, 5),"\n"))

    println("Training Discriminator:")
    dmodel(Int(ceil(rand() * 30)))
    println("Training Generator:")
    gmodel(Int(ceil(rand() * 10)))
end

┌ Info: Started training...
└ @ Main In[10]:25


Turn no:1
Ex.Generated words: özgür özgür özgür özgür özgür özgür özgür özgür özgür özgür
sarı sarı sarı sarı sarı sarı sarı yazmayı yazmayı bozucu
tarihi tarihi artmaya artmaya artmaya artmaya artmaya artmaya artmaya artmaya
cıvık cıvık dersleri dersleri dersleri dersleri dersleri hücreleri hücreleri hücreleri
meme meme okuduğum okuduğum okuduğum okuduğum olurlar olurlar olurlar olurlar
Training Discriminator:
Effective GPU memory usage: 99.95% (31.704 GiB/31.719 GiB)
CuArrays GPU memory usage: 30.967 GiB
BinnedPool usage: 30.967 GiB (30.967 GiB allocated, 0 bytes cached)
BinnedPool efficiency: 67.24% (20.822 GiB requested, 30.967 GiB allocated)


┌ Error: Out of GPU memory trying to allocate 17.624 MiB
└ @ CuArrays /kuacc/users/asafaya19/.julia/packages/CuArrays/4ZX56/src/memory.jl:125


OutOfMemoryError: OutOfMemoryError()

In [52]:
println("Ex.Generated words: ", join(generate(genmodel, MAXLENGTH, 5),"\n"))

Ex.Generated words: lPjjjjj
sssssss
ssssss
ssssss
ssssss
