## Character language model for Turkish with LSTM

In [1]:
using Knet, Test, Base.Iterators, Printf, LinearAlgebra, CuArrays, Random, IterTools, StatsBase

┌ Info: Recompiling stale cache file /kuacc/users/asafaya19/.julia/compiled/v1.2/Knet/f4vSz.ji for Knet [1902f260-5fb4-5aff-8c31-6271790ab950]
└ @ Base loading.jl:1240


In [2]:
struct Charset
    c2i::Dict{Any,Int}
    i2c::Vector{Any}
    eow::Int
end

function Charset(charset::String; eow="")
    i2c = [ eow; [ c for c in charset ]  ]
    print(i2c)
    c2i = Dict( c => i for (i, c) in enumerate(i2c))
    return Charset(c2i, i2c, c2i[eow])
end

struct TextReader
    file::String
    charset::Charset
end

function Base.iterate(r::TextReader, s=nothing)
    s === nothing && (s = open(r.file))
    eof(s) && return close(s)
    return [ get(r.charset.c2i, c, r.charset.eow) for c in readline(s)], s
end

Base.IteratorSize(::Type{TextReader}) = Base.SizeUnknown()
Base.IteratorEltype(::Type{TextReader}) = Base.HasEltype()
Base.eltype(::Type{TextReader}) = Vector{Int}

struct Embed; w; end

function Embed(charsetsize::Int, embedsize::Int)
    Embed(param(embedsize, charsetsize))
end

function (l::Embed)(x)
    l.w[:, x]
end

struct Linear; w; end

function Linear(inputsize::Int, outputsize::Int)
    Linear(param(outputsize, inputsize))
end

function (l::Linear)(x)
    l.w * x
end

function mask(a, pad)
    a = copy(a)
    for i in 1:size(a, 1)
        j = size(a,2)
        while a[i, j] == pad && j > 1
            if a[i, j - 1] == pad
                a[i, j] = 0
            end
            j -= 1
        end
    end
    return a
end

struct WordsData
    src::TextReader        
    batchsize::Int         
    maxlength::Int         
    batchmajor::Bool       
    bucketwidth::Int    
    buckets::Vector        
    batchmaker::Function   
end

function WordsData(src::TextReader; batchmaker = arraybatch, batchsize = 128, maxlength = typemax(Int),
                batchmajor = false, bucketwidth = 2, numbuckets = min(128, maxlength ÷ bucketwidth))
    buckets = [ [] for i in 1:numbuckets ] # buckets[i] is an array of sentence pairs with similar length
    WordsData(src, batchsize, maxlength, batchmajor, bucketwidth, buckets, batchmaker)
end

Base.IteratorSize(::Type{WordsData}) = Base.SizeUnknown()
Base.IteratorEltype(::Type{WordsData}) = Base.HasEltype()
Base.eltype(::Type{WordsData}) = NTuple{2}

function Base.iterate(d::WordsData, state=nothing)
    if state == 0 # When file is finished but buckets are partially full 
        for i in 1:length(d.buckets)
            if length(d.buckets[i]) > 0
                batch = d.batchmaker(d, d.buckets[i])
                d.buckets[i] = []
                return batch, state
            end
        end
        return nothing # Finish iteration
    elseif state === nothing
        # Just to make sure
        for i in 1:length(d.buckets)
            d.buckets[i] = []
        end
        state = nothing
    end

    while true
        src_next = iterate(d.src, state)
        
        if src_next === nothing
            state = 0
            return iterate(d, state)
        end
        
        (src_word, src_state) = src_next
        state = src_state
        src_length = length(src_word)
        
        (src_length > d.maxlength) && continue

        i = Int(ceil(src_length / d.bucketwidth))
        i > length(d.buckets) && (i = length(d.buckets))

        push!(d.buckets[i], src_word)
        if length(d.buckets[i]) == d.batchsize
            batch = d.batchmaker(d, d.buckets[i])
            d.buckets[i] = []
            return batch, state
        end
    end
end

function arraybatch(d::WordsData, bucket)
    src_eow = d.src.charset.eow
    src_lengths = map(x -> length(x), bucket)
    max_length = max(src_lengths...)
    x = zeros(Int64, length(bucket), max_length + 2) # default d.batchmajor is false

    for (i, v) in enumerate(bucket)
        to_be_added = fill(src_eow, max_length - length(v) + 1)
        x[i,:] = [src_eow; v; to_be_added]
    end

    
    d.batchmajor && (x = x')
    return (x[:, 1:end-1], x[:, 2:end])
end

# per-word loss (in this case per-batch loss)
function loss(model, data; average=true)
    l = 0
    n = 0
    a = 0
    for (x, y) in data
        v = model(x, y; average=false)
        l += v[1]
        n += v[2]
        a += (v[1] / v[2])
    end
    average && return a
    return l, n
end

# Utility to convert int arrays to sentence strings
function int2word(y, charset)
    y = vec(y)
    ysos = findnext(w->!isequal(w, charset.eow), y, 1)
    ysos == nothing && return ""
    yeos = something(findnext(isequal(charset.eow), y, ysos), 1+length(y))
    join(charset.i2c[y[ysos:yeos-1]], " ")
end

int2word (generic function with 1 method)

In [3]:
char_set = "ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzÇÖÜçöüĞğİıŞş"
datadir = "turkish_word_set"

BATCHSIZE, MAXLENGTH = 32, 25
@info "Reading data"
tr_charset = Charset(char_set)
tr_train = TextReader("$datadir/train.tr", tr_charset)
tr_dev = TextReader("$datadir/dev.tr", tr_charset)
dtrn = WordsData(tr_train, batchsize=BATCHSIZE, maxlength=MAXLENGTH, bucketwidth = 3)
ddev = WordsData(tr_dev, batchsize=BATCHSIZE, maxlength=MAXLENGTH, bucketwidth = 3)

┌ Info: Reading data
└ @ Main In[3]:5


Any["", 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'y', 'z', 'Ç', 'Ö', 'Ü', 'ç', 'ö', 'ü', 'Ğ', 'ğ', 'İ', 'ı', 'Ş', 'ş']

WordsData(TextReader("turkish_word_set/dev.tr", Charset(Dict{Any,Int64}('ç' => 51,'Ğ' => 54,'E' => 6,'Z' => 24,'o' => 39,'B' => 3,'h' => 32,'i' => 33,'r' => 41,'ğ' => 55…), Any["", 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'  …  'Ü', 'ç', 'ö', 'ü', 'Ğ', 'ğ', 'İ', 'ı', 'Ş', 'ş'], 1)), 32, 25, false, 3, Array{Any,1}[[], [], [], [], [], [], [], []], arraybatch)

In [4]:
function readwordset(fname)
    words = []
    fi = open(fname)
    while !eof(fi)
        push!(words, readline(fi))
    end
    close(fi)
    words
end

training_set = readwordset("$datadir/train.tr")
test_set = [ readwordset("$datadir/test.tr"); readwordset("$datadir/dev.tr") ]

74076-element Array{Any,1}:
 "detayları"    
 "GİRİYOR"      
 "Tamamiyle"    
 "haberleşmeye" 
 "yazılarımız"  
 "veresiye"     
 "kumaşlar"     
 "Kolektif"     
 "EPO"          
 "ettirerek"    
 "açıklanmaması"
 "başlıcak"     
 "Huysuz"       
 ⋮              
 "tahdidi"      
 "Sharm"        
 "yavaşlatacak" 
 "Belçikalılar" 
 "Kebabı"       
 "EDERİM"       
 "geçenleri"    
 "Raporlar"     
 "yat"          
 "Dağlaroğlu"   
 "girişiminiz"  
 "vereni"       

In [5]:
struct LModel
    srcembed::Embed
    rnn::RNN        
    projection::Linear  
    dropout::Real
    srccharset::Charset 
end

function LModel(hidden::Int, srcembsz::Int, srccharset::Charset;
             layers=1, dropout=0)
    
    srcembed = Embed(length(srccharset.i2c), srcembsz)
    rnn = RNN(srcembsz, hidden; bidirectional=false, numLayers=layers, dropout=dropout)
    projection = Linear(hidden, length(srccharset.i2c))
    
    LModel(srcembed, rnn, projection, dropout, srccharset)
end

LModel

In [6]:
function (s::LModel)(src, tgt; average=true)
    s.rnn.h, s.rnn.c = 0, 0
    srcembed = s.srcembed(src)
    rnn_out = s.rnn(srcembed)
    dims = size(rnn_out)
    output = s.projection(dropout(reshape(rnn_out, dims[1], dims[2] * dims[3]), s.dropout))
    scores = reshape(output, size(output, 1), dims[2], dims[3])
    nll(scores, mask(tgt, s.srccharset.eow); dims=1, average=average)
end

In [7]:
@info "Testing LModel"
Knet.seed!(1)
model = LModel(128, 128, tr_charset; layers=2, dropout=0.2)
(x, y) = first(dtrn)
@show model(x, y; average=false)
@show loss(model, ddev, average=false)

┌ Info: Testing LModel
└ @ Main In[7]:1


model(x, y; average=false) = (1582.5035f0, 388)
loss(model, ddev, average=false) = (951750.75f0, 233392)


(951750.75f0, 233392)

In [9]:
function train!(model, trn, dev, tst...)
    bestmodel, bestloss = deepcopy(model), loss(model, dev)
    progress!(adam(model, trn), seconds=30) do y
        devloss = loss(model, dev)
        tstloss = map(d->loss(model,d), tst)
        if devloss < bestloss
            bestloss, bestmodel = devloss, deepcopy(model)
        end
        println(stderr)
        (dev=devloss, tst=tstloss, mem=Float32(CuArrays.usage[]))
    end
    return bestmodel
end

train! (generic function with 1 method)

In [11]:
@info "Training"
epochs = 15
ctrn = collect(ddev)
trnx10 = collect(flatten(shuffle!(ctrn) for i in 1:epochs))
trnmini = ctrn[1:20]
dev = collect(dtrn)

model = LModel(128, 128, tr_charset; layers=2, dropout=0.2)
model = train!(model, trnx10, dev, trnmini)

┌ Info: Training
└ @ Main In[11]:1

┣                    ┫ [0.01%, 1/11355, 00:03/10:24:35, 3.30s/i] (dev = 12732.853f0, tst = (81.41691f0,), mem = 4.657472f9)
┣█████████▌          ┫ [47.61%, 5406/11355, 00:37/01:17, 162.35i/s] (dev = 6507.473f0, tst = (40.673782f0,), mem = 5.126007f9)
┣███████████████████ ┫ [95.46%, 10839/11355, 01:10/01:13, 163.18i/s] (dev = 6198.9854f0, tst = (37.643074f0,), mem = 5.126007f9)
┣████████████████████┫ [100.00%, 11355/11355, 01:16/01:16, 149.06i/s] (dev = 6182.557f0, tst = (37.823036f0,), mem = 5.126007f9)


LModel(Embed(P(KnetArray{Float32,2}(128,59))), LSTM(input=128,hidden=128,layers=2,dropout=0.2), Linear(P(KnetArray{Float32,2}(59,128))), 0.2, Charset(Dict{Any,Int64}('ç' => 51,'Ğ' => 54,'E' => 6,'Z' => 24,'o' => 39,'B' => 3,'h' => 32,'i' => 33,'r' => 41,'ğ' => 55…), Any["", 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'  …  'Ü', 'ç', 'ö', 'ü', 'Ğ', 'ğ', 'İ', 'ı', 'Ş', 'ş'], 1))

In [12]:
function generate(s::LModel; start="", maxlength=30)
    s.rnn.h, s.rnn.c = 0, 0
    chars = fill(s.srccharset.eow, 1)
    
    starting_index = 1
    for i in 1:length(start)
        push!(chars, s.srccharset.c2i[start[i]])
        charembed = s.srcembed(chars[i:i])
        rnn_out = s.rnn(charembed)
        starting_index += 1
    end
    
    for i in starting_index:maxlength
        charembed = s.srcembed(chars[i:i])
        rnn_out = s.rnn(charembed)
        output = model.projection(dropout(rnn_out, model.dropout))
        push!(chars, s.srccharset.c2i[ sample(s.srccharset.i2c, Weights(Array(softmax(reshape(output, length(s.srccharset.i2c)))))) ] )
        
        if chars[end] == s.srccharset.eow
            break
        end
    end
    
    join([ s.srccharset.i2c[i] for i in chars ], "")
end

generate (generic function with 1 method)

In [13]:
generated_words = [ generate(model) for c in 1:100 ]
intest = [ w for w in generated_words if w in test_set]         
notintraining = [ w for w in generated_words if !(w in training_set)]

println(100 - length(notintraining) , "% of the generated words are words in training set")  
println(length(intest), "% of the generated words are real words that are in the test set")
println("\nExamples of the new generated words:")
println(join(notintraining, "\n"))

8% of the generated words are words in training set
9% of the generated words are real words that are in the test set

Examples of the new generated words:
moje
Podis
aşaşıcıkların
İncele
Biliyoru
önemdir
döküm
aralara
mervenliği
Opurmatısı
rapimlar
Özüylüy
mevanide
Güren
çihkin
hardi
Kayçak
antrü
fan
anlayatimi
San
buzuşlunun
yereleğireceğiş
Gerişiyordu
Özkamatozisi
Mekaf
Emz
haddelerin
parsmuzlarında
elimin
taktıklarda
tacanmanın
gitelerinden
TGT
seğdirde
Lannen
İrtifangına
uyaşi
Yangotrini
özeltilecek
ahrayın
Konuslar
onalarıyla
patlolardır
senimlerini
gösterilmeye
FAPKIKA
yormuştu
yüşülse
Tahverler
midersi
arkelendirdi
pu
kormak
tabatladık
Dib
tadana
değerlendiriyiyeleri
maşın
çaşakıyor
önrüdürde
gerileşici
yapılacı
tehmeyi
nAraj
yerememeyeceğiz
Bulio
vütcü
Yüzyen
Rrestel
OCACLBas
Manasyonlara
Majreaten
adaklarını
çözülmesinde
Celayas
tolonunda
Kanborgas
SIIASI
bankalı
akının
bulanmaması
kayıpça
ininin
Kur
garınmadan
TİZAN
parayetlerin
dosuttur
avuzlarına
abefterini
bıraklacaktan


In [14]:
generated_words = [ generate(model; start="gid") for c in 1:20 ]

20-element Array{String,1}:
 "giderdi"        
 "gidilmeni"      
 "gidenmekten"    
 "giderler"       
 "gideştirilerini"
 "gidebilir"      
 "gidişı"         
 "gidişmeyin"     
 "gidendir"       
 "gidememesi"     
 "gidenleri"      
 "giddeler"       
 "gidişlerle"     
 "gidiyor"        
 "gidişen"        
 "gidersek"       
 "gidişirde"      
 "gidilmesinde"   
 "gidersektir"    
 "gidensilere"    