## Turkish Words Discriminator

In [1]:
using Knet, Test, Base.Iterators, Printf, LinearAlgebra, CuArrays, Random, IterTools, StatsBase

In [2]:
struct Charset
    c2i::Dict{Any,Int}
    i2c::Vector{Any}
    eow::Int
end

function Charset(charset::String; eow="")
    i2c = [ eow; [ c for c in charset ]  ]
    c2i = Dict( c => i for (i, c) in enumerate(i2c))
    return Charset(c2i, i2c, c2i[eow])
end

struct TextReader
    file::String
    charset::Charset
end

function Base.iterate(r::TextReader, s=nothing)
    s === nothing && (s = open(r.file))
    eof(s) && return close(s)
    word, label = split(readline(s))
    return (([ get(r.charset.c2i, c, r.charset.eow) for c in word ], parse(Int, label) + 1), s)
end

Base.IteratorSize(::Type{TextReader}) = Base.SizeUnknown()
Base.IteratorEltype(::Type{TextReader}) = Base.HasEltype()
Base.eltype(::Type{TextReader}) = Vector{Int}

struct WordsData
    src::TextReader        
    batchsize::Int         
    maxlength::Int         
    batchmajor::Bool       
    bucketwidth::Int    
    buckets::Vector        
    batchmaker::Function   
end

function WordsData(src::TextReader; batchmaker = arraybatch, batchsize = 128, maxlength = typemax(Int),
                batchmajor = false, bucketwidth = 2, numbuckets = min(128, maxlength ÷ bucketwidth))
    buckets = [ [] for i in 1:numbuckets ] # buckets[i] is an array of sentence pairs with similar length
    WordsData(src, batchsize, maxlength, batchmajor, bucketwidth, buckets, batchmaker)
end

Base.IteratorSize(::Type{WordsData}) = Base.SizeUnknown()
Base.IteratorEltype(::Type{WordsData}) = Base.HasEltype()
Base.eltype(::Type{WordsData}) = Tuple{Array{Int64,2},Array{Int64,1}}

function Base.iterate(d::WordsData, state=nothing)
    if state == 0 # When file is finished but buckets are partially full 
        for i in 1:length(d.buckets)
            if length(d.buckets[i]) > 0
                batch = d.batchmaker(d, d.buckets[i])
                d.buckets[i] = []
                return batch, state
            end
        end
        return nothing # Finish iteration
    elseif state === nothing
        # Just to make sure
        for i in 1:length(d.buckets)
            d.buckets[i] = []
        end
        state = nothing
    end

    while true
        src_next = iterate(d.src, state)
        
        if src_next === nothing
            state = 0
            return iterate(d, state)
        end
        
        (src_word, src_state) = src_next
        state = src_state
        src_length = length(src_word[1])
        
        (src_length > d.maxlength) && continue

        i = Int(ceil(src_length / d.bucketwidth))
        i > length(d.buckets) && (i = length(d.buckets))

        push!(d.buckets[i], src_word)
        if length(d.buckets[i]) == d.batchsize
            batch = d.batchmaker(d, d.buckets[i])
            d.buckets[i] = []
            return batch, state
        end
    end
end

function arraybatch(d::WordsData, bucket)
    src_eow = d.src.charset.eow
    
    x = zeros(Int64, length(bucket), d.maxlength) # default d.batchmajor is false
    for (i, v) in enumerate(bucket)
        to_be_added = fill(src_eow, d.maxlength - length(v[1]))
        x[i,:] = [v[1]; to_be_added]
    end
    
    y = [ x[2] for x in bucket]
    
    d.batchmajor && (x = x')
    return (x, y)
end

arraybatch (generic function with 1 method)

In [3]:
char_set = "ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzÇÖÜçöüĞğİıŞş"
datadir = "discriminator_labeled_set"

BATCHSIZE, MAXLENGTH = 32, 25
@info "Reading data"
tr_charset = Charset(char_set)
tr_train = TextReader("$datadir/dis.train", tr_charset)
tr_dev = TextReader("$datadir/dis.dev", tr_charset)
dtrn = WordsData(tr_train, batchsize=BATCHSIZE, maxlength=MAXLENGTH, bucketwidth = 1)
ddev = WordsData(tr_dev, batchsize=BATCHSIZE, maxlength=MAXLENGTH, bucketwidth = 1)

┌ Info: Reading data
└ @ Main In[3]:5


WordsData(TextReader("discriminator_labeled_set/dis.dev", Charset(Dict{Any,Int64}('ç' => 51,'Ğ' => 54,'E' => 6,'Z' => 24,'o' => 39,'B' => 3,'h' => 32,'i' => 33,'r' => 41,'ğ' => 55…), Any["", 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'  …  'Ü', 'ç', 'ö', 'ü', 'Ğ', 'ğ', 'İ', 'ı', 'Ş', 'ş'], 1)), 32, 25, false, 1, Array{Any,1}[[], [], [], [], [], [], [], [], [], []  …  [], [], [], [], [], [], [], [], [], []], arraybatch)

In [4]:
struct Embed; w; end
Embed(charsetsize::Int, embedsize::Int) = Embed(param(embedsize, charsetsize))
(l::Embed)(x) = (em=permutedims(l.w[:, x], [3, 1, 2]); ds=size(em); em=reshape(em, ds[1], ds[2], 1, ds[3])) # (E, B, T) -> (T, E, 1, B)

struct Conv; w; b; f; p; end
(c::Conv)(x) = (co=conv4(c.w, dropout(x,c.p)); c.f.(pool((co .+ c.b); window=(size(x, 1), size(x, 2)))))
Conv(w1::Int,w2::Int,cx::Int,cy::Int,f=relu;pdrop=0) = Conv(param(w1,w2,cx,cy), param0(1,1,cy,1), f, pdrop)

struct Dense; w; b; f; p; end
(d::Dense)(x) = d.f.(d.w * mat(dropout(x,d.p)) .+ d.b) # mat reshapes 4-D tensor to 2-D matrix so we can use matmul
Dense(i::Int,o::Int,f=relu;pdrop=0) = Dense(param(o,i), param0(o), f, pdrop)

Dense

In [5]:
# Perform convolution then, global-max pooling and concatinate the output and feed it to sequential dense layer 
mutable struct DisModel
    charset::Charset
    embed::Embed
    filters
    dense_layers
end

function DisModel(charset, embeddingsize, filters, denselayers)
    Em = Embed(length(charset.i2c), embeddingsize)
    Em.w[:, charset.eow] = KnetArray(zeros(embeddingsize))
    DisModel(charset, Em, filters, denselayers)
end

function (c::DisModel)(x)
    em = c.embed(x)
    filters_out = []
    for f in c.filters
        push!(filters_out, f(em))
    end
    out = cat(filters_out...;dims=3)
    for l in c.dense_layers
        out = l(out)
    end
    out
end

(c::DisModel)(x,y; average=true) = nll(c(x), y; average=average)

In [6]:
# per-word loss (in this case per-batch loss)
function loss(model, data; average=true)
    l = 0
    n = 0
    a = 0
    for (x, y) in data
        v = model(x, y; average=false)
        l += v[1]
        n += v[2]
        a += (v[1] / v[2])
    end
    average && return a
    return l, n
end

loss (generic function with 1 method)

In [11]:
Knet.seed!(1)

model = DisModel(tr_charset, 128, (
        Conv(2,128,1,20; pdrop=0.2),
        Conv(3,128,1,20; pdrop=0.2),
        Conv(4,128,1,20; pdrop=0.2),
#         Conv(5,1,1,5; pdrop=0.2)
        ),(
        Dense(60,64,pdrop=0.3),
        Dense(64,2,sigm,pdrop=0.3)
        ))

(x, y) = first(dtrn)
@show model(x)
@show model(x,y; average=false)
@show loss(model, ddev)

model(x) = K32(2,32)[0.5025334⋯]
model(x, y; average=false) = (22.184029f0, 32)
loss(model, ddev) = 1066.8767f0


1066.8767f0

In [12]:
function train!(model, trn, dev, tst...)
    bestmodel, bestloss = deepcopy(model), loss(model, dev)
    progress!(adam(model, trn), seconds=30) do y
        devloss = loss(model, dev)
        tstloss = map(d->loss(model,d), tst)
        if devloss < bestloss
            bestloss, bestmodel = devloss, deepcopy(model)
        end
        println(stderr)
        (dev=devloss, tst=tstloss, mem=Float32(CuArrays.usage[]))
    end
    return bestmodel
end

train! (generic function with 1 method)

In [None]:
@info "Training"
epochs = 10
ctrn = collect(dtrn)
trnx10 = collect(flatten(shuffle!(ctrn) for i in 1:epochs))
trnmini = ctrn[1:20]
dev = collect(ddev)

model = train!(model, trnx10, dev, trnmini)

┌ Info: Training
└ @ Main In[13]:1

┣                    ┫ [0.00%, 1/62290, 00:15/267:44:58, 15.47s/i] (dev = 1066.7972f0, tst = (13.863449f0,), mem = 6.084304f8)
┣▌                   ┫ [2.97%, 1850/62290, 00:47/26:17, 58.95i/s] (dev = 757.9115f0, tst = (10.026481f0,), mem = 6.73748f8)
┣█▏                  ┫ [5.86%, 3651/62290, 01:18/22:15, 57.39i/s] (dev = 734.9005f0, tst = (9.882441f0,), mem = 6.742934f8)
┣█▊                  ┫ [8.87%, 5525/62290, 01:50/20:36, 59.71i/s] (dev = 723.7071f0, tst = (9.744415f0,), mem = 6.7429235f8)
┣██▍                 ┫ [11.93%, 7429/62290, 02:21/19:42, 60.68i/s] (dev = 712.6051f0, tst = (9.568416f0,), mem = 6.808178f8)
┣███                 ┫ [15.00%, 9345/62290, 02:52/19:09, 61.08i/s] (dev = 706.6422f0, tst = (9.471064f0,), mem = 6.8789094f8)
┣███▌                ┫ [18.06%, 11251/62290, 03:24/18:48, 60.75i/s] (dev = 703.01807f0, tst = (9.365469f0,), mem = 6.8825466f8)

In [None]:
results = []
real = []
for (x, y) in dev
    push!(results, map( x-> x[1], argmax(model(x); dims=1))...)
    push!(real, y...)
end

Acc = sum(map( x -> x[1] == x[2], zip(real, results))) / length(real)
println("CNN Discriminator model's accuracy:", Acc)