## Turkish Words Discriminator

In [1]:
using Knet, Test, Base.Iterators, Printf, LinearAlgebra, CuArrays, Random, IterTools, StatsBase

In [2]:
struct Charset
    c2i::Dict{Any,Int}
    i2c::Vector{Any}
    eow::Int
end

function Charset(charset::String; eow="")
    i2c = [ eow; [ c for c in charset ]  ]
    c2i = Dict( c => i for (i, c) in enumerate(i2c))
    return Charset(c2i, i2c, c2i[eow])
end

struct TextReader
    file::String
    charset::Charset
end

function Base.iterate(r::TextReader, s=nothing)
    s === nothing && (s = open(r.file))
    eof(s) && return close(s)
    word, label = split(readline(s))
    return (([ get(r.charset.c2i, c, r.charset.eow) for c in word ], parse(Int, label) + 1), s)
end

Base.IteratorSize(::Type{TextReader}) = Base.SizeUnknown()
Base.IteratorEltype(::Type{TextReader}) = Base.HasEltype()
Base.eltype(::Type{TextReader}) = Vector{Int}

struct WordsData
    src::TextReader        
    batchsize::Int         
    maxlength::Int         
    batchmajor::Bool       
    bucketwidth::Int    
    buckets::Vector        
    batchmaker::Function   
end

function WordsData(src::TextReader; batchmaker = arraybatch, batchsize = 128, maxlength = typemax(Int),
                batchmajor = false, bucketwidth = 2, numbuckets = min(128, maxlength ÷ bucketwidth))
    buckets = [ [] for i in 1:numbuckets ] # buckets[i] is an array of sentence pairs with similar length
    WordsData(src, batchsize, maxlength, batchmajor, bucketwidth, buckets, batchmaker)
end

Base.IteratorSize(::Type{WordsData}) = Base.SizeUnknown()
Base.IteratorEltype(::Type{WordsData}) = Base.HasEltype()
Base.eltype(::Type{WordsData}) = Tuple{Array{Int64,2},Array{Int64,1}}

function Base.iterate(d::WordsData, state=nothing)
    if state == 0 # When file is finished but buckets are partially full 
        for i in 1:length(d.buckets)
            if length(d.buckets[i]) > 0
                batch = d.batchmaker(d, d.buckets[i])
                d.buckets[i] = []
                return batch, state
            end
        end
        return nothing # Finish iteration
    elseif state === nothing
        # Just to make sure
        for i in 1:length(d.buckets)
            d.buckets[i] = []
        end
        state = nothing
    end

    while true
        src_next = iterate(d.src, state)
        
        if src_next === nothing
            state = 0
            return iterate(d, state)
        end
        
        (src_word, src_state) = src_next
        state = src_state
        src_length = length(src_word[1])
        
        (src_length > d.maxlength) && continue

        i = Int(ceil(src_length / d.bucketwidth))
        i > length(d.buckets) && (i = length(d.buckets))

        push!(d.buckets[i], src_word)
        if length(d.buckets[i]) == d.batchsize
            batch = d.batchmaker(d, d.buckets[i])
            d.buckets[i] = []
            return batch, state
        end
    end
end

function arraybatch(d::WordsData, bucket)
    src_eow = d.src.charset.eow
    
    x = zeros(Int64, length(bucket), d.maxlength) # default d.batchmajor is false
    for (i, v) in enumerate(bucket)
        to_be_added = fill(src_eow, d.maxlength - length(v[1]))
        x[i,:] = [v[1]; to_be_added]
    end
    
    y = [ x[2] for x in bucket]
    
    d.batchmajor && (x = x')
    return (x, y)
end

arraybatch (generic function with 1 method)

In [3]:
char_set = "ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzÇÖÜçöüĞğİıŞş"
datadir = "discriminator_labeled_set"

BATCHSIZE, MAXLENGTH = 32, 25
@info "Reading data"
tr_charset = Charset(char_set)
tr_train = TextReader("$datadir/dis.train", tr_charset)
tr_dev = TextReader("$datadir/dis.dev", tr_charset)
dtrn = WordsData(tr_train, batchsize=BATCHSIZE, maxlength=MAXLENGTH, bucketwidth = 1)
ddev = WordsData(tr_dev, batchsize=BATCHSIZE, maxlength=MAXLENGTH, bucketwidth = 1)

┌ Info: Reading data
└ @ Main In[3]:5


WordsData(TextReader("discriminator_labeled_set/dis.dev", Charset(Dict{Any,Int64}('ç' => 51,'Ğ' => 54,'E' => 6,'Z' => 24,'o' => 39,'B' => 3,'h' => 32,'i' => 33,'r' => 41,'ğ' => 55…), Any["", 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'  …  'Ü', 'ç', 'ö', 'ü', 'Ğ', 'ğ', 'İ', 'ı', 'Ş', 'ş'], 1)), 32, 25, false, 1, Array{Any,1}[[], [], [], [], [], [], [], [], [], []  …  [], [], [], [], [], [], [], [], [], []], arraybatch)

In [4]:
# Define a convolutional layer:
struct Embed; w; end
Embed(charsetsize::Int, embedsize::Int) = Embed(param(embedsize, charsetsize))
(l::Embed)(x) = (em=permutedims(l.w[:, x], [3, 1, 2]); ds=size(em); em=reshape(em, ds[1], ds[2], 1, ds[3])) # (E, B, T) -> (T, E, 1, B)

struct Conv; w; b; f; p; end
(c::Conv)(x) = (co=conv4(c.w, dropout(x,c.p)); c.f.(pool((co .+ c.b); window=(size(x, 1), size(x, 2)))))
Conv(w1::Int,w2::Int,cx::Int,cy::Int,f=relu;pdrop=0) = Conv(param(w1,w2,cx,cy), param0(1,1,cy,1), f, pdrop)

struct Dense; w; b; f; p; end
(d::Dense)(x) = d.f.(d.w * mat(dropout(x,d.p)) .+ d.b) # mat reshapes 4-D tensor to 2-D matrix so we can use matmul
Dense(i::Int,o::Int,f=relu;pdrop=0) = Dense(param(o,i), param0(o), f, pdrop)

# Let's define a chain of layers
struct Chain
    layers
    Chain(layers...) = new(layers)
end

(c::Chain)(x) = (for l in c.layers; x = l(x); end; x)
(c::Chain)(x,y; average=true) = nll(c(x), y; average=average)

# per-word loss (in this case per-batch loss)
function loss(model, data; average=true)
    l = 0
    n = 0
    a = 0
    for (x, y) in data
        v = model(x, y; average=false)
        l += v[1]
        n += v[2]
        a += (v[1] / v[2])
    end
    average && return a
    return l, n
end

loss (generic function with 1 method)

In [5]:
Knet.seed!(1)

EmbeddingSize = 128
model = Chain(
                Embed(length(tr_train.charset.i2c), EmbeddingSize),
                Conv(3,1,1,32; pdrop=0.2),
                Dense(32,2,sigm,pdrop=0.3)
            )

model.layers[1].w[:, tr_train.charset.eow] = KnetArray(zeros(EmbeddingSize))

(x, y) = first(dtrn)
@show model(x)
@show model(x,y; average=false)
@show loss(model, ddev)

model(x) = K32(2,32)[0.499035⋯]
model(x, y; average=false) = (22.180029f0, 32)
loss(model, ddev) = 1066.772f0


1066.772f0

In [6]:
function train!(model, trn, dev, tst...)
    bestmodel, bestloss = deepcopy(model), loss(model, dev)
    progress!(adam(model, trn), seconds=30) do y
        devloss = loss(model, dev)
        tstloss = map(d->loss(model,d), tst)
        if devloss < bestloss
            bestloss, bestmodel = devloss, deepcopy(model)
        end
        println(stderr)
        (dev=devloss, tst=tstloss, mem=Float32(CuArrays.usage[]))
    end
    return bestmodel
end

train! (generic function with 1 method)

In [13]:
@info "Training"

EmbeddingSize = 128
model = Chain(
                Embed(length(tr_train.charset.i2c), EmbeddingSize),
                Conv(5,1,1,50; pdrop=0.2),
                Dense(50,2,sigm,pdrop=0.3)
            )

model.layers[1].w[:, tr_train.charset.eow] = KnetArray(zeros(EmbeddingSize))

epochs = 10
ctrn = collect(dtrn)
trnx10 = collect(flatten(shuffle!(ctrn) for i in 1:epochs))
trnmini = ctrn[1:20]
dev = collect(ddev)

model = train!(model, trnx10, dev, trnmini)

┌ Info: Training
└ @ Main In[13]:1

┣                    ┫ [0.00%, 1/62290, 00:00/06:45:20, 2.56i/s] (dev = 1066.7393f0, tst = (13.862117f0,), mem = 7.9657487f9)
┣█████▋              ┫ [28.30%, 17628/62290, 00:31/01:49, 579.93i/s] (dev = 810.26843f0, tst = (10.370956f0,), mem = 7.9860326f9)
┣███████████▎        ┫ [56.51%, 35201/62290, 01:01/01:48, 578.20i/s] (dev = 810.5525f0, tst = (10.463919f0,), mem = 7.9862666f9)
┣█████████████████   ┫ [85.01%, 52950/62290, 01:32/01:48, 583.94i/s] (dev = 810.8649f0, tst = (10.451368f0,), mem = 7.9862666f9)
┣████████████████████┫ [100.00%, 62290/62290, 01:49/01:49, 570.86i/s] (dev = 809.4863f0, tst = (10.410404f0,), mem = 7.986698f9)


Chain((Embed(P(KnetArray{Float32,2}(128,59))), Conv(P(KnetArray{Float32,4}(5,1,1,10)), P(KnetArray{Float32,4}(1,1,10,1)), NNlib.relu, 0.2), Dense(P(KnetArray{Float32,2}(2,10)), P(KnetArray{Float32,1}(2)), Knet.sigm, 0.3)))

In [14]:
results = []
real = []
for (x, y) in dev
    push!(results, map( x-> x[1], argmax(model(x); dims=1))...)
    push!(real, y...)
end

Acc = sum(map( x -> x[1] == x[2], zip(real, results))) / length(real)
println("CNN Discriminator model's accuracy:", Acc)

CNN Discriminator model's accuracy:0.7840506380973841
