In [15]:
using Knet

include("../src/data.jl")
include("../src/xmodel.jl")
include("../src/train.jl")

In [2]:
# Build the enwik8 train/dev/test datasets, or reload them from a cached file so the raw
# text does not have to be re-read on every run.
datadir = "../data/enwik8"
jld2dir = "../jld2/enwik8.jld2"  # path of the cache *file*, despite the `dir` suffix
BATCHSIZE = 16

if !isfile(jld2dir)
    println("Reading data from directory: $datadir")
    println("Setting batch size to $BATCHSIZE")
    # The vocabulary is built from the training file and shared by all three readers.
    vocab = Vocab("$datadir/train.txt")
    trainfile = TextReader("$datadir/train.txt", vocab)
    validfile = TextReader("$datadir/valid.txt", vocab)
    testfile = TextReader("$datadir/test.txt", vocab)
    dtrn = TextData(trainfile, batchsize=BATCHSIZE)
    ddev = TextData(validfile, batchsize=BATCHSIZE)
    dtst = TextData(testfile, batchsize=BATCHSIZE)
    println("Saving data to $jld2dir")
    # Create the cache directory if needed so the save does not fail on a fresh checkout.
    mkpath(dirname(jld2dir))
    Knet.save(jld2dir, "dtrn", dtrn, "dtst", dtst, "ddev", ddev)
else
    println("Loading data from $jld2dir")
    (dtrn, dtst, ddev) = Knet.load(jld2dir, "dtrn", "dtst", "ddev")
    vocab = dtrn.src.vocab
    # The cached datasets carry the batch size they were saved with; re-batch every
    # dataset whose stored size differs from the one requested above.
    for data in (dtrn, ddev, dtst)
        if data.batchsize != BATCHSIZE
            changebatchsize!(data, BATCHSIZE)
        end
    end
end

Loading data from ../jld2/enwik8.jld2


16

In [16]:
@info "Initializing and Training Language Model"

# Hyperparameters for this run.
epochs = 15
em_size = 1024
hidden_size = 1024
layers = 2
println("embedding size: $em_size")
println("hidden size: $hidden_size")
println("layers: $layers")

println("Collecting training data...")
println("epochs: $epochs")
# Materialize the training batches once; they are reshuffled in place for every epoch.
ctrn = collect(dtrn)
# Full training stream: `epochs` passes over `ctrn`, each pass in a fresh shuffled order.
trn = collect(flatten(shuffle!(ctrn) for _ in 1:epochs))
# Small fixed sample of training batches (the first 20 after the final shuffle).
trnmini = ctrn[1:20]
dev = collect(ddev);

# Model construction is disabled here; `model` must already be defined before training.
# model = XModel(em_size, hidden_size, vocab; layers=layers, dropout=0.2)

embedding size: 1024
hidden size: 1024
layers: 2
Collecting training data...
epochs: 15


┌ Info: Initializing and Training Language Model
└ @ Main In[16]:1


306-element Array{Tuple{Array{Int16,2},Array{Int16,2}},1}:
 ([4 10 … 3 8; 19 8 … 12 4; … ; 10 4 … 17 3; 22 3 … 31 12], [10 9 … 8 22; 8 14 … 4 19; … ; 4 3 … 3 53; 3 55 … 12 5])      
 ([22 3 … 15 4; 19 4 … 12 7; … ; 53 18 … 3 11; 5 30 … 11 3], [3 57 … 4 3; 4 9 … 7 21; … ; 18 18 … 11 4; 30 24 … 3 13])    
 ([3 26 … 17 17; 21 9 … 16 5; … ; 4 5 … 4 9; 13 7 … 27 3], [26 6 … 17 3; 9 36 … 5 3; … ; 5 3 … 9 5; 7 21 … 3 18])         
 ([3 26 … 4 9; 3 26 … 11 7; … ; 5 3 … 23 3; 18 18 … 41 41], [26 4 … 9 5; 26 6 … 7 14; … ; 3 26 … 3 6; 18 75 … 41 1])      
 ([5 16 … 19 6; 14 4 … 4 19; … ; 6 9 … 23 3; 1 1 … 5 4], [16 6 … 6 76; 4 28 … 19 12; … ; 9 14 … 3 22; 1 49 … 4 17])       
 ([76 8 … 5 23; 12 6 … 3 5; … ; 22 8 … 26 4; 17 17 … 3 3], [8 10 … 23 32; 6 9 … 5 13; … ; 8 10 … 4 29; 17 40 … 3 3])      
 ([32 39 … 10 8; 13 4 … 9 3; … ; 29 4 … 4 27; 3 3 … 17 28], [39 8 … 8 20; 4 7 … 3 7; … ; 4 10 … 27 3; 3 3 … 28 3])        
 ([20 4 … 10 6; 7 11 … 33 37; … ; 3 42 … 6 11; 3 11 … 16 10], [4 3 … 6 24; 11 3 

In [None]:
@info "Starting training, total iteration no: $(length(trn))"
# Set the RNN's cell and hidden state fields to 0 before training
# (presumably this resets the recurrent state; confirm against Knet's RNN docs).
model.rnn.c = 0
model.rnn.h = 0
# initopt!(model, length(trn); lr=0.001)

# `length(ctrn)` is the number of batches in one epoch; `trn` is the full multi-epoch stream.
model = train!(model, length(ctrn), trn, dev, trnmini)

┌ Info: Starting training, total iteration no: 82410
└ @ Main In[17]:1

┣                    ┫ [0.00%, 1/82410, 00:55/1258:38:06, 54.98s/i] (trn = (loss = (0.8420045f0,), ppl = (2.3210146f0,), bpc = (1.2147556847848042,)), dev = (loss = 0.9809018f0, ppl = 2.66686f0, bpc = 1.41514213035086))
┣█▎                  ┫ [6.67%, 5494/82410, 46:38/11:39:25, 2.00i/s] (trn = (loss = (0.87678576f0,), ppl = (2.403163f0,), bpc = (1.264934460887768,)), dev = (loss = 0.9875315f0, ppl = 2.6845994f0, bpc = 1.4247067734959442))
┣██▋                 ┫ [13.33%, 10988/82410, 01:33:25/11:40:38, 1.96i/s] (trn = (loss = (0.8767683f0,), ppl = (2.403121f0,), bpc = (1.2649092654294165,)), dev = (loss = 0.98394436f0, ppl = 2.6749866f0, bpc = 1.4195316435488314))
┣████                ┫ [20.00%, 16482/82410, 02:19:01/11:35:03, 2.01i/s] (trn = (loss = (0.86140376f0,), ppl = (2.3664804f0,), bpc = (1.2427429375076617,)), dev = (loss = 0.97802615f0, ppl = 2.659202f0, bpc = 1.410993478855422))

In [5]:
@info "Starting training, total iteration no: $(length(trn))"
# Set the RNN's cell and hidden state fields to 0 before training
# (presumably this resets the recurrent state; confirm against Knet's RNN docs).
model.rnn.c = 0
model.rnn.h = 0
# initopt!(model, length(trn); lr=0.001)

# `length(ctrn)` is the number of batches in one epoch; `trn` is the full multi-epoch stream.
model = train!(model, length(ctrn), trn, dev, trnmini)

┌ Info: Starting training, total iteration no: 27470
└ @ Main In[5]:1


15:13:25  ->  Dev set scores : (loss = 0.95302016f0, ppl = 2.5935307f0, bpc = 1.3749174521058594)
15:54:53  ->  5494 iteration: Training set scores : (loss = 0.9501285f0, ppl = 2.586042f0, bpc = 1.3707456689438804)
15:55:38  ->  Dev set scores after 5494 iteration : (loss = 0.99072576f0, ppl = 2.6931884f0, bpc = 1.4293151346171387)
16:38:44  ->  10988 iteration: Training set scores : (loss = 0.94604945f0, ppl = 2.5755148f0, bpc = 1.3648608525879822)
16:39:33  ->  Dev set scores after 10988 iteration : (loss = 0.9717933f0, ppl = 2.6426792f0, bpc = 1.40200136595506)
17:22:34  ->  16482 iteration: Training set scores : (loss = 0.9210054f0, ppl = 2.5118146f0, bpc = 1.3287299633727945)
17:23:22  ->  Dev set scores after 16482 iteration : (loss = 0.9586783f0, ppl = 2.6082468f0, bpc = 1.3830804366464118)
18:00:32  ->  21976 iteration: Training set scores : (loss = 0.89718586f0, ppl = 2.452691f0, bpc = 1.294365593955944)
18:01:22  ->  Dev set scores after 21976 iteration : (loss = 0.9467925f0,

XModel(Embed(P(KnetArray{Float32,2}(1024,206))), LSTM(input=1024,hidden=1024,layers=2,dropout=0.2), Boom(Linear(P(KnetArray{Float32,2}(4096,1024)), P(KnetArray{Float32,1}(4096))), Linear(P(KnetArray{Float32,2}(1024,4096)), P(KnetArray{Float32,1}(1024))), 0.1, false, gelu), Linear(P(KnetArray{Float32,2}(206,1024)), P(KnetArray{Float32,1}(206))), 0.2, Vocab(Dict("54" => 67,"101" => 4,"41" => 52,"65" => 38,"168" => 126,"159" => 175,"228" => 183,"190" => 117,"227" => 96,"88" => 104…), ["<s>", "<unk>", "32", "101", "116", "97", "105", "111", "110", "114"  …  "210", "239", "211", "198", "212", "240", "205", "220", "222", "200"], 2, 1, split))

In [6]:
# Evaluate the trained model on all three splits, then save it to disk.
@info "Finished training, Starting evaluation ..."
trnloss = loss(model, dtrn)
println("Training set scores:       ", report_lm(trnloss))
devloss = loss(model, ddev)
println("Development set scores:    ", report_lm(devloss))
testloss = loss(model, dtst)
println("Test set scores:           ", report_lm(testloss))

# @info "Generate text using the trained model"
# print(generate(model, start="United Nations ", maxlength=1024))

# Define the output path once so the log message always names the file actually written.
modelfile = "model_x_new.jld2"
@info "Saving the model as $modelfile"
Knet.save(modelfile, "model", model);

┌ Info: Finished training, Starting evaluation ...
└ @ Main In[6]:1


Training set scores:       (loss = 0.800062f0, ppl = 2.225679f0, bpc = 1.1542454808878846)
Development set scores:    (loss = 0.9398322f0, ppl = 2.559552f0, bpc = 1.3558912694148832)
Test set scores:           (loss = 0.9556811f0, ppl = 2.600441f0, bpc = 1.3787563628470936)


┌ Info: Saving the model as model_x.jld2
└ @ Main In[6]:12


In [13]:
# Reload the object stored under the key "model" in model_x_new.jld2, the same file name
# and key that the evaluation cell passes to `Knet.save`.
model = Knet.load("model_x_new.jld2", "model")

XModel(Embed(P(KnetArray{Float32,2}(1024,206))), LSTM(input=1024,hidden=1024,layers=2,dropout=0.2), Boom(Linear(P(KnetArray{Float32,2}(4096,1024)), P(KnetArray{Float32,1}(4096))), Linear(P(KnetArray{Float32,2}(1024,4096)), P(KnetArray{Float32,1}(1024))), 0.1, false, gelu), Linear(P(KnetArray{Float32,2}(206,1024)), P(KnetArray{Float32,1}(206))), 0.2, Vocab(Dict("54" => 67,"101" => 4,"41" => 52,"65" => 38,"168" => 126,"159" => 175,"228" => 183,"190" => 117,"227" => 96,"88" => 104…), ["<s>", "<unk>", "32", "101", "116", "97", "105", "111", "110", "114"  …  "210", "239", "211", "198", "212", "240", "205", "220", "222", "200"], 2, 1, split))

In [14]:
# Set the `opt` field of every model parameter to `nothing`.
foreach(params(model)) do p
    p.opt = nothing
end