In [1]:
using Pkg; haskey(Pkg.installed(),"Knet") || Pkg.add("Knet")
using Statistics: mean
using Base.Iterators: cycle
using Knet: Knet, AutoGrad, Data, param, param0, mat, RNN, dropout, value, nll, adam, minibatch, progress!, converge, Random

In [2]:
# Set display width, load packages, import symbols
ENV["COLUMNS"]=72
using Pkg; for p in ("Knet","Plots"); haskey(Pkg.installed(),p) || Pkg.add(p); end
using Knet: Knet, dir, zeroone, progress, sgd, load, save, gc, Param, KnetArray, gpu, Data, nll, relu, training, dropout # param, param0, xavier
using Statistics: mean
using Base.Iterators: flatten

In [3]:
import LinearAlgebra

In [5]:
atype = Knet.gpu()>=0 ? Knet.KnetArray : Array

Array

In [6]:
function load_data(path)
    xtrain, ytrain = open(path) do f
    xtrain = []
    ytrain = []
    sentence = []
    arcs = []
    count = 1
    for i in enumerate(eachline(f))  
      if i[2] == ""
        push!(xtrain, sentence)
        labels = zeros(count, count)
        for j = 1:count
            if arcs[j] != 0
                labels[j, arcs[j]] = 1
            end
        end
        push!(ytrain, labels)
      elseif i[2][1] != '#'
        temp = split(i[2])
        if temp[1] == "1"
            sentence = []
            arcs = []
            push!(sentence, temp[2])
            push!(arcs, parse(Int64, temp[7]))
            count = 1
        else
            push!(sentence, temp[2]) 
            if isnumeric(temp[7][1])
                push!(arcs, parse(Int64, temp[7]))
            else
                push!(arcs, 0)
            end
            count += 1
        end
      end
    end
    xtrain, ytrain
    end
    xtrain, ytrain
end

load_data (generic function with 1 method)

In [318]:
function load_data2(path)
    xtrain, ytrain = open(path) do f
    xtrain = []
    ytrain = []
    sentence = []
    arcs = []
    count = 1
    for i in enumerate(eachline(f))  
      if i[2] == ""
        push!(xtrain, sentence)
        labels = zeros(count, count)
        push!(ytrain, arcs)
      elseif i[2][1] != '#'
        temp = split(i[2])
        if temp[1] == "1"
            sentence = []
            arcs = []
            push!(sentence, temp[2])
            push!(arcs, parse(Int64, temp[7]))
            count = 1
        else
            push!(sentence, temp[2]) 
            if isnumeric(temp[7][1])
                push!(arcs, parse(Int64, temp[7]))
            else
                push!(arcs, 0)
            end
            count += 1
        end
      end
    end
    xtrain, ytrain
    end
    xtrain, ytrain
end

load_data2 (generic function with 1 method)

In [7]:
function load_embed(path)
    wembed, wembedind = open(path) do f
        wembed = Dict()
        wembedind = []
        for i in enumerate(eachline(f))
            line = i[2]
            tokens = split(line)
            key = tokens[1]
            temp = Array{Float32, 1}()
            for token in tokens[2:end]
                tmp = tryparse(Float32, token)
                append!(temp, tmp)
            end
            wembed[key] = i[1]
            push!(wembedind,temp)
        end
        wembed, wembedind
    end
    wembed, wembedind
end

load_embed (generic function with 1 method)

In [319]:
data2 = load_data2("en-ud-train.conllu")

(Any[Any["Al", "-", "Zaman", ":", "American", "forces", "killed", "Shaikh", "Abdullah", "al"  …  "the", "town", "of", "Qaim", ",", "near", "the", "Syrian", "border", "."], Any["[", "This", "killing", "of", "a", "respected", "cleric", "will", "be", "causing", "us", "trouble", "for", "years", "to", "come", ".", "]"], Any["DPA", ":", "Iraqi", "authorities", "announced", "that", "they", "had", "busted", "up", "3", "terrorist", "cells", "operating", "in", "Baghdad", "."], Any["Two", "of", "them", "were", "being", "run", "by", "2", "officials", "of", "the", "Ministry", "of", "the", "Interior", "!"], Any["The", "MoI", "in", "Iraq", "is", "equivalent", "to", "the", "US", "FBI"  …  "members", "of", "the", "Weathermen", "bombers", "back", "in", "the", "1960s", "."], Any["The", "third", "was", "being", "run", "by", "the", "head", "of", "an", "investment", "firm", "."], Any["You", "wonder", "if", "he", "was", "manipulating", "the", "market", "with", "his", "bombing", "targets", "."], Any["The", "c

In [328]:
data2[2][6]

13-element Array{Any,1}:
  2
  5
  5
  5
  0
  8
  8
  5
 12
 12
 12
  8
  5

In [367]:
wembedind[2]

50-element Array{Float32,1}:
  0.013441
  0.23682 
 -0.16899 
  0.40951 
  0.63812 
  0.47709 
 -0.42852 
 -0.55641 
 -0.364   
 -0.23938 
  0.13001 
 -0.063734
 -0.39575 
  ⋮       
  0.70358 
  0.44858 
 -0.080262
  0.63003 
  0.32111 
 -0.46765 
  0.22786 
  0.36034 
 -0.37818 
 -0.56657 
  0.044691
  0.30392 

In [424]:
[(x,y) for x in 1:3 for y in 1:3]

9-element Array{Tuple{Int64,Int64},1}:
 (1, 1)
 (1, 2)
 (1, 3)
 (2, 1)
 (2, 2)
 (2, 3)
 (3, 1)
 (3, 2)
 (3, 3)

In [447]:
data22 = zip((reshape(x,1,1,length(x)) for x in data2[1]),data2[2])

Base.Iterators.Zip{Tuple{Base.Generator{Array{Any,1},getfield(Main, Symbol("##128#129"))},Array{Any,1}}}((Base.Generator{Array{Any,1},getfield(Main, Symbol("##128#129"))}(getfield(Main, Symbol("##128#129"))(), Any[Any["Al", "-", "Zaman", ":", "American", "forces", "killed", "Shaikh", "Abdullah", "al"  …  "the", "town", "of", "Qaim", ",", "near", "the", "Syrian", "border", "."], Any["[", "This", "killing", "of", "a", "respected", "cleric", "will", "be", "causing", "us", "trouble", "for", "years", "to", "come", ".", "]"], Any["DPA", ":", "Iraqi", "authorities", "announced", "that", "they", "had", "busted", "up", "3", "terrorist", "cells", "operating", "in", "Baghdad", "."], Any["Two", "of", "them", "were", "being", "run", "by", "2", "officials", "of", "the", "Ministry", "of", "the", "Interior", "!"], Any["The", "MoI", "in", "Iraq", "is", "equivalent", "to", "the", "US", "FBI"  …  "members", "of", "the", "Weathermen", "bombers", "back", "in", "the", "1960s", "."], Any["The", "third", "was

In [449]:
iterate(data22)[1][1]

1×1×29 Array{Any,3}:
[:, :, 1] =
 "Al"

[:, :, 2] =
 "-"

[:, :, 3] =
 "Zaman"

...

[:, :, 27] =
 "Syrian"

[:, :, 28] =
 "border"

[:, :, 29] =
 "."

In [421]:
data3 = ((reshape(x,1,1,length(x)) ,y) for x in data2[1] for y in data2[2])

Base.Iterators.Flatten{Base.Generator{Array{Any,1},getfield(Main, Symbol("##103#104"))}}(Base.Generator{Array{Any,1},getfield(Main, Symbol("##103#104"))}(getfield(Main, Symbol("##103#104"))(), Any[Any["Al", "-", "Zaman", ":", "American", "forces", "killed", "Shaikh", "Abdullah", "al"  …  "the", "town", "of", "Qaim", ",", "near", "the", "Syrian", "border", "."], Any["[", "This", "killing", "of", "a", "respected", "cleric", "will", "be", "causing", "us", "trouble", "for", "years", "to", "come", ".", "]"], Any["DPA", ":", "Iraqi", "authorities", "announced", "that", "they", "had", "busted", "up", "3", "terrorist", "cells", "operating", "in", "Baghdad", "."], Any["Two", "of", "them", "were", "being", "run", "by", "2", "officials", "of", "the", "Ministry", "of", "the", "Interior", "!"], Any["The", "MoI", "in", "Iraq", "is", "equivalent", "to", "the", "US", "FBI"  …  "members", "of", "the", "Weathermen", "bombers", "back", "in", "the", "1960s", "."], Any["The", "third", "was", "being", "run"

In [543]:
data4 = ((reshape(cat(rootind,map(getind,x), dims=3),1,length(x)+1),y) for (x,y) in data22)

Base.Generator{Base.Iterators.Zip{Tuple{Base.Generator{Array{Any,1},getfield(Main, Symbol("##128#129"))},Array{Any,1}}},getfield(Main, Symbol("##182#183"))}(getfield(Main, Symbol("##182#183"))(), Base.Iterators.Zip{Tuple{Base.Generator{Array{Any,1},getfield(Main, Symbol("##128#129"))},Array{Any,1}}}((Base.Generator{Array{Any,1},getfield(Main, Symbol("##128#129"))}(getfield(Main, Symbol("##128#129"))(), Any[Any["Al", "-", "Zaman", ":", "American", "forces", "killed", "Shaikh", "Abdullah", "al"  …  "the", "town", "of", "Qaim", ",", "near", "the", "Syrian", "border", "."], Any["[", "This", "killing", "of", "a", "respected", "cleric", "will", "be", "causing", "us", "trouble", "for", "years", "to", "come", ".", "]"], Any["DPA", ":", "Iraqi", "authorities", "announced", "that", "they", "had", "busted", "up", "3", "terrorist", "cells", "operating", "in", "Baghdad", "."], Any["Two", "of", "them", "were", "being", "run", "by", "2", "officials", "of", "the", "Ministry", "of", "the", "Interior", 

In [458]:
rootind = 399999

399999

In [544]:
iterate(data4)[1][1]

1×30 Array{Int64,2}:
 399999  319  12  34409  46  141  …  47795  2  356  1  3390  719  3

In [423]:
map(getind,iterate(data3)[1][1])

1×1×29 Array{Int64,3}:
[:, :, 1] =
 319

[:, :, 2] =
 12

[:, :, 3] =
 34409

...

[:, :, 27] =
 3390

[:, :, 28] =
 719

[:, :, 29] =
 3

In [545]:
wembedmat[:,iterate(data4)[1][1]]

50×1×30 Array{Float64,3}:
[:, :, 1] =
 -0.7589799761772156  
 -0.47426000237464905 
  0.47369998693466187 
  0.7724999785423279  
 -0.7806400060653687  
  0.23232999444007874 
  0.0461140014231205  
  0.8401399850845337  
  0.243709996342659   
  0.022978000342845917
  0.5396400094032288  
 -0.36100998520851135 
  0.9419800043106079  
  ⋮                   
  0.035413000732660294
  0.5883399844169617  
  0.4543899893760681  
 -0.8425400257110596  
  0.10649999976158142 
 -0.059397000819444656
  0.09044899791479111 
  0.30581000447273254 
 -0.6142399907112122  
  0.7895399928092957  
 -0.014116000384092331
  0.6448000073432922  

[:, :, 2] =
  0.542140007019043  
  1.0302000045776367 
  0.8689600229263306 
  0.5001400113105774 
  0.9518200159072876 
 -1.3366999626159668 
 -0.4010699987411499 
  0.3922699987888336 
  0.536620020866394  
  0.48791998624801636
 -0.8468700051307678 
 -0.6293799877166748 
 -1.3402999639511108 
  ⋮                  
 -0.7340899705886841 
  1.3209999799728394 

In [395]:
getind(":")

46

In [361]:
iterate((x^2 for x in 3:12),iterate(x^2 for x in 3:12)[2])

(16, 4)

In [8]:
wembed, wembedind = load_embed("glove.6B.50d.txt")

(Dict{Any,Any}("newdigate"=>164100,"daufuskie"=>254784,"single-arm"=>192007,"titration"=>117879,"qajar"=>66399,"pinheiro"=>38763,"hospitalet"=>282158,"kennedale"=>223560,"tetracyclic"=>353804,"moher"=>167242…), Any[Float32[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.044457, -0.49688, -0.17862, -0.00066023, -0.6566  …  -0.29871, -0.15749, -0.34758, -0.045637, -0.44251, 0.18785, 0.0027849, -0.18411, -0.11514, -0.78581], Float32[0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709, -0.42852, -0.55641, -0.364, -0.23938  …  -0.080262, 0.63003, 0.32111, -0.46765, 0.22786, 0.36034, -0.37818, -0.56657, 0.044691, 0.30392], Float32[0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973, -0.43478, -0.31086, -0.44999, -0.29486  …  -6.3681e-5, 0.068987, 0.087939, -0.10285, -0.13931, 0.22314, -0.080803, -0.35652, 0.016413, 0.10216], Float32[0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603, 0.18157, -0.52393, 0.10381, -0.17566  …  -0.34727, 0.28483, 0.075693, -0.062178, -0.38988, 0.22902, -0.2

In [9]:
wembedind[get(wembed,"and",-1)[2]]

BoundsError: BoundsError

In [409]:
function getind(word; max=400000, root=false)
    abc = get(wembed,lowercase(word),-1)
    if root == true
        return max-1
    elseif (abc >= 0)
        return abc
    else
        return max
    end
end

getind (generic function with 1 method)

In [368]:
rootemdind = 399999

399999

In [12]:
wembedind[]

BoundsError: BoundsError: attempt to access 400000-element Array{Any,1} at index []

In [330]:
wembed["'"][1]

58

In [331]:
wembedind[58]

50-element Array{Float32,1}:
 -0.039369
  1.2036  
  0.35401 
 -0.55999 
 -0.52078 
 -0.66988 
 -0.75417 
 -0.6534  
 -0.23246 
  0.58686 
 -0.40797 
  1.2057  
 -1.11    
  ⋮       
  0.33207 
  0.020538
 -0.60141 
  0.50403 
 -0.083316
  0.20239 
  0.443   
 -0.060769
 -0.42807 
 -0.084135
  0.49164 
  0.085654

In [14]:
wembedind

400000-element Array{Any,1}:
 Float32[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.044457, -0.49688, -0.17862, -0.00066023, -0.6566  …  -0.29871, -0.15749, -0.34758, -0.045637, -0.44251, 0.18785, 0.0027849, -0.18411, -0.11514, -0.78581]   
 Float32[0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709, -0.42852, -0.55641, -0.364, -0.23938  …  -0.080262, 0.63003, 0.32111, -0.46765, 0.22786, 0.36034, -0.37818, -0.56657, 0.044691, 0.30392]          
 Float32[0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973, -0.43478, -0.31086, -0.44999, -0.29486  …  -6.3681e-5, 0.068987, 0.087939, -0.10285, -0.13931, 0.22314, -0.080803, -0.35652, 0.016413, 0.10216]    
 Float32[0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603, 0.18157, -0.52393, 0.10381, -0.17566  …  -0.34727, 0.28483, 0.075693, -0.062178, -0.38988, 0.22902, -0.21617, -0.22562, -0.093918, -0.80375]        
 Float32[0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246, -0.41376, 0.13228, -0.29847, -0.085253  …  -0.094375, 0.

In [17]:
wembeds5 = load_embed("glove.6B.50d.txt")

(Dict{Any,Any}("newdigate"=>164100,"daufuskie"=>254784,"single-arm"=>192007,"titration"=>117879,"qajar"=>66399,"pinheiro"=>38763,"hospitalet"=>282158,"kennedale"=>223560,"tetracyclic"=>353804,"moher"=>167242…), Any[Float32[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.044457, -0.49688, -0.17862, -0.00066023, -0.6566  …  -0.29871, -0.15749, -0.34758, -0.045637, -0.44251, 0.18785, 0.0027849, -0.18411, -0.11514, -0.78581], Float32[0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709, -0.42852, -0.55641, -0.364, -0.23938  …  -0.080262, 0.63003, 0.32111, -0.46765, 0.22786, 0.36034, -0.37818, -0.56657, 0.044691, 0.30392], Float32[0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973, -0.43478, -0.31086, -0.44999, -0.29486  …  -6.3681e-5, 0.068987, 0.087939, -0.10285, -0.13931, 0.22314, -0.080803, -0.35652, 0.016413, 0.10216], Float32[0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603, 0.18157, -0.52393, 0.10381, -0.17566  …  -0.34727, 0.28483, 0.075693, -0.062178, -0.38988, 0.22902, -0.2

In [22]:
dataa = load_data("en-ud-train.conllu")

(Any[Any["Al", "-", "Zaman", ":", "American", "forces", "killed", "Shaikh", "Abdullah", "al"  …  "the", "town", "of", "Qaim", ",", "near", "the", "Syrian", "border", "."], Any["[", "This", "killing", "of", "a", "respected", "cleric", "will", "be", "causing", "us", "trouble", "for", "years", "to", "come", ".", "]"], Any["DPA", ":", "Iraqi", "authorities", "announced", "that", "they", "had", "busted", "up", "3", "terrorist", "cells", "operating", "in", "Baghdad", "."], Any["Two", "of", "them", "were", "being", "run", "by", "2", "officials", "of", "the", "Ministry", "of", "the", "Interior", "!"], Any["The", "MoI", "in", "Iraq", "is", "equivalent", "to", "the", "US", "FBI"  …  "members", "of", "the", "Weathermen", "bombers", "back", "in", "the", "1960s", "."], Any["The", "third", "was", "being", "run", "by", "the", "head", "of", "an", "investment", "firm", "."], Any["You", "wonder", "if", "he", "was", "manipulating", "the", "market", "with", "his", "bombing", "targets", "."], Any["The", "c

In [23]:
datax, datay = dataa

(Any[Any["Al", "-", "Zaman", ":", "American", "forces", "killed", "Shaikh", "Abdullah", "al"  …  "the", "town", "of", "Qaim", ",", "near", "the", "Syrian", "border", "."], Any["[", "This", "killing", "of", "a", "respected", "cleric", "will", "be", "causing", "us", "trouble", "for", "years", "to", "come", ".", "]"], Any["DPA", ":", "Iraqi", "authorities", "announced", "that", "they", "had", "busted", "up", "3", "terrorist", "cells", "operating", "in", "Baghdad", "."], Any["Two", "of", "them", "were", "being", "run", "by", "2", "officials", "of", "the", "Ministry", "of", "the", "Interior", "!"], Any["The", "MoI", "in", "Iraq", "is", "equivalent", "to", "the", "US", "FBI"  …  "members", "of", "the", "Weathermen", "bombers", "back", "in", "the", "1960s", "."], Any["The", "third", "was", "being", "run", "by", "the", "head", "of", "an", "investment", "firm", "."], Any["You", "wonder", "if", "he", "was", "manipulating", "the", "market", "with", "his", "bombing", "targets", "."], Any["The", "c

In [24]:
reshape(datay[7]', 13 * 13)[42:55]

14-element Array{Float64,1}:
 0.0
 0.0
 0.0
 1.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0

In [25]:
datax

12543-element Array{Any,1}:
 Any["Al", "-", "Zaman", ":", "American", "forces", "killed", "Shaikh", "Abdullah", "al"  …  "the", "town", "of", "Qaim", ",", "near", "the", "Syrian", "border", "."]                                 
 Any["[", "This", "killing", "of", "a", "respected", "cleric", "will", "be", "causing", "us", "trouble", "for", "years", "to", "come", ".", "]"]                                                       
 Any["DPA", ":", "Iraqi", "authorities", "announced", "that", "they", "had", "busted", "up", "3", "terrorist", "cells", "operating", "in", "Baghdad", "."]                                             
 Any["Two", "of", "them", "were", "being", "run", "by", "2", "officials", "of", "the", "Ministry", "of", "the", "Interior", "!"]                                                                       
 Any["The", "MoI", "in", "Iraq", "is", "equivalent", "to", "the", "US", "FBI"  …  "members", "of", "the", "Weathermen", "bombers", "back", "in", "the", "1960s", "."]       

In [26]:
datax[1]

29-element Array{Any,1}:
 "Al"      
 "-"       
 "Zaman"   
 ":"       
 "American"
 "forces"  
 "killed"  
 "Shaikh"  
 "Abdullah"
 "al"      
 "-"       
 "Ani"     
 ","       
 ⋮         
 "mosque"  
 "in"      
 "the"     
 "town"    
 "of"      
 "Qaim"    
 ","       
 "near"    
 "the"     
 "Syrian"  
 "border"  
 "."       

In [27]:
datay

12543-element Array{Any,1}:
 [0.0 0.0 … 0.0 0.0; 1.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 1.0 0.0 … 0.0 0.0]
 [0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]
 [0.0 0.0 … 0.0 0.0; 1.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 1.0 0.0 … 0.0 0.0]
 [0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]
 [0.0 1.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]
 [0.0 1.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]
 [0.0 1.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 1.0 … 0.0 0.0]
 [0.0 1.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]
 [0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]
 [0.0 1.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 1.0 … 0.0 0.0]
 [0.0 0.0 … 0.0 0.0; 1.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 1.0 0.0 … 0.0 0.0]
 [0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0

In [28]:
newdatay = [reshape(y', size(y)[1] * size(y)[2]) for y in datay]

12543-element Array{Base.ReshapedArray{Float64,1,LinearAlgebra.Adjoint{Float64,Array{Float64,2}},Tuple{Base.MultiplicativeInverses.SignedMultiplicativeInverse{Int64}}},1}:
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0  …  0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
 [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
 [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
 [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
 [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
 [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

In [332]:
@doc Knet.nll

```
nll(scores, answers; dims=1, average=true)
```

Given an unnormalized `scores` matrix and an `Integer` array of correct `answers`, return the per-instance negative log likelihood. `dims=1` means instances are in columns, `dims=2` means instances are in rows.  Use `average=false` to return the sum instead of per-instance average.

```
nll(model, data; dims=1, average=true, o...)
```

Compute `nll(model(x; o...), y; dims)` for `(x,y)` in `data` and return the per-instance average (if average=true) or total (if average=false) negative log likelihood.


In [30]:
mat1 = reshape(1:30, (5,2,3))

5×2×3 reshape(::UnitRange{Int64}, 5, 2, 3) with eltype Int64:
[:, :, 1] =
 1   6
 2   7
 3   8
 4   9
 5  10

[:, :, 2] =
 11  16
 12  17
 13  18
 14  19
 15  20

[:, :, 3] =
 21  26
 22  27
 23  28
 24  29
 25  30

In [31]:
mat2 = Knet.mat(mat1, dims = 1)

5×6 reshape(::UnitRange{Int64}, 5, 6) with eltype Int64:
 1   6  11  16  21  26
 2   7  12  17  22  27
 3   8  13  18  23  28
 4   9  14  19  24  29
 5  10  15  20  25  30

In [32]:
struct Embed; w; end

Embed(vocab::Int,embed::Int)=Embed(param(embed,vocab))

(e::Embed)(x) = e.w[:,x]  # (B,T)->(X,B,T)->rnn->(H,B,T)

In [33]:
wemb = reshape(1:9,3,3)

3×3 reshape(::UnitRange{Int64}, 3, 3) with eltype Int64:
 1  4  7
 2  5  8
 3  6  9

In [411]:
reshape(wembedind, 400000, 50)

DimensionMismatch: DimensionMismatch("new dimensions (400000, 50) must be consistent with array size 400000")

In [198]:
length(wembedind[1])

50

In [212]:
phi(x, d) = [x.^i for i=0:d]

phi (generic function with 1 method)

In [337]:
phi([1;2;3], 3)

3×4 Array{Int64,2}:
 1  1  1   1
 1  2  4   8
 1  3  9  27

In [336]:
phi(x, d) = x.^((0:d)')

phi (generic function with 1 method)

In [335]:
# Minibatch data into (x,y,b) triples. This is the most complicated part of the code:
# for language models x and y contain the same words shifted, x has an EOS in the beginning, y has an EOS at the end
# x,y = [ s11,s21,s31,...,s12,s22,...] i.e. all the first words followed by all the second words etc.
# b = [b1,b2,...,bT] i.e. how many sentences have first words, how many have second words etc.
# length(x)==length(y)==sum(b) and length(b)=length(s1)+1 (+1 because of EOS)
# sentences in batch should be sorted from longest to shortest, i.e. s1 is the longest sentence
function mb(sentences,batchsize)
    sentences = sort(sentences,by=length,rev=true)
    data = []; eos = VOCABSIZE
    for i = 1:batchsize:length(sentences)
        j = min(i+batchsize-1,length(sentences))
        sij = view(sentences,i:j)
        T = 1+length(sij[1])
        x = UInt16[]; y = UInt16[]; b = UInt16[]
        for t=1:T
            bt = 0
            for s in sij
                if t == 1
                    push!(x,eos)
                    push!(y,s[1])
                elseif t <= length(s)
                    push!(x,s[t-1])
                    push!(y,s[t])
                elseif t == 1+length(s)
                    push!(x,s[t-1])
                    push!(y,eos)
                else
                    break
                end
                bt += 1
            end
            push!(b,bt)
        end
        push!(data,(x,y,b))
    end
    return data
end

#mbtrn = mb(trn,BATCHSIZE)
#mbval = mb(val,BATCHSIZE)
#mbtst = mb(tst,BATCHSIZE)
#map(length,(mbtrn,mbval,mbtst))

mb (generic function with 1 method)

In [208]:
wembedmat = zeros(length(wembedind[1]), length(wembedind))
for i=1:length(wembedind)
    wembedmat[:,i] = wembedind[i]
end

In [413]:
wembedmat

50×400000 Array{Float64,2}:
  0.418        0.013441  …  -0.51181   -0.75898    0.072617
  0.24968      0.23682       0.058706  -0.47426   -0.51393 
 -0.41242     -0.16899       1.0913     0.4737     0.4728  
  0.1217       0.40951      -0.55163    0.7725    -0.52202 
  0.34527      0.63812      -0.10249   -0.78064   -0.35534 
 -0.044457     0.47709   …  -0.1265     0.23233    0.34629 
 -0.49688     -0.42852       0.99503    0.046114   0.23211 
 -0.17862     -0.55641       0.079711   0.84014    0.23096 
 -0.00066023  -0.364        -0.16246    0.24371    0.26694 
 -0.6566      -0.23938       0.56488    0.022978   0.41028 
  0.27843      0.13001   …  -0.63306    0.53964    0.28031 
 -0.14767     -0.063734     -0.48592   -0.36101    0.14107 
 -0.55677     -0.39575       0.76247    0.94198   -0.30212 
  ⋮                      ⋱                                 
  0.012041     0.70358      -0.18204    0.035413  -0.83629 
 -0.054223     0.44858       0.041465   0.58834   -0.24698 
 -0.29871   

In [217]:
xt = [1 0 0;
     0 1 0;
     0 0 1]

3×3 Array{Int64,2}:
 1  0  0
 0  1  0
 0  0  1

In [287]:
function softloss(ygold, ypred)
    ynorm = ypred .- log(sum(exp(ypred),1))
    -sum(ygold .* ynorm) / size(ygold,2)
end

softloss (generic function with 1 method)

In [288]:
softloss(xta1,xts1)

DimensionMismatch: DimensionMismatch("matrix is not square: dimensions are (3, 1)")

In [277]:
# For classification we use negative log likelihood loss (aka cross entropy, softmax loss, NLL)
# This is the average -log probability assigned to correct answers by the model
function nlln1(scores, y)
    expscores = exp.(scores)
    probabilities = expscores ./ sum(expscores, dims=1)
    answerprobs = (probabilities[y[i],i] for i in 1:length(y))
    mean(-log.(answerprobs))
end

nlln1 (generic function with 1 method)

In [278]:
expscores = xts1

3×1 Array{Int64,2}:
 1
 0
 0

In [279]:
probabilities = expscores ./ sum(expscores, dims=1)

3×1 Array{Float64,2}:
 1.0
 0.0
 0.0

In [282]:
answerprobs = (probabilities[xta1[i],i] for i in 1:length(xta1))

Base.Generator{UnitRange{Int64},getfield(Main, Symbol("##63#64"))}(getfield(Main, Symbol("##63#64"))(), 1:3)

In [285]:
iterate(answerprobs, iterate(answerprobs)[2])

BoundsError: BoundsError: attempt to access 3×1 Array{Float64,2} at index [0, 2]

In [296]:
xta1 = [1 0; 0 0; 0 0]
xts1 = [1 0; 0 0; 0 0]

3×2 Array{Int64,2}:
 1  0
 0  0
 0  0

In [305]:
xta1 = [1; 0; 0]
xts1 = [1; 0; 0]

3-element Array{Int64,1}:
 1
 0
 0

In [304]:
xta1 = reshape(xta1, 3,1)
xts1 = reshape(xts1, 1,3)

1×3 Array{Int64,2}:
 1  0  0

In [316]:
nll(xts1, [3], dims=1)

1.551444713932051

In [215]:
@doc Knet.nll

```
nll(scores, answers; dims=1, average=true)
```

Given an unnormalized `scores` matrix and an `Integer` array of correct `answers`, return the per-instance negative log likelihood. `dims=1` means instances are in columns, `dims=2` means instances are in rows.  Use `average=false` to return the sum instead of per-instance average.

```
nll(model, data; dims=1, average=true, o...)
```

Compute `nll(model(x; o...), y; dims)` for `(x,y)` in `data` and return the per-instance average (if average=true) or total (if average=false) negative log likelihood.


In [220]:
@doc Knet.softmax

```
softmax(x; dims=1, algo=1)
```

The softmax function typically used in classification. Gives the same results as to `exp.(logp(x, dims))`. 

If `algo=1` computation is more accurate, if `algo=0` it is  faster. 

See also `logsoftmax`.


In [None]:
struct wmat; wm; end


In [209]:
wembedmat

50×400000 Array{Float64,2}:
  0.418        0.013441  …  -0.51181   -0.75898    0.072617
  0.24968      0.23682       0.058706  -0.47426   -0.51393 
 -0.41242     -0.16899       1.0913     0.4737     0.4728  
  0.1217       0.40951      -0.55163    0.7725    -0.52202 
  0.34527      0.63812      -0.10249   -0.78064   -0.35534 
 -0.044457     0.47709   …  -0.1265     0.23233    0.34629 
 -0.49688     -0.42852       0.99503    0.046114   0.23211 
 -0.17862     -0.55641       0.079711   0.84014    0.23096 
 -0.00066023  -0.364        -0.16246    0.24371    0.26694 
 -0.6566      -0.23938       0.56488    0.022978   0.41028 
  0.27843      0.13001   …  -0.63306    0.53964    0.28031 
 -0.14767     -0.063734     -0.48592   -0.36101    0.14107 
 -0.55677     -0.39575       0.76247    0.94198   -0.30212 
  ⋮                      ⋱                                 
  0.012041     0.70358      -0.18204    0.035413  -0.83629 
 -0.054223     0.44858       0.041465   0.58834   -0.24698 
 -0.29871   

In [211]:
wembedind[2]

50-element Array{Float32,1}:
  0.013441
  0.23682 
 -0.16899 
  0.40951 
  0.63812 
  0.47709 
 -0.42852 
 -0.55641 
 -0.364   
 -0.23938 
  0.13001 
 -0.063734
 -0.39575 
  ⋮       
  0.70358 
  0.44858 
 -0.080262
  0.63003 
  0.32111 
 -0.46765 
  0.22786 
  0.36034 
 -0.37818 
 -0.56657 
  0.044691
  0.30392 

In [204]:
x = [1 3;2 4]

2×2 Array{Int64,2}:
 1  3
 2  4

In [206]:
x[:,1] = [51;52]

2-element Array{Int64,1}:
 51
 52

In [195]:
x[1,2] = 55

55

In [207]:
x

2×2 Array{Int64,2}:
 51  3
 52  4

In [36]:
wemb[:,x]

3×2×2 Array{Int64,3}:
[:, :, 1] =
 1  7
 2  8
 3  9

[:, :, 2] =
 4  1
 5  2
 6  3

In [37]:
sa = "ab"
sb = codeunits(sa)

2-element Base.CodeUnits{UInt8,String}:
 0x61
 0x62

In [38]:
for s in sb
    print(s)
    print("\n")
end

97
98


In [39]:
@doc Knet.@diff

Usage:

```
x = Param([1,2,3])          # user declares parameters with `Param`
x => P([1,2,3])             # `Param` is just a struct wrapping a value
value(x) => [1,2,3]         # `value` returns the thing wrapped
sum(x .* x) => 14           # Params act like regular values
y = @diff sum(x .* x)       # Except when we differentiate using `@diff`
y => T(14)                  # you get another struct
value(y) => 14              # which carries the same result
params(y) => [x]            # and the Params that it depends on 
grad(y,x) => [2,4,6]        # and the gradients for all Params
```

`Param(x)` returns a struct that acts like `x` but marks it as a parameter you want to compute gradients with respect to.

`@diff expr` evaluates an expression and returns a struct that contains the result (which should be a scalar) and gradient information.

`grad(y, x)` returns the gradient of `y` (output by @diff) with respect to any parameter `x::Param`, or  `nothing` if the gradient is 0.

`value(x)` returns the value associated with `x` if `x` is a `Param` or the output of `@diff`, otherwise returns `x`.

`params(x)` returns an iterator of Params found by a recursive search of object `x`.

Alternative usage:

```
x = [1 2 3]
f(x) = sum(x .* x)
f(x) => 14
grad(f)(x) => [2 4 6]
gradloss(f)(x) => ([2 4 6], 14)
```

Given a scalar valued function `f`, `grad(f,argnum=1)` returns another function `g` which takes the same inputs as `f` and returns the gradient of the output with respect to the argnum'th argument. `gradloss` is similar except the resulting function also returns f's output.


In [40]:
function tirtembed(str::String)
    strb = codeunits(str)
    sum=0
    for (i, aa) in enumerate(strb)
        sum += 2^(i-1) * aa
    end
    atype = Knet.gpu()>=0 ? Knet.KnetArray : Array
    Random.seed!(sum)
    atype(randn(Float32, 20,1))
end

tirtembed (generic function with 1 method)

In [641]:
struct Dense; w; b; f; end
Dense(i::Int,o::Int,f=identity) = Dense(param(o,i;atype=Array{Float64}), param0(o;atype=Array{Float64}), f)
(d::Dense)(x) = d.f.(d.w * mat(x,dims=1) .+ d.b)

In [642]:
struct Linear; w; b; end

Linear(input::Int, output::Int)=Linear(param(output,input;atype=Array{Float64}), param0(output;atype=Array{Float64}))

(l::Linear)(x) = l.w * mat(x,dims=1) .+ l.b  # (H,B,T)->(H,B*T)->(V,B*T)
(l::Linear)(x,y)= quadl(l(x),y)[1]

In [43]:
@doc LinearAlgebra.axpy!

```
axpy!(a, X, Y)
```

Overwrite `Y` with `a*X + Y`, where `a` is a scalar. Return `Y`.

# Examples

```jldoctest
julia> x = [1; 2; 3];

julia> y = [4; 5; 6];

julia> BLAS.axpy!(2, x, y)
3-element Array{Int64,1}:
  6
  9
 12
```


In [44]:
@doc Knet.axpy!

```
axpy!(a, X, Y)
```

Overwrite `Y` with `a*X + Y`, where `a` is a scalar. Return `Y`.

# Examples

```jldoctest
julia> x = [1; 2; 3];

julia> y = [4; 5; 6];

julia> BLAS.axpy!(2, x, y)
3-element Array{Int64,1}:
  6
  9
 12
```


In [45]:
quadl(yp, yg) = (yp[1]-yg[1]) * (yp[1]-yg[1])
#quadl(yp::AbstractArray, yg::AbstractArray) = (yp[1]-yg[1]) * (yp[1]-yg[1])

quadl (generic function with 1 method)

In [46]:
quadl([5],[2])

9

In [47]:
Knet.params(lin2)[1]

UndefVarError: UndefVarError: lin2 not defined

In [48]:
Knet.params(lin1)[2]

UndefVarError: UndefVarError: lin1 not defined

In [49]:
diff1 = Knet.@diff lin1


Stacktrace:
 [1] [1m(::getfield(Main, Symbol("##10#11")))[22m[1m([22m[1m)[22m at [1mC:\Users\dfhdhsd\.julia\packages\AutoGrad\KsPMr\src\core.jl:197[22m
 [2] [1m#differentiate#3[22m[1m([22m::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}, ::Function, ::Function[1m)[22m at [1mC:\Users\dfhdhsd\.julia\packages\AutoGrad\KsPMr\src\core.jl:144[22m
 [3] [1mdifferentiate[22m[1m([22m::Function[1m)[22m at [1mC:\Users\dfhdhsd\.julia\packages\AutoGrad\KsPMr\src\core.jl:135[22m
 [4] top-level scope at [1mIn[49]:1[22m
 [5] [1meval[22m at [1m.\boot.jl:328[22m [inlined]
 [6] [1msoftscope_include_string[22m[1m([22m::Module, ::String, ::String[1m)[22m at [1mC:\Users\dfhdhsd\.julia\packages\SoftGlobalScope\cSbw5\src\SoftGlobalScope.jl:218[22m
 [7] [1mexecute_request[22m[1m([22m::ZMQ.Socket, ::IJulia.Msg[1m)[22m at [1mC:\Users\dfhdhsd\.julia\packages\IJulia\9ajf8\src\execute_request.jl:67[22m
 [8] [1m#invokelatest#1[22m at [1m.\essenti

UndefVarError: UndefVarError: lin1 not defined

In [50]:
Knet.params(diff1)

UndefVarError: UndefVarError: diff1 not defined

In [51]:
struct Test1; w1; w2; end
Test1() = Test1(param(1), param(1))
(l::Test1)(x) = begin
    w_1 = l.w1
    w_2 = l.w2
    abc = false
    ab = (l.w1[1] + x[1])
    ba = (l.w2[1] + x[1])
    if ab >= ba
        abc = true
    end
    if abc
            return l.w1 * x[1]
    else
        return return l.w2 * x[1]
    end
end
(l::Test1)(x,y) = quadl(l(x),y)[1]

In [52]:
struct Test2; w; end
Test2() = Test2(param(2))
(l::Test2)(x) = begin
    ab = l.w + x
    abc = false
    if ab[1] >= ab[2]
        abc = true
    end
    if abc
            return l.w[1] * x[1]
    else
        return l.w[2] * x[1]
    end
end
(l::Test2)(x,y) = quadl(l(x),y)[1]

In [53]:
struct Test3; w; end
Test3() = Test3(param(2))
(l::Test3)(x) = begin
    ab = []
    for i = 1:length(x)
        ab = cat(ab, l.w[i] * x[i] ; dims = 1)
    end
    return ab
    #return l.w .* x
end
(l::Test3)(x,y) = quadl(l(x),y)[1]

In [54]:
x1 = [1;1]

2-element Array{Int64,1}:
 1
 1

In [55]:
testm3 = Test3()

Test3(P(Array{Float32,1}(2)))

In [56]:
(Knet.params(testm3)[1])

2-element Param{Array{Float32,1}}:
 0.71690845
 0.09242151

In [57]:
testm3(x1)

2-element Array{Any,1}:
 0.71690845f0
 0.09242151f0

In [58]:
testm3(x1,[-5])

32.68304f0

In [59]:
collect(flatten(Knet.progress(Knet.sgd(testm3,Knet.repeat([([1;1], [-5])],100)))))

9.09e-13  100.00%┣█████████████████████▉┫ 100/100 [00:02/00:02, 65.80i/s]


100-element Array{Float32,1}:
 32.68304     
 20.917149    
 13.386973    
  8.567663    
  5.4833045   
  3.5093148   
  2.2459614   
  1.4374155   
  0.91994554  
  0.58876485  
  0.37680963  
  0.24115798  
  0.15434118  
  ⋮           
  9.094947e-13
  9.094947e-13
  9.094947e-13
  9.094947e-13
  9.094947e-13
  9.094947e-13
  9.094947e-13
  9.094947e-13
  9.094947e-13
  9.094947e-13
  9.094947e-13
  9.094947e-13

In [60]:
lin1 = Linear(2,1)

Linear(P(Array{Float32,2}(1,2)), P(Array{Float32,1}(1)))

In [61]:
lin1([1;1],[3])

2.8994062f0

In [62]:
lin1([1;1])

1×1 Array{Float32,2}:
 1.2972357

In [63]:
lin1([1;1],[3])

2.8994062f0

In [64]:
testm1(x1, [2])

UndefVarError: UndefVarError: testm1 not defined

In [65]:
dtrn = [([1;1], [3])]

1-element Array{Tuple{Array{Int64,1},Array{Int64,1}},1}:
 ([1, 1], [3])

In [66]:
collect(flatten(Knet.progress(Knet.sgd(testm1,Knet.repeat([([1;1], [3])],100)))))

UndefVarError: UndefVarError: testm1 not defined

In [67]:
trainresults("df.jld2",testm1)

UndefVarError: UndefVarError: trainresults not defined

In [68]:
# For running experiments
function trainresults(file,model; o...)
    if (print("Train from scratch? "); readline()[1]=='y')
        takeevery(n,itr) = (x for (i,x) in enumerate(itr) if i % n == 1)
        r = ((model(dtrn), model(dtst), zeroone(model,dtrn), zeroone(model,dtst))
             for x in takeevery(length(dtrn), progress(sgd(model,repeat(dtrn,100)))))
        r = reshape(collect(Float32,flatten(r)),(4,:))
        Knet.save(file,"results",r)
        Knet.gc() # To save gpu memory
    else
        isfile(file) || download("http://people.csail.mit.edu/deniz/models/tutorial/$file",file)
        r = Knet.load(file,"results")
    end
    println(minimum(r,dims=2))
    return r
end

trainresults (generic function with 1 method)

In [69]:
@doc flatten

```
flatten(iter)
```

Given an iterator that yields iterators, return an iterator that yields the elements of those iterators. Put differently, the elements of the argument iterator are concatenated.

# Examples

```jldoctest
julia> collect(Iterators.flatten((1:2, 8:9)))
4-element Array{Int64,1}:
 1
 2
 8
 9
```


In [70]:
Knet.params(testm1)[1]

UndefVarError: UndefVarError: testm1 not defined

In [71]:
Knet.params(testm1)[2]

UndefVarError: UndefVarError: testm1 not defined

In [72]:
@doc Knet.sgd

```
minimize(func, data, optimizer=Adam(); params)
sgd     (func, data; lr=0.1,  gclip, params)
momentum(func, data; lr=0.05, gamma=0.95, gclip, params)
nesterov(func, data; lr=0.05, gamma=0.95, gclip, params)
adagrad (func, data; lr=0.05, eps=1e-6, gclip, params)
rmsprop (func, data; lr=0.01, rho=0.9, eps=1e-6, gclip, params)
adadelta(func, data; lr=1.0,  rho=0.9, eps=1e-6, gclip, params)
adam    (func, data; lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8, gclip, params)
```

Return an iterator which applies `func` to arguments in `data`, i.e.  `(func(args...) for args in data)`, and updates the parameters every iteration to minimize `func`.  `func` should return a scalar value.

The common keyword argument `params` can be used to list the `Param`s to be optimized.  If not specified, any `Param` that takes part in the computation of `func(args...)` will be updated.

The common keyword argument `gclip` can be used to implement per-parameter gradient clipping. For a parameter gradient `g`, if `norm(g) > gclip > 0`, `g` is scaled so that its norm is equal to `gclip`. If not specified no gradient clipping is performed.

These functions do not perform optimization, but return an iterator that can. Any function that produces values from an iterator can be used with such an object, e.g. `progress!(sgd(f,d))` iterates the sgd optimizer and displays a progress bar. For convenience, appending `!` to the name of the function iterates and returns `nothing`, i.e. `sgd!(...)` is equivalent to `(for x in sgd(...) end)`.

We define optimizers as lazy iterators to have explicit control over them:

  * To report progress use `progress(sgd(f,d))`.
  * To run until convergence use `converge(sgd(f,cycle(d)))`.
  * To run multiple epochs use `sgd(f,repeat(d,n))`.
  * To run a given number of iterations use `sgd(f,take(cycle(d),n))`.
  * To do a task every n iterations use `(task() for (i,j) in enumerate(sgd(f,d)) if i%n == 1)`.

These functions apply the same algorithm with the same configuration to every parameter by default. `minimize` takes an explicit optimizer argument, all others call `minimize` with an appropriate optimizer argument (see `@doc update!` for a list of possible optimizers). Before calling [`update!`](@ref) on a `Param`, `minimize` sets its `opt` field to a copy of this default optimizer if it is not already set. The `opt` field is used by the `update!` function to determine the type of update performed on that parameter.  If you need finer grained control, you can set the optimizer of an individual `Param` by setting its `opt` field before calling one of these functions. They will not override the `opt` field if it is already set, e.g. `sgd(model,data)` will perform an `Adam` update for a parameter whose `opt` field is an `Adam` object. This also means you can stop and start the training without losing optimization state, the first call will set the `opt` fields and the subsequent calls will not override them.

Given a parameter `w` and its gradient `g` here are the updates applied by each optimizer:

```
# sgd (http://en.wikipedia.org/wiki/Stochastic_gradient_descent)
w .= w - lr * g

# momentum (http://jlmelville.github.io/mize/nesterov.html)
v .= gamma * v - lr * g
w .= w + v

# nesterov (http://jlmelville.github.io/mize/nesterov.html)
w .= w - gamma * v
v .= gamma * v - lr * g
w .= w + (1 + gamma) * v

# adagrad (http://www.jmlr.org/papers/v12/duchi11a.html)
G .= G + g .^ 2
w .= w - lr * g ./ sqrt(G + eps)

# rmsprop (http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
G .= rho * G + (1-rho) * g .^ 2 
w .= w - lr * g ./ sqrt(G + eps)

# adadelta (http://arxiv.org/abs/1212.5701)
G .= rho * G + (1-rho) * g .^ 2
update = sqrt(delta + eps) .* g ./ sqrt(G + eps)
w = w - lr * update
delta = rho * delta + (1-rho) * update .^ 2

# adam (http://arxiv.org/abs/1412.6980)
v = beta1 * v + (1 - beta1) * g
G = beta2 * G + (1 - beta2) * g .^ 2
vhat = v ./ (1 - beta1 ^ t)
Ghat = G ./ (1 - beta2 ^ t)
w = w - (lr / (sqrt(Ghat) + eps)) * vhat
```


In [73]:
@doc RNN

```
rnn = RNN(inputSize, hiddenSize; opts...)
rnn(x; batchSizes) => y
rnn.h, rnn.c  # hidden and cell states
```

`RNN` returns a callable RNN object `rnn`. Given a minibatch of sequences `x`, `rnn(x)` returns `y`, the hidden states of the final layer for each time step. `rnn.h` and `rnn.c` fields can be used to set the initial hidden states and read the final hidden states of all layers.  Note that the final time step of `y` always contains the final hidden state of the last layer, equivalent to `rnn.h` for a single layer network.

**Dimensions:** The input `x` can be 1, 2, or 3 dimensional and `y` will have the same number of dimensions as `x`. size(x)=(X,[B,T]) and size(y)=(H/2H,[B,T]) where X is inputSize, B is batchSize, T is seqLength, H is hiddenSize, 2H is for bidirectional RNNs. By default a 1-D `x` represents a single instance for a single time step, a 2-D `x` represents a single minibatch for a single time step, and a 3-D `x` represents a sequence of identically sized minibatches for multiple time steps. The output `y` gives the hidden state (of the final layer for multi-layer RNNs) for each time step. The fields `rnn.h` and `rnn.c` represent the hidden states of all layers in a single time step and have size (H,B,L/2L) where L is numLayers and 2L is for bidirectional RNNs.

**batchSizes:** If `batchSizes=nothing` (default), all sequences in a minibatch are assumed to be the same length. If `batchSizes` is an array of (non-increasing) integers, it gives us the batch size for each time step (allowing different sequences in the minibatch to have different lengths). In this case `x` will typically be 2-D with the second dimension representing variable size batches for time steps. If `batchSizes` is used, `sum(batchSizes)` should equal `length(x) ÷ size(x,1)`. When the batch size is different in every time step, hidden states will have size (H,B,L/2L) where B is always the size of the first (largest) minibatch.

**Hidden states:** The hidden and cell states are kept in `rnn.h` and `rnn.c` fields (the cell state is only used by LSTM). They can be initialized during construction using the `h` and `c` keyword arguments, or modified later by direct assignment. Valid values are `nothing` (default), `0`, or an array of the right type and size possibly wrapped in a `Param`. If the value is `nothing` the initial state is assumed to be zero and the final state is discarded keeping the value `nothing`. If the value is `0` the initial state is assumed to be zero and `0` is replaced by the final state on return. If the value is a valid state, it is used as the initial state and is replaced by the final state on return.

In a differentiation context the returned final hidden states will be wrapped in `Result` types. This is necessary if the same RNN object is to be called multiple times in a single iteration. Between iterations (i.e. after diff/update) the hidden states need to be unboxed with e.g. `rnn.h = value(rnn.h)` to prevent spurious dependencies. This happens automatically during the backward pass for GPU RNNs but needs to be done manually for CPU RNNs. See the [CharLM Tutorial](https://github.com/denizyuret/Knet.jl/blob/master/tutorial/80.charlm.ipynb) for an example.

**Keyword arguments for RNN:**

  * `h=nothing`: Initial hidden state.
  * `c=nothing`: Initial cell state.
  * `rnnType=:lstm` Type of RNN: One of :relu, :tanh, :lstm, :gru.
  * `numLayers=1`: Number of RNN layers.
  * `bidirectional=false`: Create a bidirectional RNN if `true`.
  * `dropout=0`: Dropout probability. Applied to input and between layers.
  * `skipInput=false`: Do not multiply the input with a matrix if `true`.
  * `dataType=Float32`: Data type to use for weights.
  * `algo=0`: Algorithm to use, see CUDNN docs for details.
  * `seed=0`: Random number seed for dropout. Uses `time()` if 0.
  * `winit=xavier`: Weight initialization method for matrices.
  * `binit=zeros`: Weight initialization method for bias vectors.
  * `usegpu=(gpu()>=0)`: GPU used by default if one exists.

**Formulas:** RNNs compute the output h[t] for a given iteration from the recurrent input h[t-1] and the previous layer input x[t] given matrices W, R and biases bW, bR from the following equations:

`:relu` and `:tanh`: Single gate RNN with activation function f:

```
h[t] = f(W * x[t] .+ R * h[t-1] .+ bW .+ bR)
```

`:gru`: Gated recurrent unit:

```
i[t] = sigm(Wi * x[t] .+ Ri * h[t-1] .+ bWi .+ bRi) # input gate
r[t] = sigm(Wr * x[t] .+ Rr * h[t-1] .+ bWr .+ bRr) # reset gate
n[t] = tanh(Wn * x[t] .+ r[t] .* (Rn * h[t-1] .+ bRn) .+ bWn) # new gate
h[t] = (1 - i[t]) .* n[t] .+ i[t] .* h[t-1]
```

`:lstm`: Long short term memory unit with no peephole connections:

```
i[t] = sigm(Wi * x[t] .+ Ri * h[t-1] .+ bWi .+ bRi) # input gate
f[t] = sigm(Wf * x[t] .+ Rf * h[t-1] .+ bWf .+ bRf) # forget gate
o[t] = sigm(Wo * x[t] .+ Ro * h[t-1] .+ bWo .+ bRo) # output gate
n[t] = tanh(Wn * x[t] .+ Rn * h[t-1] .+ bWn .+ bRn) # new gate
c[t] = f[t] .* c[t-1] .+ i[t] .* n[t]               # cell output
h[t] = o[t] .* tanh(c[t])
```


In [74]:
function numinit(dim1)
    return 1:dim1
end

numinit (generic function with 1 method)

In [75]:
Knet.update!(model2.layers[2].w ,model2.layers[2].w)

UndefVarError: UndefVarError: model2 not defined

In [76]:
model2.layers[2].w.value = reshape([100 200 300 400],1,4)

UndefVarError: UndefVarError: model2 not defined

In [77]:
@doc Knet.param

```
param(array; atype)
param(dims...; init, atype)
param0(dims...; atype)
```

The first form returns `Param(atype(array))` where `atype=identity` is the default.

The second form Returns a randomly initialized `Param(atype(init(dims...)))`. By default, `init` is `xavier` and `atype` is `KnetArray{Float32}` if `gpu() >= 0`, `Array{Float32}` otherwise. 

The third form `param0` is an alias for `param(dims...; init=zeros)`.


In [78]:
model2.layers[2].w


UndefVarError: UndefVarError: model2 not defined

In [79]:
rnn2 = RNN(1, 2; bidirectional=true, rnnType = :lstm)
lin2 = Linear(4,1)
mlp2 = MLP(10, 5, 2)
model2 = Chain(rnn2, lin2)

UndefVarError: UndefVarError: MLP not defined

In [80]:
rnn2(reshape([3], 1,1,1))

4×1×1 Array{Float32,3}:
[:, :, 1] =
 0.08195159
 0.22535662
 0.06647501
 0.2062958 

In [81]:
x = []
push!(x, )

0-element Array{Any,1}

In [82]:
fieldnames(RNN)

(:w, :h, :c, :inputSize, :hiddenSize, :numLayers, :dropout, :seed, :inputMode, :direction, :mode, :algo, :dataType, :rnnDesc, :dropoutDesc, :dx, :dhx, :dcx)

In [83]:
size(rx)

UndefVarError: UndefVarError: rx not defined

In [84]:
@doc Knet.xavier

```
xavier(a...)
```

Xavier initialization returns uniform random weights in the range `±sqrt(2 / (fanin + fanout))`.  The `a` arguments are passed to `rand`.  See ([Glorot and Bengio 2010](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf)) for a description. [Caffe](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1XavierFiller.html#details) implements this slightly differently. [Lasagne](http://lasagne.readthedocs.org/en/latest/modules/init.html#lasagne.init.GlorotUniform) calls it `GlorotUniform`.


In [85]:
size(Knet.params(rnn2)[1].value)

(1, 1, 80)

In [86]:
Knet.params(rnn2)[1].value = Knet.params(rnn2)[1].value .+ Knet.xavier(size(Knet.params(rnn2)[1].value)...)

1×1×80 Array{Float64,3}:
[:, :, 1] =
 -0.5938544985116351

[:, :, 2] =
 0.6291218518797346

[:, :, 3] =
 -0.7178674044902428

...

[:, :, 78] =
 0.06665347752109804

[:, :, 79] =
 0.018619557828455063

[:, :, 80] =
 0.100441816190206

In [87]:
rx = cat([1 2 3],[4 5 6],dims=3)

1×3×2 Array{Int64,3}:
[:, :, 1] =
 1  2  3

[:, :, 2] =
 4  5  6

In [88]:
rx1 = cat([1 2 3], [4 5 0], dims=3)

1×3×2 Array{Int64,3}:
[:, :, 1] =
 1  2  3

[:, :, 2] =
 4  5  0

In [89]:
rx2 = cat([1 2], [4 5], dims=3)

1×2×2 Array{Int64,3}:
[:, :, 1] =
 1  2

[:, :, 2] =
 4  5

In [90]:
rnn2(rx1)

4×3×2 Array{Float32,3}:
[:, :, 1] =
 0.104257   0.0880124  0.057699 
 0.0258949  0.0779188  0.14792  
 0.193138   0.143937   0.0555855
 0.24144    0.377849   0.372883 

[:, :, 2] =
 0.0416581  0.0233458  0.0569415  
 0.219739   0.307609   0.0497138  
 0.0368182  0.0218744  0.000652089
 0.476165   0.558708   0.0196274  

In [91]:
rx = cat([1 2 3],[4 5 6],dims=3)

1×3×2 Array{Int64,3}:
[:, :, 1] =
 1  2  3

[:, :, 2] =
 4  5  6

In [92]:
rnn2(reshape([3],1,1,1))

4×1×1 Array{Float32,3}:
[:, :, 1] =
 0.05769895
 0.14792015
 0.05549529
 0.36407632

In [93]:
rnn2(reshape([3 0],1,1,2))

4×1×2 Array{Float32,3}:
[:, :, 1] =
 0.05769895 
 0.14792015 
 0.055585492
 0.3728827  

[:, :, 2] =
 0.056941517 
 0.049713776 
 0.0006520893
 0.019627368 

In [94]:
rnn2(rx)

4×3×2 Array{Float32,3}:
[:, :, 1] =
 0.104257   0.0880124  0.057699 
 0.0258949  0.0779188  0.14792  
 0.193138   0.143937   0.0877288
 0.24144    0.377849   0.495215 

[:, :, 2] =
 0.0416581  0.0233458  0.0126705
 0.219739   0.307609   0.391991 
 0.0368182  0.0218744  0.0122218
 0.476165   0.558708   0.616834 

In [95]:
rnn2(rx, batchSizes = [3 3])

ErrorException: Implementation of batchSizes is not completed in CPU

In [623]:
@doc Knet.param

```
param(array; atype)
param(dims...; init, atype)
param0(dims...; atype)
```

The first form returns `Param(atype(array))` where `atype=identity` is the default.

The second form Returns a randomly initialized `Param(atype(init(dims...)))`. By default, `init` is `xavier` and `atype` is `KnetArray{Float32}` if `gpu() >= 0`, `Array{Float32}` otherwise. 

The third form `param0` is an alias for `param(dims...; init=zeros)`.


In [96]:
[x for x in rnn2.w if x<0]

37-element Array{Float32,1}:
 -0.5938545  
 -0.71786743 
 -0.8065058  
 -0.2653079  
 -0.28603992 
 -0.5680328  
 -0.39744723 
 -0.10050218 
 -0.19192412 
 -0.3145375  
 -0.012506866
 -0.28059787 
 -0.6763631  
  ⋮          
 -0.06324435 
 -0.11797717 
 -0.14855406 
 -0.13586521 
 -0.035933476
 -0.07106355 
 -0.13269262 
 -0.012461339
 -0.12050926 
 -0.010981156
 -0.11861693 
 -0.06984177 

In [97]:
rnn2(rx)

4×3×2 Array{Float32,3}:
[:, :, 1] =
 0.104257   0.0880124  0.057699 
 0.0258949  0.0779188  0.14792  
 0.193138   0.143937   0.0877288
 0.24144    0.377849   0.495215 

[:, :, 2] =
 0.0416581  0.0233458  0.0126705
 0.219739   0.307609   0.391991 
 0.0368182  0.0218744  0.0122218
 0.476165   0.558708   0.616834 

In [98]:
@doc RNN

```
rnn = RNN(inputSize, hiddenSize; opts...)
rnn(x; batchSizes) => y
rnn.h, rnn.c  # hidden and cell states
```

`RNN` returns a callable RNN object `rnn`. Given a minibatch of sequences `x`, `rnn(x)` returns `y`, the hidden states of the final layer for each time step. `rnn.h` and `rnn.c` fields can be used to set the initial hidden states and read the final hidden states of all layers.  Note that the final time step of `y` always contains the final hidden state of the last layer, equivalent to `rnn.h` for a single layer network.

**Dimensions:** The input `x` can be 1, 2, or 3 dimensional and `y` will have the same number of dimensions as `x`. size(x)=(X,[B,T]) and size(y)=(H/2H,[B,T]) where X is inputSize, B is batchSize, T is seqLength, H is hiddenSize, 2H is for bidirectional RNNs. By default a 1-D `x` represents a single instance for a single time step, a 2-D `x` represents a single minibatch for a single time step, and a 3-D `x` represents a sequence of identically sized minibatches for multiple time steps. The output `y` gives the hidden state (of the final layer for multi-layer RNNs) for each time step. The fields `rnn.h` and `rnn.c` represent the hidden states of all layers in a single time step and have size (H,B,L/2L) where L is numLayers and 2L is for bidirectional RNNs.

**batchSizes:** If `batchSizes=nothing` (default), all sequences in a minibatch are assumed to be the same length. If `batchSizes` is an array of (non-increasing) integers, it gives us the batch size for each time step (allowing different sequences in the minibatch to have different lengths). In this case `x` will typically be 2-D with the second dimension representing variable size batches for time steps. If `batchSizes` is used, `sum(batchSizes)` should equal `length(x) ÷ size(x,1)`. When the batch size is different in every time step, hidden states will have size (H,B,L/2L) where B is always the size of the first (largest) minibatch.

**Hidden states:** The hidden and cell states are kept in `rnn.h` and `rnn.c` fields (the cell state is only used by LSTM). They can be initialized during construction using the `h` and `c` keyword arguments, or modified later by direct assignment. Valid values are `nothing` (default), `0`, or an array of the right type and size possibly wrapped in a `Param`. If the value is `nothing` the initial state is assumed to be zero and the final state is discarded keeping the value `nothing`. If the value is `0` the initial state is assumed to be zero and `0` is replaced by the final state on return. If the value is a valid state, it is used as the initial state and is replaced by the final state on return.

In a differentiation context the returned final hidden states will be wrapped in `Result` types. This is necessary if the same RNN object is to be called multiple times in a single iteration. Between iterations (i.e. after diff/update) the hidden states need to be unboxed with e.g. `rnn.h = value(rnn.h)` to prevent spurious dependencies. This happens automatically during the backward pass for GPU RNNs but needs to be done manually for CPU RNNs. See the [CharLM Tutorial](https://github.com/denizyuret/Knet.jl/blob/master/tutorial/80.charlm.ipynb) for an example.

**Keyword arguments for RNN:**

  * `h=nothing`: Initial hidden state.
  * `c=nothing`: Initial cell state.
  * `rnnType=:lstm` Type of RNN: One of :relu, :tanh, :lstm, :gru.
  * `numLayers=1`: Number of RNN layers.
  * `bidirectional=false`: Create a bidirectional RNN if `true`.
  * `dropout=0`: Dropout probability. Applied to input and between layers.
  * `skipInput=false`: Do not multiply the input with a matrix if `true`.
  * `dataType=Float32`: Data type to use for weights.
  * `algo=0`: Algorithm to use, see CUDNN docs for details.
  * `seed=0`: Random number seed for dropout. Uses `time()` if 0.
  * `winit=xavier`: Weight initialization method for matrices.
  * `binit=zeros`: Weight initialization method for bias vectors.
  * `usegpu=(gpu()>=0)`: GPU used by default if one exists.

**Formulas:** RNNs compute the output h[t] for a given iteration from the recurrent input h[t-1] and the previous layer input x[t] given matrices W, R and biases bW, bR from the following equations:

`:relu` and `:tanh`: Single gate RNN with activation function f:

```
h[t] = f(W * x[t] .+ R * h[t-1] .+ bW .+ bR)
```

`:gru`: Gated recurrent unit:

```
i[t] = sigm(Wi * x[t] .+ Ri * h[t-1] .+ bWi .+ bRi) # input gate
r[t] = sigm(Wr * x[t] .+ Rr * h[t-1] .+ bWr .+ bRr) # reset gate
n[t] = tanh(Wn * x[t] .+ r[t] .* (Rn * h[t-1] .+ bRn) .+ bWn) # new gate
h[t] = (1 - i[t]) .* n[t] .+ i[t] .* h[t-1]
```

`:lstm`: Long short term memory unit with no peephole connections:

```
i[t] = sigm(Wi * x[t] .+ Ri * h[t-1] .+ bWi .+ bRi) # input gate
f[t] = sigm(Wf * x[t] .+ Rf * h[t-1] .+ bWf .+ bRf) # forget gate
o[t] = sigm(Wo * x[t] .+ Ro * h[t-1] .+ bWo .+ bRo) # output gate
n[t] = tanh(Wn * x[t] .+ Rn * h[t-1] .+ bWn .+ bRn) # new gate
c[t] = f[t] .* c[t-1] .+ i[t] .* n[t]               # cell output
h[t] = o[t] .* tanh(c[t])
```


In [99]:
@doc LinearAlgebra.axpy!

```
axpy!(a, X, Y)
```

Overwrite `Y` with `a*X + Y`, where `a` is a scalar. Return `Y`.

# Examples

```jldoctest
julia> x = [1; 2; 3];

julia> y = [4; 5; 6];

julia> BLAS.axpy!(2, x, y)
3-element Array{Int64,1}:
  6
  9
 12
```


In [100]:
out = model2(reshape([-1 6 -1 5 1 0], 1,2,3))

UndefVarError: UndefVarError: model2 not defined

In [101]:
reshape(out, 1,2,3)

UndefVarError: UndefVarError: out not defined

In [102]:
model3 = Linear(2,1)

Linear(P(Array{Float32,2}(1,2)), P(Array{Float32,1}(1)))

In [103]:
Knet.params(lin2)[1]

1×4 Param{Array{Float32,2}}:
 0.271739  -0.546023  -0.0261645  -0.560696

In [104]:
asd = Knet.param(5, init= numinit)

5-element Param{Array{Float32,1}}:
 1.0
 2.0
 3.0
 4.0
 5.0

In [105]:
input2 = [2 3]
input2 = cat(input2, [4 5], dims=3)
input2 = cat(input2, [6 7], dims=3)

1×2×3 Array{Int64,3}:
[:, :, 1] =
 2  3

[:, :, 2] =
 4  5

[:, :, 3] =
 6  7

In [106]:
model2(input2)

UndefVarError: UndefVarError: model2 not defined

In [107]:
reshape()

MethodError: MethodError: no method matching reshape()
Closest candidates are:
  reshape(!Matched::Array{T,M}, !Matched::Tuple{Vararg{Int64,N}}) where {T, N, M} at reshapedarray.jl:41
  reshape(!Matched::AbstractArray, !Matched::Int64...) at reshapedarray.jl:115
  reshape(!Matched::AbstractArray, !Matched::Union{Int64, AbstractUnitRange}...) at reshapedarray.jl:110
  ...

In [108]:
@doc Knet.mat

```
mat(x; dims = ndims(x) - 1)
```

Reshape `x` into a two-dimensional matrix by joining the first dims dimensions, i.e.  `reshape(x, prod(size(x,i) for i in 1:dims), :)`

`dims=ndims(x)-1` (default) is typically used when turning the output of a 4-D convolution result into a 2-D input for a fully connected layer.

`dims=1` is typically used when turning the 3-D output of an RNN layer into a 2-D input for a fully connected layer.

`dims=0` will turn the input into a row vector, `dims=ndims(x)` will turn it into a column vector.


In [109]:
@doc prod

```
prod(f, itr)
```

Returns the product of `f` applied to each element of `itr`.

The return type is `Int` for signed integers of less than system word size, and `UInt` for unsigned integers of less than system word size.  For all other arguments, a common return type is found to which all arguments are promoted.

# Examples

```jldoctest
julia> prod(abs2, [2; 3; 4])
576
```

```
prod(itr)
```

Returns the product of all elements of a collection.

The return type is `Int` for signed integers of less than system word size, and `UInt` for unsigned integers of less than system word size.  For all other arguments, a common return type is found to which all arguments are promoted.

# Examples

```jldoctest
julia> prod(1:20)
2432902008176640000
```

```
prod(A::AbstractArray; dims)
```

Multiply elements of an array over the given dimensions.

# Examples

```jldoctest
julia> A = [1 2; 3 4]
2×2 Array{Int64,2}:
 1  2
 3  4

julia> prod(A, dims=1)
1×2 Array{Int64,2}:
 3  8

julia> prod(A, dims=2)
2×1 Array{Int64,2}:
  2
 12
```


In [110]:
a_lin = Linear(3,4)

Linear(P(Array{Float32,2}(4,3)), P(Array{Float32,1}(4)))

In [111]:
Knet.params(a_lin)

2-element Array{Param,1}:
 P(Array{Float32,2}(4,3))
 P(Array{Float32,1}(4))  

In [112]:
struct filtter; end
(f::filtter)(x) = x[1]

In [113]:
@doc Knet.RNN

```
rnn = RNN(inputSize, hiddenSize; opts...)
rnn(x; batchSizes) => y
rnn.h, rnn.c  # hidden and cell states
```

`RNN` returns a callable RNN object `rnn`. Given a minibatch of sequences `x`, `rnn(x)` returns `y`, the hidden states of the final layer for each time step. `rnn.h` and `rnn.c` fields can be used to set the initial hidden states and read the final hidden states of all layers.  Note that the final time step of `y` always contains the final hidden state of the last layer, equivalent to `rnn.h` for a single layer network.

**Dimensions:** The input `x` can be 1, 2, or 3 dimensional and `y` will have the same number of dimensions as `x`. size(x)=(X,[B,T]) and size(y)=(H/2H,[B,T]) where X is inputSize, B is batchSize, T is seqLength, H is hiddenSize, 2H is for bidirectional RNNs. By default a 1-D `x` represents a single instance for a single time step, a 2-D `x` represents a single minibatch for a single time step, and a 3-D `x` represents a sequence of identically sized minibatches for multiple time steps. The output `y` gives the hidden state (of the final layer for multi-layer RNNs) for each time step. The fields `rnn.h` and `rnn.c` represent the hidden states of all layers in a single time step and have size (H,B,L/2L) where L is numLayers and 2L is for bidirectional RNNs.

**batchSizes:** If `batchSizes=nothing` (default), all sequences in a minibatch are assumed to be the same length. If `batchSizes` is an array of (non-increasing) integers, it gives us the batch size for each time step (allowing different sequences in the minibatch to have different lengths). In this case `x` will typically be 2-D with the second dimension representing variable size batches for time steps. If `batchSizes` is used, `sum(batchSizes)` should equal `length(x) ÷ size(x,1)`. When the batch size is different in every time step, hidden states will have size (H,B,L/2L) where B is always the size of the first (largest) minibatch.

**Hidden states:** The hidden and cell states are kept in `rnn.h` and `rnn.c` fields (the cell state is only used by LSTM). They can be initialized during construction using the `h` and `c` keyword arguments, or modified later by direct assignment. Valid values are `nothing` (default), `0`, or an array of the right type and size possibly wrapped in a `Param`. If the value is `nothing` the initial state is assumed to be zero and the final state is discarded keeping the value `nothing`. If the value is `0` the initial state is assumed to be zero and `0` is replaced by the final state on return. If the value is a valid state, it is used as the initial state and is replaced by the final state on return.

In a differentiation context the returned final hidden states will be wrapped in `Result` types. This is necessary if the same RNN object is to be called multiple times in a single iteration. Between iterations (i.e. after diff/update) the hidden states need to be unboxed with e.g. `rnn.h = value(rnn.h)` to prevent spurious dependencies. This happens automatically during the backward pass for GPU RNNs but needs to be done manually for CPU RNNs. See the [CharLM Tutorial](https://github.com/denizyuret/Knet.jl/blob/master/tutorial/80.charlm.ipynb) for an example.

**Keyword arguments for RNN:**

  * `h=nothing`: Initial hidden state.
  * `c=nothing`: Initial cell state.
  * `rnnType=:lstm` Type of RNN: One of :relu, :tanh, :lstm, :gru.
  * `numLayers=1`: Number of RNN layers.
  * `bidirectional=false`: Create a bidirectional RNN if `true`.
  * `dropout=0`: Dropout probability. Applied to input and between layers.
  * `skipInput=false`: Do not multiply the input with a matrix if `true`.
  * `dataType=Float32`: Data type to use for weights.
  * `algo=0`: Algorithm to use, see CUDNN docs for details.
  * `seed=0`: Random number seed for dropout. Uses `time()` if 0.
  * `winit=xavier`: Weight initialization method for matrices.
  * `binit=zeros`: Weight initialization method for bias vectors.
  * `usegpu=(gpu()>=0)`: GPU used by default if one exists.

**Formulas:** RNNs compute the output h[t] for a given iteration from the recurrent input h[t-1] and the previous layer input x[t] given matrices W, R and biases bW, bR from the following equations:

`:relu` and `:tanh`: Single gate RNN with activation function f:

```
h[t] = f(W * x[t] .+ R * h[t-1] .+ bW .+ bR)
```

`:gru`: Gated recurrent unit:

```
i[t] = sigm(Wi * x[t] .+ Ri * h[t-1] .+ bWi .+ bRi) # input gate
r[t] = sigm(Wr * x[t] .+ Rr * h[t-1] .+ bWr .+ bRr) # reset gate
n[t] = tanh(Wn * x[t] .+ r[t] .* (Rn * h[t-1] .+ bRn) .+ bWn) # new gate
h[t] = (1 - i[t]) .* n[t] .+ i[t] .* h[t-1]
```

`:lstm`: Long short term memory unit with no peephole connections:

```
i[t] = sigm(Wi * x[t] .+ Ri * h[t-1] .+ bWi .+ bRi) # input gate
f[t] = sigm(Wf * x[t] .+ Rf * h[t-1] .+ bWf .+ bRf) # forget gate
o[t] = sigm(Wo * x[t] .+ Ro * h[t-1] .+ bWo .+ bRo) # output gate
n[t] = tanh(Wn * x[t] .+ Rn * h[t-1] .+ bWn .+ bRn) # new gate
c[t] = f[t] .* c[t-1] .+ i[t] .* n[t]               # cell output
h[t] = o[t] .* tanh(c[t])
```


In [114]:
@doc Knet.mat

```
mat(x; dims = ndims(x) - 1)
```

Reshape `x` into a two-dimensional matrix by joining the first dims dimensions, i.e.  `reshape(x, prod(size(x,i) for i in 1:dims), :)`

`dims=ndims(x)-1` (default) is typically used when turning the output of a 4-D convolution result into a 2-D input for a fully connected layer.

`dims=1` is typically used when turning the 3-D output of an RNN layer into a 2-D input for a fully connected layer.

`dims=0` will turn the input into a row vector, `dims=ndims(x)` will turn it into a column vector.


In [115]:
@doc Knet.prod

```
prod(f, itr)
```

Returns the product of `f` applied to each element of `itr`.

The return type is `Int` for signed integers of less than system word size, and `UInt` for unsigned integers of less than system word size.  For all other arguments, a common return type is found to which all arguments are promoted.

# Examples

```jldoctest
julia> prod(abs2, [2; 3; 4])
576
```

```
prod(itr)
```

Returns the product of all elements of a collection.

The return type is `Int` for signed integers of less than system word size, and `UInt` for unsigned integers of less than system word size.  For all other arguments, a common return type is found to which all arguments are promoted.

# Examples

```jldoctest
julia> prod(1:20)
2432902008176640000
```

```
prod(A::AbstractArray; dims)
```

Multiply elements of an array over the given dimensions.

# Examples

```jldoctest
julia> A = [1 2; 3 4]
2×2 Array{Int64,2}:
 1  2
 3  4

julia> prod(A, dims=1)
1×2 Array{Int64,2}:
 3  8

julia> prod(A, dims=2)
2×1 Array{Int64,2}:
  2
 12
```


In [728]:
struct ABC; a; end;
(ab::ABC)(x) = 3
(ab::ABC)(x,y) = 4
(ab::ABC)(x;dat=:hop) = 5

In [730]:
ab1 = ABC(7)

ABC(7)

In [731]:
ab1(3)

5

In [733]:
struct Chain
    layers
    Chain(layers...) = new(layers)
end
(c::Chain)(x) = (for l in c.layers; x = l(x); end; x)
(c::Chain)(x,y) = nll(c(x),y)
(c::Chain)(d::Data) = mean(c(x,y) for (x,y) in d)
(c::Chain)(d,a,b) = mean(c(x,y) for (x,y) in d)

In [117]:
#struct MLP; w1; b1; f1; w2; b2; f2; end
#MLP()
#(m::MLP)(x) = Chain(Dense(m.w1, m.b1, m.f1), Dense(m.w1, m.b1, m.f2)

In [118]:
MLP(i::Int, h::Int, o::Int) = Chain(Dense(i,h, Knet.relu), Dense(h,o,Knet.relu))

MLP (generic function with 1 method)

In [119]:
ab1 = Knet.param([3 2; 1 2])

2×2 Param{Array{Int64,2}}:
 3  2
 1  2

In [120]:
mlp_ex1 = MLP(3,2,2)

Chain((Dense(P(Array{Float32,2}(2,3)), P(Array{Float32,1}(2)), Knet.relu), Dense(P(Array{Float32,2}(2,2)), P(Array{Float32,1}(2)), Knet.relu)))

In [121]:
lin_ex1 = Linear(1,2)

Linear(P(Array{Float32,2}(2,1)), P(Array{Float32,1}(2)))

In [122]:
Knet.params(lin_ex1)[1]

2×1 Param{Array{Float32,2}}:
 0.48963422
 0.5708092 

In [123]:
lin_ex1([300])

2×1 Array{Float32,2}:
 146.89026
 171.24275

In [124]:
abc = zeros(2,1,2,2,1)

2×1×2×2×1 Array{Float64,5}:
[:, :, 1, 1, 1] =
 0.0
 0.0

[:, :, 2, 1, 1] =
 0.0
 0.0

[:, :, 1, 2, 1] =
 0.0
 0.0

[:, :, 2, 2, 1] =
 0.0
 0.0

In [125]:
dimlen(x) = length(size(x)); #ndims does this nevermind

In [126]:
ndims(abc)

5

In [127]:
matmul(x,y) = reshapex*y

matmul (generic function with 1 method)

In [128]:
matmatmul(a,b,dim1, dim2)

UndefVarError: UndefVarError: matmatmul not defined

In [129]:
a = reshape(Vector(1:8),(2,2,2))

2×2×2 Array{Int64,3}:
[:, :, 1] =
 1  3
 2  4

[:, :, 2] =
 5  7
 6  8

In [130]:
b = reshape(Vector(1:8),(2,2,2))

2×2×2 Array{Int64,3}:
[:, :, 1] =
 1  3
 2  4

[:, :, 2] =
 5  7
 6  8

In [131]:
mapslices(sum, (a,b), dims = [3])

MethodError: MethodError: no method matching mapslices(::typeof(sum), ::Tuple{Array{Int64,3},Array{Int64,3}}; dims=[3])
Closest candidates are:
  mapslices(::Any, !Matched::AbstractArray; dims) at abstractarray.jl:1919

In [132]:
mapslices(typeof, a, dims = [1])

1×2×2 Array{DataType,3}:
[:, :, 1] =
 Array{Int64,1}  Array{Int64,1}

[:, :, 2] =
 Array{Int64,1}  Array{Int64,1}

In [133]:
A = reshape(Vector(1:8), (2,2,2))

2×2×2 Array{Int64,3}:
[:, :, 1] =
 1  3
 2  4

[:, :, 2] =
 5  7
 6  8

In [134]:
permutedims(A, [3, 1, 2])

2×2×2 Array{Int64,3}:
[:, :, 1] =
 1  2
 5  6

[:, :, 2] =
 3  4
 7  8

In [135]:
S[1]

UndefVarError: UndefVarError: S not defined

In [136]:
S = []

0-element Array{Any,1}

In [137]:
push!(S, A)

1-element Array{Any,1}:
 [1 3; 2 4]

[5 7; 6 8]

In [138]:
@doc RNN

```
rnn = RNN(inputSize, hiddenSize; opts...)
rnn(x; batchSizes) => y
rnn.h, rnn.c  # hidden and cell states
```

`RNN` returns a callable RNN object `rnn`. Given a minibatch of sequences `x`, `rnn(x)` returns `y`, the hidden states of the final layer for each time step. `rnn.h` and `rnn.c` fields can be used to set the initial hidden states and read the final hidden states of all layers.  Note that the final time step of `y` always contains the final hidden state of the last layer, equivalent to `rnn.h` for a single layer network.

**Dimensions:** The input `x` can be 1, 2, or 3 dimensional and `y` will have the same number of dimensions as `x`. size(x)=(X,[B,T]) and size(y)=(H/2H,[B,T]) where X is inputSize, B is batchSize, T is seqLength, H is hiddenSize, 2H is for bidirectional RNNs. By default a 1-D `x` represents a single instance for a single time step, a 2-D `x` represents a single minibatch for a single time step, and a 3-D `x` represents a sequence of identically sized minibatches for multiple time steps. The output `y` gives the hidden state (of the final layer for multi-layer RNNs) for each time step. The fields `rnn.h` and `rnn.c` represent the hidden states of all layers in a single time step and have size (H,B,L/2L) where L is numLayers and 2L is for bidirectional RNNs.

**batchSizes:** If `batchSizes=nothing` (default), all sequences in a minibatch are assumed to be the same length. If `batchSizes` is an array of (non-increasing) integers, it gives us the batch size for each time step (allowing different sequences in the minibatch to have different lengths). In this case `x` will typically be 2-D with the second dimension representing variable size batches for time steps. If `batchSizes` is used, `sum(batchSizes)` should equal `length(x) ÷ size(x,1)`. When the batch size is different in every time step, hidden states will have size (H,B,L/2L) where B is always the size of the first (largest) minibatch.

**Hidden states:** The hidden and cell states are kept in `rnn.h` and `rnn.c` fields (the cell state is only used by LSTM). They can be initialized during construction using the `h` and `c` keyword arguments, or modified later by direct assignment. Valid values are `nothing` (default), `0`, or an array of the right type and size possibly wrapped in a `Param`. If the value is `nothing` the initial state is assumed to be zero and the final state is discarded keeping the value `nothing`. If the value is `0` the initial state is assumed to be zero and `0` is replaced by the final state on return. If the value is a valid state, it is used as the initial state and is replaced by the final state on return.

In a differentiation context the returned final hidden states will be wrapped in `Result` types. This is necessary if the same RNN object is to be called multiple times in a single iteration. Between iterations (i.e. after diff/update) the hidden states need to be unboxed with e.g. `rnn.h = value(rnn.h)` to prevent spurious dependencies. This happens automatically during the backward pass for GPU RNNs but needs to be done manually for CPU RNNs. See the [CharLM Tutorial](https://github.com/denizyuret/Knet.jl/blob/master/tutorial/80.charlm.ipynb) for an example.

**Keyword arguments for RNN:**

  * `h=nothing`: Initial hidden state.
  * `c=nothing`: Initial cell state.
  * `rnnType=:lstm` Type of RNN: One of :relu, :tanh, :lstm, :gru.
  * `numLayers=1`: Number of RNN layers.
  * `bidirectional=false`: Create a bidirectional RNN if `true`.
  * `dropout=0`: Dropout probability. Applied to input and between layers.
  * `skipInput=false`: Do not multiply the input with a matrix if `true`.
  * `dataType=Float32`: Data type to use for weights.
  * `algo=0`: Algorithm to use, see CUDNN docs for details.
  * `seed=0`: Random number seed for dropout. Uses `time()` if 0.
  * `winit=xavier`: Weight initialization method for matrices.
  * `binit=zeros`: Weight initialization method for bias vectors.
  * `usegpu=(gpu()>=0)`: GPU used by default if one exists.

**Formulas:** RNNs compute the output h[t] for a given iteration from the recurrent input h[t-1] and the previous layer input x[t] given matrices W, R and biases bW, bR from the following equations:

`:relu` and `:tanh`: Single gate RNN with activation function f:

```
h[t] = f(W * x[t] .+ R * h[t-1] .+ bW .+ bR)
```

`:gru`: Gated recurrent unit:

```
i[t] = sigm(Wi * x[t] .+ Ri * h[t-1] .+ bWi .+ bRi) # input gate
r[t] = sigm(Wr * x[t] .+ Rr * h[t-1] .+ bWr .+ bRr) # reset gate
n[t] = tanh(Wn * x[t] .+ r[t] .* (Rn * h[t-1] .+ bRn) .+ bWn) # new gate
h[t] = (1 - i[t]) .* n[t] .+ i[t] .* h[t-1]
```

`:lstm`: Long short term memory unit with no peephole connections:

```
i[t] = sigm(Wi * x[t] .+ Ri * h[t-1] .+ bWi .+ bRi) # input gate
f[t] = sigm(Wf * x[t] .+ Rf * h[t-1] .+ bWf .+ bRf) # forget gate
o[t] = sigm(Wo * x[t] .+ Ro * h[t-1] .+ bWo .+ bRo) # output gate
n[t] = tanh(Wn * x[t] .+ Rn * h[t-1] .+ bWn .+ bRn) # new gate
c[t] = f[t] .* c[t-1] .+ i[t] .* n[t]               # cell output
h[t] = o[t] .* tanh(c[t])
```


In [139]:
struct BatchIter; mat; end;
function Base.iterate(bmat::BatchIter, s...)
    s === nothing && return bmat.mat[]
    
end

In [633]:
struct Biaff; mlphead; mlpdep; w; b; end;
#Biaff(mlph::MLP, mlpd::MLP, input::Int, output::Int)=Biaff(mlph, mlpd , param(output,input), param0(output))
Biaff(rhid::Int, mlph::Int, m::Int, input::Int, output::Int) =
    Biaff(MLP(rhid,mlph, m) , MLP(rhid,mlph, m) , param(m,m;atype=Array{Float64}), param0(m;atype=Array{Float64}))

(bi::Biaff)(x) = begin
    xp = permutedims(x, [3, 1, 2])
    hidsize = size(x)[1]
    T = size(x)[ndims(x)]
    #x = reshape(x, hidsize, T)
    archead = bi.mlphead(x)
    arcdep = bi.mlpdep(x[:,:,2:end])
    archeadp = permutedims(reshape(archead, size(archead,1), size(x,2), size(x,3)), [3, 1, 2])
    arcdepp = permutedims(reshape(arcdep, size(arcdep,1), size(x,2), size(x,3)-1), [1, 3, 2])
    HW = batchmult(archeadp,bi.w)
    HWH = batchmult(HW, arcdepp)
    Hb = batchmult(archeadp,bi.b)
    S = HWH .+ Hb    
end

In [629]:
@doc param

```
param(array; atype)
param(dims...; init, atype)
param0(dims...; atype)
```

The first form returns `Param(atype(array))` where `atype=identity` is the default.

The second form Returns a randomly initialized `Param(atype(init(dims...)))`. By default, `init` is `xavier` and `atype` is `KnetArray{Float32}` if `gpu() >= 0`, `Array{Float32}` otherwise. 

The third form `param0` is an alias for `param(dims...; init=zeros)`.


In [578]:
print(s1)

[1 1 0; 0 0 0; 0 0 1]

In [511]:
s1 = [1 1 0;
      0 0 0;
      0 0 1]

3×3 Array{Int64,2}:
 1  1  0
 0  0  0
 0  0  1

In [527]:
a1 = [1; 2; 1]

3-element Array{Int64,1}:
 1
 2
 1

In [528]:
Knet.nll(s1,a1)

1.2181113805987176

In [141]:
@doc map

```
map(f, c...) -> collection
```

Transform collection `c` by applying `f` to each element. For multiple collection arguments, apply `f` elementwise.

See also: [`mapslices`](@ref)

# Examples

```jldoctest
julia> map(x -> x * 2, [1, 2, 3])
3-element Array{Int64,1}:
 2
 4
 6

julia> map(+, [1, 2, 3], [10, 20, 30])
3-element Array{Int64,1}:
 11
 22
 33
```


In [508]:
array[3,2,2]

15

In [142]:
array = reshape(1:27,3,3,3)

3×3×3 reshape(::UnitRange{Int64}, 3, 3, 3) with eltype Int64:
[:, :, 1] =
 1  4  7
 2  5  8
 3  6  9

[:, :, 2] =
 10  13  16
 11  14  17
 12  15  18

[:, :, 3] =
 19  22  25
 20  23  26
 21  24  27

In [143]:
array2 = reshape(1:9,3,3)

3×3 reshape(::UnitRange{Int64}, 3, 3) with eltype Int64:
 1  4  7
 2  5  8
 3  6  9

In [144]:
@doc view

```
view(A, inds...)
```

Like [`getindex`](@ref), but returns a view into the parent array `A` with the given indices instead of making a copy.  Calling [`getindex`](@ref) or [`setindex!`](@ref) on the returned `SubArray` computes the indices to the parent array on the fly without checking bounds.

# Examples

```jldoctest
julia> A = [1 2; 3 4]
2×2 Array{Int64,2}:
 1  2
 3  4

julia> b = view(A, :, 1)
2-element view(::Array{Int64,2}, :, 1) with eltype Int64:
 1
 3

julia> fill!(b, 0)
2-element view(::Array{Int64,2}, :, 1) with eltype Int64:
 0
 0

julia> A # Note A has changed even though we modified b
2×2 Array{Int64,2}:
 0  2
 0  4
```


In [145]:
wmat = [2 2 2; 2 2 2; 2 2 2]

3×3 Array{Int64,2}:
 2  2  2
 2  2  2
 2  2  2

In [146]:
array2 * wmat

3×3 Array{Int64,2}:
 24  24  24
 30  30  30
 36  36  36

In [147]:
(array2*wmat for i in array2[:,:, i])

UndefVarError: UndefVarError: i not defined

In [148]:
x = [1 2 3]
y = [4 5 6]

1×3 Array{Int64,2}:
 4  5  6

In [149]:
a = []
push!(a,x)
push!(a,y)

2-element Array{Any,1}:
 [1 2 3]
 [4 5 6]

In [150]:
cat(a, dims=2)

2×1 Array{Any,2}:
 [1 2 3]
 [4 5 6]

In [151]:
[wmat*array[:,:,i] for i in 1:size(array,3)]

3-element Array{Array{Int64,2},1}:
 [12 30 48; 12 30 48; 12 30 48]         
 [66 84 102; 66 84 102; 66 84 102]      
 [120 138 156; 120 138 156; 120 138 156]

In [152]:
cat([wmat*array[:,:,i] for i in 1:size(array,3)]..., dims=3)

3×3×3 Array{Int64,3}:
[:, :, 1] =
 12  30  48
 12  30  48
 12  30  48

[:, :, 2] =
 66  84  102
 66  84  102
 66  84  102

[:, :, 3] =
 120  138  156
 120  138  156
 120  138  156

In [499]:
function batchmult(w,x;dim=3)
    xb = ndims(x)>=3
    wb = ndims(w)>=3
    if(wb && xb)
        return cat(collect([w[:,:,i]*x[:,:,i] for i in 1:size(x,dim)])..., dims=dim)
    elseif(wb)
        return cat(collect([w[:,:,i]*x for i in 1:size(x,dim)])..., dims=dim)
    elseif(xb)
        return cat(collect([w*x[:,:,i] for i in 1:size(x,dim)])..., dims=dim)
    else
        return wb*xb
    end
end

batchmult (generic function with 3 methods)

In [495]:
ndims(wmat)

2

In [502]:
batchmult(array,array)

3×3×3 Array{Int64,3}:
[:, :, 1] =
 30  66  102
 36  81  126
 42  96  150

[:, :, 2] =
 435  552  669
 468  594  720
 501  636  771

[:, :, 3] =
 1326  1524  1722
 1386  1593  1800
 1446  1662  1878

In [481]:
wmat.*array[:,:,1:size(array,3)]

3×3×3 Array{Int64,3}:
[:, :, 1] =
 2   8  14
 4  10  16
 6  12  18

[:, :, 2] =
 20  26  32
 22  28  34
 24  30  36

[:, :, 3] =
 38  44  50
 40  46  52
 42  48  54

In [477]:
[w*x[:,:,i] for x in array, i in 1:size(x,3)]

MethodError: MethodError: no method matching getindex(::Int64, ::Colon, ::Colon, ::Int64)
Closest candidates are:
  getindex(::Number, !Matched::Integer...) at number.jl:82
  getindex(::Number) at number.jl:75
  getindex(::Number, !Matched::Integer) at number.jl:77
  ...

In [482]:
batchmult(wmat,array)

MethodError: MethodError: no method matching *(::Array{Int64,3}, ::Array{Int64,3})
Closest candidates are:
  *(::Any, ::Any, !Matched::Any, !Matched::Any...) at operators.jl:502
  *(!Matched::Number, ::AbstractArray) at arraymath.jl:52
  *(::AbstractArray, !Matched::Number) at arraymath.jl:55
  ...

In [474]:
wmat

3×3 Array{Int64,2}:
 2  2  2
 2  2  2
 2  2  2

In [475]:
array

3×3×3 reshape(::UnitRange{Int64}, 3, 3, 3) with eltype Int64:
[:, :, 1] =
 1  4  7
 2  5  8
 3  6  9

[:, :, 2] =
 10  13  16
 11  14  17
 12  15  18

[:, :, 3] =
 19  22  25
 20  23  26
 21  24  27

In [155]:
size(array,3)

3

In [156]:
asdres = map((x,y)->(x[1]*x[2]), (array2,wmat))

MethodError: MethodError: no method matching (::getfield(Main, Symbol("##35#36")))(::Base.ReshapedArray{Int64,2,UnitRange{Int64},Tuple{}})
Closest candidates are:
  #35(::Any, !Matched::Any) at In[156]:1

In [157]:
@doc LinearAlgebra.eye

No documentation found.

Binding `LinearAlgebra.eye` does not exist.


In [158]:
a = zeros(3,2)
b = zeros(2,5)
a*b

3×5 Array{Float64,2}:
 0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0

In [159]:
a,s = iterate(ita2)

UndefVarError: UndefVarError: ita2 not defined

In [160]:
@doc Knet.nll

```
nll(scores, answers; dims=1, average=true)
```

Given an unnormalized `scores` matrix and an `Integer` array of correct `answers`, return the per-instance negative log likelihood. `dims=1` means instances are in columns, `dims=2` means instances are in rows.  Use `average=false` to return the sum instead of per-instance average.

```
nll(model, data; dims=1, average=true, o...)
```

Compute `nll(model(x; o...), y; dims)` for `(x,y)` in `data` and return the per-instance average (if average=true) or total (if average=false) negative log likelihood.


In [161]:
struct itarray
    a
end
function Base.iterate(it::itarray, s...)
    if s > length(it)
        return nothing
    else
        return it.a[s]
    end
end
Base.length(it::itarray) = length(it.a)

In [162]:
ita2 = (x for x in array)

Base.Generator{Base.ReshapedArray{Int64,3,UnitRange{Int64},Tuple{}},getfield(Main, Symbol("##37#38"))}(getfield(Main, Symbol("##37#38"))(), [1 4 7; 2 5 8; 3 6 9]

[10 13 16; 11 14 17; 12 15 18]

[19 22 25; 20 23 26; 21 24 27])

In [163]:
a2 = [x for x in array]

3×3×3 Array{Int64,3}:
[:, :, 1] =
 1  4  7
 2  5  8
 3  6  9

[:, :, 2] =
 10  13  16
 11  14  17
 12  15  18

[:, :, 3] =
 19  22  25
 20  23  26
 21  24  27

In [164]:
datxit = (x for x in datax)

Base.Generator{Array{Any,1},getfield(Main, Symbol("##41#42"))}(getfield(Main, Symbol("##41#42"))(), Any[Any["Al", "-", "Zaman", ":", "American", "forces", "killed", "Shaikh", "Abdullah", "al"  …  "the", "town", "of", "Qaim", ",", "near", "the", "Syrian", "border", "."], Any["[", "This", "killing", "of", "a", "respected", "cleric", "will", "be", "causing", "us", "trouble", "for", "years", "to", "come", ".", "]"], Any["DPA", ":", "Iraqi", "authorities", "announced", "that", "they", "had", "busted", "up", "3", "terrorist", "cells", "operating", "in", "Baghdad", "."], Any["Two", "of", "them", "were", "being", "run", "by", "2", "officials", "of", "the", "Ministry", "of", "the", "Interior", "!"], Any["The", "MoI", "in", "Iraq", "is", "equivalent", "to", "the", "US", "FBI"  …  "members", "of", "the", "Weathermen", "bombers", "back", "in", "the", "1960s", "."], Any["The", "third", "was", "being", "run", "by", "the", "head", "of", "an", "investment", "firm", "."], Any["You", "wonder", "if", "he

In [599]:
@doc Knet.Data

```
minibatch(x, [y], batchsize; shuffle, partial, xtype, ytype, xsize, ysize)
```

Return an iterator of minibatches [(xi,yi)...] given data tensors x, y and batchsize.  

The last dimension of x and y give the number of instances and should be equal. `y` is optional, if omitted a sequence of `xi` will be generated rather than `(xi,yi)` tuples.  Use `repeat(d,n)` for multiple epochs, `Iterators.take(d,n)` for a partial epoch, and `Iterators.cycle(d)` to cycle through the data forever (this can be used with `converge`). If you need the iterator to continue from its last position when stopped early (e.g. by a break in a for loop), use `Iterators.Stateful(d)` (by default the iterator would restart from the beginning).

Keyword arguments:

  * `shuffle=false`: Shuffle the instances every epoch.
  * `partial=false`: If true include the last partial minibatch < batchsize.
  * `xtype=typeof(x)`: Convert xi in minibatches to this type.
  * `ytype=typeof(y)`: Convert yi in minibatches to this type.
  * `xsize=size(x)`: Convert xi in minibatches to this shape.
  * `ysize=size(y)`: Convert yi in minibatches to this shape.


In [601]:
@doc Knet.repeat

```
repeat(A::AbstractArray, counts::Integer...)
```

Construct an array by repeating array `A` a given number of times in each dimension, specified by `counts`.

# Examples

```jldoctest
julia> repeat([1, 2, 3], 2)
6-element Array{Int64,1}:
 1
 2
 3
 1
 2
 3

julia> repeat([1, 2, 3], 2, 3)
6×3 Array{Int64,2}:
 1  1  1
 2  2  2
 3  3  3
 1  1  1
 2  2  2
 3  3  3
```

```
repeat(A::AbstractArray; inner=ntuple(x->1, ndims(A)), outer=ntuple(x->1, ndims(A)))
```

Construct an array by repeating the entries of `A`. The i-th element of `inner` specifies the number of times that the individual entries of the i-th dimension of `A` should be repeated. The i-th element of `outer` specifies the number of times that a slice along the i-th dimension of `A` should be repeated. If `inner` or `outer` are omitted, no repetition is performed.

# Examples

```jldoctest
julia> repeat(1:2, inner=2)
4-element Array{Int64,1}:
 1
 1
 2
 2

julia> repeat(1:2, outer=2)
4-element Array{Int64,1}:
 1
 2
 1
 2

julia> repeat([1 2; 3 4], inner=(2, 1), outer=(1, 3))
4×6 Array{Int64,2}:
 1  2  1  2  1  2
 1  2  1  2  1  2
 3  4  3  4  3  4
 3  4  3  4  3  4
```

```
repeat(s::AbstractString, r::Integer)
```

Repeat a string `r` times. This can be written as `s^r`.

See also: [`^`](@ref)

# Examples

```jldoctest
julia> repeat("ha", 3)
"hahaha"
```

```
repeat(c::AbstractChar, r::Integer) -> String
```

Repeat a character `r` times. This can equivalently be accomplished by calling [`c^r`](@ref ^).

# Examples

```jldoctest
julia> repeat('A', 3)
"AAA"
```


In [677]:
clcdata5 = collect(data5)

12543-element Array{Tuple{Array{Float64,3},Array{Int64,1}},1}:
 ([-0.75898; -0.47426; … ; -0.014116; 0.6448]

[0.54214; 1.0302; … ; 0.18317; 0.4338]

[-0.16768; 1.2151; … ; 0.30893; 0.11023]

...

[0.12417; 1.0659; … ; -0.41358; -1.4449]

[0.38784; -0.36196; … ; -0.41707; -1.1415]

[0.15164; 0.30177; … ; 0.016413; 0.10216], [1, 2, 2, 2, 7, 8, 2, 8, 9, 9  …  22, 19, 24, 22, 22, 29, 29, 29, 22, 2])               
 ([-0.75898; -0.47426; … ; -0.014116; 0.6448]

[-0.61201; 0.98226; … ; 0.30649; 0.81745]

[0.53074; 0.40117; … ; 0.1444; 0.23611]

...

[0.5267; 0.11441; … ; -0.2983; 0.020482]

[0.15164; 0.30177; … ; 0.016413; 0.10216]

[-0.6297; 0.69044; … ; 0.25744; 1.0729], [11, 4, 11, 8, 8, 8, 4, 11, 11, 1, 11, 11, 15, 11, 17, 15, 11, 11])                      
 ([-0.75898; -0.47426; … ; -0.014116; 0.6448]

[-0.15362; -0.49874; … ; 0.056634; -0.47371]

[-0.17587; 1.3508; … ; 0.5665; 0.61385]

...

[0.33042; 0.24995; … ; -0.027273; -0.53285]

[0.83281; -0.22828; … ; -0.41874; -0.25097]

[0.1

In [680]:
length(clcdata5)

12543

In [682]:
dtrn = clcdata5[1:8000]
dtst = clcdata5[8001:12000]

4000-element Array{Tuple{Array{Float64,3},Array{Int64,1}},1}:
 ([-0.75898; -0.47426; … ; -0.014116; 0.6448]

[0.68491; 0.32385; … ; -0.1693; 0.062375]

[0.96193; 0.012516; … ; -0.38468; -0.38712]

...

[0.26818; 0.14346; … ; -0.25028; -0.38097]

[0.43433; 0.72169; … ; 0.14362; -0.93825]

[0.013441; 0.23682; … ; 0.044691; 0.30392], [3, 1, 6, 6, 3, 6, 11, 11, 11, 6  …  17, 17, 13, 19, 13, 21, 19, 23, 19, 3])                                 
 ([-0.75898; -0.47426; … ; -0.014116; 0.6448]

[0.16788; 0.10003; … ; 0.57603; 0.039515]

[-0.25472; 0.13607; … ; 0.25924; 0.68466]

...

[0.013441; 0.23682; … ; 0.044691; 0.30392]

[-0.32005; 0.1444; … ; -0.33944; -0.089125]

[0.3205; -0.26331; … ; 0.084629; -0.69395], [1, 4, 2, 7, 7, 4, 10, 10, 2, 14  …  45, 37, 48, 48, 37, 50, 37, 53, 53, 37])                                
 ([-0.75898; -0.47426; … ; -0.014116; 0.6448]

[-0.61554; 0.47275; … ; -0.19855; -0.39879]

[0.8052; 0.37121; … ; -0.20902; 0.47612]

...

[1.2487; 0.087988; … ; -0.56704; -0.1

In [732]:
dtrn = convert(Knet.Data, dtrn)

MethodError: MethodError: Cannot `convert` an object of type Array{Tuple{Array{Float64,3},Array{Int64,1}},1} to an object of type Data
Closest candidates are:
  convert(::Type{T}, !Matched::T) where T at essentials.jl:154

In [615]:
[iterate(data5)[1]]

1-element Array{Tuple{Array{Float64,3},Array{Int64,1}},1}:
 ([-0.75898; -0.47426; … ; -0.014116; 0.6448]

[0.54214; 1.0302; … ; 0.18317; 0.4338]

[-0.16768; 1.2151; … ; 0.30893; 0.11023]

...

[0.12417; 1.0659; … ; -0.41358; -1.4449]

[0.38784; -0.36196; … ; -0.41707; -1.1415]

[0.15164; 0.30177; … ; 0.016413; 0.10216], [1, 2, 2, 2, 7, 8, 2, 8, 9, 9  …  22, 19, 24, 22, 22, 29, 29, 29, 22, 2])

In [683]:
# For running experiments
function trainresults(file,model; o...)
    if (print("Train from scratch? "); readline()[1]=='y')
        takeevery(n,itr) = (x for (i,x) in enumerate(itr) if i % n == 1)
        r = ((model(dtrn), model(dtst), zeroone(model,dtrn), zeroone(model,dtst))
             for x in takeevery(length(dtrn), progress(sgd(model,repeat(dtrn,2)))))
        r = reshape(collect(Float32,flatten(r)),(4,:))
        Knet.save(file,"results",r)
        Knet.gc() # To save gpu memory
    else
        isfile(file) || download("http://people.csail.mit.edu/deniz/models/tutorial/$file",file)
        r = Knet.load(file,"results")
    end
    println(minimum(r,dims=2))
    return r
end

trainresults (generic function with 1 method)

In [735]:
# For running experiments
function trainresults(file,model; o...)
    if (print("Train from scratch? "); readline()[1]=='y')
        takeevery(n,itr) = (x for (i,x) in enumerate(itr) if i % n == 1)
        r = ((model(dtrn,1,1), model(dtst,1,1), zeroone(model,dtrn), zeroone(model,dtst))
             for x in takeevery(length(dtrn), progress(adam(model,repeat(dtrn,25)))))
        r = reshape(collect(Float32,flatten(r)),(4,:))
        Knet.save(file,"results",r)
        Knet.gc() # To save gpu memory
    else
        isfile(file) || download("http://people.csail.mit.edu/deniz/models/tutorial/$file",file)
        r = Knet.load(file,"results")
    end
    println(minimum(r,dims=2))
    return r
end

trainresults (generic function with 1 method)

In [686]:
zeroone(mymodel1,dtrn)

0.44383983333462895

In [688]:
zeroone(mymodel1,dtst)

0.3700570167371713

In [689]:
zeroone(mymodel2,dtrn)

0.9543924565263019

In [690]:
zeroone(mymodel2,dtst)

0.9578352032370793

In [721]:
collect(mean(mymodel2(x,y)) for (x,y) in dtrn)

8000-element Array{Float64,1}:
 3.4013262946080154
 2.9445139923910455
 2.890014048476892 
 2.833665594836151 
 3.610792494901225 
 2.6392085490833916
 2.6389081854716823
 2.8330951977016197
 3.5836647209932364
 3.044733973843193 
 2.9953588011890853
 3.401196953372921 
 2.9959895680252835
 ⋮                 
 2.197129255745929 
 2.397617231689629 
 3.044770177687923 
 3.135344376546868 
 2.485110164167745 
 2.94452519933547  
 1.6092654086523668
 2.995813691311004 
 2.079744283345211 
 3.0907028855436933
 3.044398750582876 
 2.943909500454036 

In [717]:
@doc Knet.mean

```
mean(itr)
```

Compute the mean of all elements in a collection.

!!! note
    If `itr` contains `NaN` or [`missing`](@ref) values, the result is also `NaN` or `missing` (`missing` takes precedence if array contains both). Use the [`skipmissing`](@ref) function to omit `missing` entries and compute the mean of non-missing values.


# Examples

```jldoctest
julia> mean(1:20)
10.5

julia> mean([1, missing, 3])
missing

julia> mean(skipmissing([1, missing, 3]))
2.0
```

```
mean(f::Function, itr)
```

Apply the function `f` to each element of collection `itr` and take the mean.

```jldoctest
julia> mean(√, [1, 2, 3])
1.3820881233139908

julia> mean([√1, √2, √3])
1.3820881233139908
```

```
mean(A::AbstractArray; dims)
```

Compute the mean of an array over the given dimensions.

!!! compat "Julia 1.1"
    `mean` for empty arrays requires at least Julia 1.1.


# Examples

```jldoctest
julia> A = [1 2; 3 4]
2×2 Array{Int64,2}:
 1  2
 3  4

julia> mean(A, dims=1)
1×2 Array{Float64,2}:
 2.0  3.0

julia> mean(A, dims=2)
2×1 Array{Float64,2}:
 1.5
 3.5
```


In [704]:
dtrn[1][1]

50×1×30 Array{Float64,3}:
[:, :, 1] =
 -0.7589799761772156  
 -0.47426000237464905 
  0.47369998693466187 
  0.7724999785423279  
 -0.7806400060653687  
  0.23232999444007874 
  0.0461140014231205  
  0.8401399850845337  
  0.243709996342659   
  0.022978000342845917
  0.5396400094032288  
 -0.36100998520851135 
  0.9419800043106079  
  ⋮                   
  0.035413000732660294
  0.5883399844169617  
  0.4543899893760681  
 -0.8425400257110596  
  0.10649999976158142 
 -0.059397000819444656
  0.09044899791479111 
  0.30581000447273254 
 -0.6142399907112122  
  0.7895399928092957  
 -0.014116000384092331
  0.6448000073432922  

[:, :, 2] =
  0.542140007019043  
  1.0302000045776367 
  0.8689600229263306 
  0.5001400113105774 
  0.9518200159072876 
 -1.3366999626159668 
 -0.4010699987411499 
  0.3922699987888336 
  0.536620020866394  
  0.48791998624801636
 -0.8468700051307678 
 -0.6293799877166748 
 -1.3402999639511108 
  ⋮                  
 -0.7340899705886841 
  1.3209999799728394 

In [702]:
mymodel2.layers[1].inputSize

50

In [709]:
@doc Knet.cget

No documentation found.

Binding `Knet.cget` does not exist.


In [693]:
asd = ((x,y) for (x,y) in dtrn)

Base.Generator{Array{Tuple{Array{Float64,3},Array{Int64,1}},1},getfield(Main, Symbol("##245#246"))}(getfield(Main, Symbol("##245#246"))(), Tuple{Array{Float64,3},Array{Int64,1}}[([-0.75898; -0.47426; … ; -0.014116; 0.6448]

[0.54214; 1.0302; … ; 0.18317; 0.4338]

[-0.16768; 1.2151; … ; 0.30893; 0.11023]

...

[0.12417; 1.0659; … ; -0.41358; -1.4449]

[0.38784; -0.36196; … ; -0.41707; -1.1415]

[0.15164; 0.30177; … ; 0.016413; 0.10216], [1, 2, 2, 2, 7, 8, 2, 8, 9, 9  …  22, 19, 24, 22, 22, 29, 29, 29, 22, 2]), ([-0.75898; -0.47426; … ; -0.014116; 0.6448]

[-0.61201; 0.98226; … ; 0.30649; 0.81745]

[0.53074; 0.40117; … ; 0.1444; 0.23611]

...

[0.5267; 0.11441; … ; -0.2983; 0.020482]

[0.15164; 0.30177; … ; 0.016413; 0.10216]

[-0.6297; 0.69044; … ; 0.25744; 1.0729], [11, 4, 11, 8, 8, 8, 4, 11, 11, 1, 11, 11, 15, 11, 17, 15, 11, 11]), ([-0.75898; -0.47426; … ; -0.014116; 0.6448]

[-0.15362; -0.49874; … ; 0.056634; -0.47371]

[-0.17587; 1.3508; … ; 0.5665; 0.61385]

...

[0.33042; 0.24995

In [695]:
iterate(asd)[1][1]

50×1×30 Array{Float64,3}:
[:, :, 1] =
 -0.7589799761772156  
 -0.47426000237464905 
  0.47369998693466187 
  0.7724999785423279  
 -0.7806400060653687  
  0.23232999444007874 
  0.0461140014231205  
  0.8401399850845337  
  0.243709996342659   
  0.022978000342845917
  0.5396400094032288  
 -0.36100998520851135 
  0.9419800043106079  
  ⋮                   
  0.035413000732660294
  0.5883399844169617  
  0.4543899893760681  
 -0.8425400257110596  
  0.10649999976158142 
 -0.059397000819444656
  0.09044899791479111 
  0.30581000447273254 
 -0.6142399907112122  
  0.7895399928092957  
 -0.014116000384092331
  0.6448000073432922  

[:, :, 2] =
  0.542140007019043  
  1.0302000045776367 
  0.8689600229263306 
  0.5001400113105774 
  0.9518200159072876 
 -1.3366999626159668 
 -0.4010699987411499 
  0.3922699987888336 
  0.536620020866394  
  0.48791998624801636
 -0.8468700051307678 
 -0.6293799877166748 
 -1.3402999639511108 
  ⋮                  
 -0.7340899705886841 
  1.3209999799728394 

In [692]:
mymodel1(dtst)

AssertionError: AssertionError: X == r.inputSize

In [734]:
mymodel2(dtrn,1,1)

2.514376439046779

In [None]:
mymodel2(dtst)

In [662]:
collect(flatten(Knet.progress(Knet.sgd(mymodel1,dtrn))))

1.02e+00  68.22%┣█████████████▋      ┫ 8557/12543 [10:14/14:59, 13.95i/s]33/12543 [00:05/31:35, 6.62i/s]46/12543 [00:06/27:19, 7.65i/s]┫ 51/12543 [00:07/29:04, 7.19i/s]54/12543 [00:08/32:39, 6.40i/s]67/12543 [00:12/37:02, 5.65i/s]79/12543 [00:14/37:56, 5.51i/s]121/12543 [00:16/28:22, 7.37i/s]137/12543 [00:19/28:58, 7.22i/s]147/12543 [00:20/28:29, 7.34i/s]┫ 156/12543 [00:21/28:12, 7.41i/s]163/12543 [00:22/28:34, 7.32i/s]185/12543 [00:24/27:27, 7.62i/s]202/12543 [00:27/27:37, 7.57i/s]213/12543 [00:28/27:11, 7.69i/s]239/12543 [00:31/27:11, 7.69i/s]┫ 249/12543 [00:32/26:57, 7.76i/s]265/12543 [00:34/27:10, 7.69i/s]283/12543 [00:38/28:17, 7.39i/s]326/12543 [00:48/31:01, 6.74i/s]┫ 366/12543 [00:50/28:49, 7.25i/s]392/12543 [00:53/28:10, 7.42i/s]412/12543 [00:55/27:50, 7.51i/s]443/12543 [00:59/28:02, 7.46i/s]492/12543 [01:04/27:06, 7.71i/s]501/12543 [01:05/27:03, 7.73i/s]507/12543 [01:06/27:09, 7.70i/s]534/12543 [01:08/26:34, 7.87i/s]556/12543 [01:09/25:56, 8.06i/s]577/12543 [01:12/26:15, 7.96i

12543-element Array{Float64,1}:
 3.3972679682479403 
 2.944077958326243  
 2.8880662027033206 
 2.8314236571895517 
 3.610724608512998  
 2.6377387327405724 
 2.6357795900340952 
 2.831147497143261  
 3.582933695115336  
 3.04301685864481   
 2.994654805826147  
 3.4008567352251506 
 2.9949588454527585 
 ⋮                  
 0.35260356557272277
 0.3669984439431611 
 2.0905570706705277 
 1.3207603488167554 
 1.3575671494623547 
 0.9675048087907924 
 0.7501239384432881 
 1.6698215053740249 
 1.2444769867806043 
 0.8196766186693428 
 1.2821853223097497 
 0.4985160145118432 

In [None]:
r11 = trainresults("myfile1.jld2",mymodel2)

Train from scratch? stdin> y
2.05e+00  1.70%┣▎                 ┫ 3395/200000 [11:04/10:52:17, 5.11i/s]1.40e+00  4.01%┣▋                 ┫ 7992/200000 [21:14/08:51:24, 6.27i/s]3408/200000 [11:06/10:51:52, 5.11i/s]3419/200000 [11:09/10:51:49, 5.11i/s]3458/200000 [11:13/10:48:48, 5.14i/s]3474/200000 [11:14/10:47:07, 5.15i/s]3515/200000 [11:19/10:43:50, 5.18i/s]3537/200000 [11:22/10:42:54, 5.18i/s]3563/200000 [11:26/10:41:22, 5.20i/s]3612/200000 [11:31/10:37:55, 5.23i/s]3623/200000 [11:33/10:37:55, 5.23i/s]3628/200000 [11:34/10:38:01, 5.22i/s]3641/200000 [11:36/10:36:46, 5.23i/s]3678/200000 [11:39/10:33:08, 5.26i/s]3705/200000 [11:44/10:33:18, 5.26i/s]3709/200000 [11:45/10:33:31, 5.26i/s]                 ┫ 3726/200000 [11:49/10:34:18, 5.26i/s]3737/200000 [11:51/10:34:20, 5.25i/s]3753/200000 [11:54/10:34:29, 5.25i/s]3769/200000 [11:58/10:34:35, 5.25i/s]3798/200000 [12:02/10:33:28, 5.26i/s]3802/200000 [12:03/10:33:41, 5.26i/s]3809/200000 [12:05/10:34:24, 5.25i/s]3812/200000 [12:06/10:34:56, 

Excessive output truncated after 524289 bytes.

In [166]:
@doc converge

```
converge(itr; alpha=0.1)
```

Return an iterator which acts exactly like `itr`, but quits when values from `itr` stop decreasing. `itr` should produce numeric values.

It can be used to train a model with the data cycled:

```
progress!(converge(minimize(model,cycle(data))))
```

`alpha` controls the exponential average of values to detect convergence. Here is how convergence is decided:

```
p = x - avgx
avgx = c.alpha * x + (1-c.alpha) * avgx
avgp = c.alpha * p + (1-c.alpha) * avgp
avgp > 0.0 && return nothing
```

`converge!(...)` is equivalent to `(for x in converge(...) end)`, i.e.  iterates over the object created by `converge(...)` and returns `nothing`.


In [167]:
every(n,itr) = (x for (i,x) in enumerate(itr) if i%n == 0);

In [606]:
iterate(data5)[1][2]

29-element Array{Int64,1}:
  1
  2
  2
  2
  7
  8
  2
  8
  9
  9
  9
  9
  9
  ⋮
  8
 22
 22
 19
 24
 22
 22
 29
 29
 29
 22
  2

In [168]:
abc = every(10, 1:100)

Base.Generator{Base.Iterators.Filter{getfield(Main, Symbol("##52#54")){Int64},Base.Iterators.Enumerate{UnitRange{Int64}}},getfield(Main, Symbol("##51#53"))}(getfield(Main, Symbol("##51#53"))(), Base.Iterators.Filter{getfield(Main, Symbol("##52#54")){Int64},Base.Iterators.Enumerate{UnitRange{Int64}}}(getfield(Main, Symbol("##52#54")){Int64}(10), Base.Iterators.Enumerate{UnitRange{Int64}}(1:100)))

In [169]:
@doc Knet.RNN

```
rnn = RNN(inputSize, hiddenSize; opts...)
rnn(x; batchSizes) => y
rnn.h, rnn.c  # hidden and cell states
```

`RNN` returns a callable RNN object `rnn`. Given a minibatch of sequences `x`, `rnn(x)` returns `y`, the hidden states of the final layer for each time step. `rnn.h` and `rnn.c` fields can be used to set the initial hidden states and read the final hidden states of all layers.  Note that the final time step of `y` always contains the final hidden state of the last layer, equivalent to `rnn.h` for a single layer network.

**Dimensions:** The input `x` can be 1, 2, or 3 dimensional and `y` will have the same number of dimensions as `x`. size(x)=(X,[B,T]) and size(y)=(H/2H,[B,T]) where X is inputSize, B is batchSize, T is seqLength, H is hiddenSize, 2H is for bidirectional RNNs. By default a 1-D `x` represents a single instance for a single time step, a 2-D `x` represents a single minibatch for a single time step, and a 3-D `x` represents a sequence of identically sized minibatches for multiple time steps. The output `y` gives the hidden state (of the final layer for multi-layer RNNs) for each time step. The fields `rnn.h` and `rnn.c` represent the hidden states of all layers in a single time step and have size (H,B,L/2L) where L is numLayers and 2L is for bidirectional RNNs.

**batchSizes:** If `batchSizes=nothing` (default), all sequences in a minibatch are assumed to be the same length. If `batchSizes` is an array of (non-increasing) integers, it gives us the batch size for each time step (allowing different sequences in the minibatch to have different lengths). In this case `x` will typically be 2-D with the second dimension representing variable size batches for time steps. If `batchSizes` is used, `sum(batchSizes)` should equal `length(x) ÷ size(x,1)`. When the batch size is different in every time step, hidden states will have size (H,B,L/2L) where B is always the size of the first (largest) minibatch.

**Hidden states:** The hidden and cell states are kept in `rnn.h` and `rnn.c` fields (the cell state is only used by LSTM). They can be initialized during construction using the `h` and `c` keyword arguments, or modified later by direct assignment. Valid values are `nothing` (default), `0`, or an array of the right type and size possibly wrapped in a `Param`. If the value is `nothing` the initial state is assumed to be zero and the final state is discarded keeping the value `nothing`. If the value is `0` the initial state is assumed to be zero and `0` is replaced by the final state on return. If the value is a valid state, it is used as the initial state and is replaced by the final state on return.

In a differentiation context the returned final hidden states will be wrapped in `Result` types. This is necessary if the same RNN object is to be called multiple times in a single iteration. Between iterations (i.e. after diff/update) the hidden states need to be unboxed with e.g. `rnn.h = value(rnn.h)` to prevent spurious dependencies. This happens automatically during the backward pass for GPU RNNs but needs to be done manually for CPU RNNs. See the [CharLM Tutorial](https://github.com/denizyuret/Knet.jl/blob/master/tutorial/80.charlm.ipynb) for an example.

**Keyword arguments for RNN:**

  * `h=nothing`: Initial hidden state.
  * `c=nothing`: Initial cell state.
  * `rnnType=:lstm` Type of RNN: One of :relu, :tanh, :lstm, :gru.
  * `numLayers=1`: Number of RNN layers.
  * `bidirectional=false`: Create a bidirectional RNN if `true`.
  * `dropout=0`: Dropout probability. Applied to input and between layers.
  * `skipInput=false`: Do not multiply the input with a matrix if `true`.
  * `dataType=Float32`: Data type to use for weights.
  * `algo=0`: Algorithm to use, see CUDNN docs for details.
  * `seed=0`: Random number seed for dropout. Uses `time()` if 0.
  * `winit=xavier`: Weight initialization method for matrices.
  * `binit=zeros`: Weight initialization method for bias vectors.
  * `usegpu=(gpu()>=0)`: GPU used by default if one exists.

**Formulas:** RNNs compute the output h[t] for a given iteration from the recurrent input h[t-1] and the previous layer input x[t] given matrices W, R and biases bW, bR from the following equations:

`:relu` and `:tanh`: Single gate RNN with activation function f:

```
h[t] = f(W * x[t] .+ R * h[t-1] .+ bW .+ bR)
```

`:gru`: Gated recurrent unit:

```
i[t] = sigm(Wi * x[t] .+ Ri * h[t-1] .+ bWi .+ bRi) # input gate
r[t] = sigm(Wr * x[t] .+ Rr * h[t-1] .+ bWr .+ bRr) # reset gate
n[t] = tanh(Wn * x[t] .+ r[t] .* (Rn * h[t-1] .+ bRn) .+ bWn) # new gate
h[t] = (1 - i[t]) .* n[t] .+ i[t] .* h[t-1]
```

`:lstm`: Long short term memory unit with no peephole connections:

```
i[t] = sigm(Wi * x[t] .+ Ri * h[t-1] .+ bWi .+ bRi) # input gate
f[t] = sigm(Wf * x[t] .+ Rf * h[t-1] .+ bWf .+ bRf) # forget gate
o[t] = sigm(Wo * x[t] .+ Ro * h[t-1] .+ bWo .+ bRo) # output gate
n[t] = tanh(Wn * x[t] .+ Rn * h[t-1] .+ bWn .+ bRn) # new gate
c[t] = f[t] .* c[t-1] .+ i[t] .* n[t]               # cell output
h[t] = o[t] .* tanh(c[t])
```


In [170]:
@doc view

```
view(A, inds...)
```

Like [`getindex`](@ref), but returns a view into the parent array `A` with the given indices instead of making a copy.  Calling [`getindex`](@ref) or [`setindex!`](@ref) on the returned `SubArray` computes the indices to the parent array on the fly without checking bounds.

# Examples

```jldoctest
julia> A = [1 2; 3 4]
2×2 Array{Int64,2}:
 1  2
 3  4

julia> b = view(A, :, 1)
2-element view(::Array{Int64,2}, :, 1) with eltype Int64:
 1
 3

julia> fill!(b, 0)
2-element view(::Array{Int64,2}, :, 1) with eltype Int64:
 0
 0

julia> A # Note A has changed even though we modified b
2×2 Array{Int64,2}:
 0  2
 0  4
```


In [171]:
@doc getindex

```
getindex(type[, elements...])
```

Construct a 1-d array of the specified type. This is usually called with the syntax `Type[]`. Element values can be specified using `Type[a,b,c,...]`.

# Examples

```jldoctest
julia> Int8[1, 2, 3]
3-element Array{Int8,1}:
 1
 2
 3

julia> getindex(Int8, 1, 2, 3)
3-element Array{Int8,1}:
 1
 2
 3
```

```
getindex(collection, key...)
```

Retrieve the value(s) stored at the given key or index within a collection. The syntax `a[i,j,...]` is converted by the compiler to `getindex(a, i, j, ...)`.

# Examples

```jldoctest
julia> A = Dict("a" => 1, "b" => 2)
Dict{String,Int64} with 2 entries:
  "b" => 2
  "a" => 1

julia> getindex(A, "a")
1
```

```
getindex(A, inds...)
```

Return a subset of array `A` as specified by `inds`, where each `ind` may be an `Int`, an [`AbstractRange`](@ref), or a [`Vector`](@ref). See the manual section on [array indexing](@ref man-array-indexing) for details.

# Examples

```jldoctest
julia> A = [1 2; 3 4]
2×2 Array{Int64,2}:
 1  2
 3  4

julia> getindex(A, 1)
1

julia> getindex(A, [2, 1])
2-element Array{Int64,1}:
 3
 1

julia> getindex(A, 2:4)
3-element Array{Int64,1}:
 3
 2
 4
```

```
getindex(tree::GitTree, target::AbstractString) -> GitObject
```

Look up `target` path in the `tree`, returning a [`GitObject`](@ref) (a [`GitBlob`](@ref) in the case of a file, or another [`GitTree`](@ref) if looking up a directory).

# Examples

```julia
tree = LibGit2.GitTree(repo, "HEAD^{tree}")
readme = tree["README.md"]
subtree = tree["test"]
runtests = subtree["runtests.jl"]
```

```
v = sd[k]
```

Argument `sd` is a SortedDict and `k` is a key. In an expression, this retrieves the value (`v`) associated with the key (or `KeyError` if none). On the left-hand side of an assignment, this assigns or reassigns the value associated with the key. (For assigning and reassigning, see also `insert!` below.) Time: O(*c* log *n*)

```
cb[i]
```

Get the i-th element of CircularBuffer.

  * `cb[1]` to get the element at the front
  * `cb[end]` to get the element at the back


In [172]:
@doc sort!

```
sort!(v; alg::Algorithm=defalg(v), lt=isless, by=identity, rev::Bool=false, order::Ordering=Forward)
```

Sort the vector `v` in place. `QuickSort` is used by default for numeric arrays while `MergeSort` is used for other arrays. You can specify an algorithm to use via the `alg` keyword (see Sorting Algorithms for available algorithms). The `by` keyword lets you provide a function that will be applied to each element before comparison; the `lt` keyword allows providing a custom "less than" function; use `rev=true` to reverse the sorting order. These options are independent and can be used together in all possible combinations: if both `by` and `lt` are specified, the `lt` function is applied to the result of the `by` function; `rev=true` reverses whatever ordering specified via the `by` and `lt` keywords.

# Examples

```jldoctest
julia> v = [3, 1, 2]; sort!(v); v
3-element Array{Int64,1}:
 1
 2
 3

julia> v = [3, 1, 2]; sort!(v, rev = true); v
3-element Array{Int64,1}:
 3
 2
 1

julia> v = [(1, "c"), (3, "a"), (2, "b")]; sort!(v, by = x -> x[1]); v
3-element Array{Tuple{Int64,String},1}:
 (1, "c")
 (2, "b")
 (3, "a")

julia> v = [(1, "c"), (3, "a"), (2, "b")]; sort!(v, by = x -> x[2]); v
3-element Array{Tuple{Int64,String},1}:
 (3, "a")
 (2, "b")
 (1, "c")
```

```
sort!(A; dims::Integer, alg::Algorithm=defalg(v), lt=isless, by=identity, rev::Bool=false, order::Ordering=Forward)
```

Sort the multidimensional array `A` along dimension `dims`. See [`sort!`](@ref) for a description of possible keyword arguments.

To sort slices of an array, refer to [`sortslices`](@ref).

!!! compat "Julia 1.1"
    This function requires at least Julia 1.1.


# Examples

```jldoctest
julia> A = [4 3; 1 2]
2×2 Array{Int64,2}:
 4  3
 1  2

julia> sort!(A, dims = 1); A
2×2 Array{Int64,2}:
 1  2
 4  3

julia> sort!(A, dims = 2); A
2×2 Array{Int64,2}:
 1  2
 3  4
```


In [173]:
Parse1(;o...) = 
Chain(tirtEmbed(), RNN(te_out, hi; h=0, c=0, o...), hiMLP())

Parse1 (generic function with 1 method)

In [696]:
@doc Knet.RNN

```
rnn = RNN(inputSize, hiddenSize; opts...)
rnn(x; batchSizes) => y
rnn.h, rnn.c  # hidden and cell states
```

`RNN` returns a callable RNN object `rnn`. Given a minibatch of sequences `x`, `rnn(x)` returns `y`, the hidden states of the final layer for each time step. `rnn.h` and `rnn.c` fields can be used to set the initial hidden states and read the final hidden states of all layers.  Note that the final time step of `y` always contains the final hidden state of the last layer, equivalent to `rnn.h` for a single layer network.

**Dimensions:** The input `x` can be 1, 2, or 3 dimensional and `y` will have the same number of dimensions as `x`. size(x)=(X,[B,T]) and size(y)=(H/2H,[B,T]) where X is inputSize, B is batchSize, T is seqLength, H is hiddenSize, 2H is for bidirectional RNNs. By default a 1-D `x` represents a single instance for a single time step, a 2-D `x` represents a single minibatch for a single time step, and a 3-D `x` represents a sequence of identically sized minibatches for multiple time steps. The output `y` gives the hidden state (of the final layer for multi-layer RNNs) for each time step. The fields `rnn.h` and `rnn.c` represent the hidden states of all layers in a single time step and have size (H,B,L/2L) where L is numLayers and 2L is for bidirectional RNNs.

**batchSizes:** If `batchSizes=nothing` (default), all sequences in a minibatch are assumed to be the same length. If `batchSizes` is an array of (non-increasing) integers, it gives us the batch size for each time step (allowing different sequences in the minibatch to have different lengths). In this case `x` will typically be 2-D with the second dimension representing variable size batches for time steps. If `batchSizes` is used, `sum(batchSizes)` should equal `length(x) ÷ size(x,1)`. When the batch size is different in every time step, hidden states will have size (H,B,L/2L) where B is always the size of the first (largest) minibatch.

**Hidden states:** The hidden and cell states are kept in `rnn.h` and `rnn.c` fields (the cell state is only used by LSTM). They can be initialized during construction using the `h` and `c` keyword arguments, or modified later by direct assignment. Valid values are `nothing` (default), `0`, or an array of the right type and size possibly wrapped in a `Param`. If the value is `nothing` the initial state is assumed to be zero and the final state is discarded keeping the value `nothing`. If the value is `0` the initial state is assumed to be zero and `0` is replaced by the final state on return. If the value is a valid state, it is used as the initial state and is replaced by the final state on return.

In a differentiation context the returned final hidden states will be wrapped in `Result` types. This is necessary if the same RNN object is to be called multiple times in a single iteration. Between iterations (i.e. after diff/update) the hidden states need to be unboxed with e.g. `rnn.h = value(rnn.h)` to prevent spurious dependencies. This happens automatically during the backward pass for GPU RNNs but needs to be done manually for CPU RNNs. See the [CharLM Tutorial](https://github.com/denizyuret/Knet.jl/blob/master/tutorial/80.charlm.ipynb) for an example.

**Keyword arguments for RNN:**

  * `h=nothing`: Initial hidden state.
  * `c=nothing`: Initial cell state.
  * `rnnType=:lstm` Type of RNN: One of :relu, :tanh, :lstm, :gru.
  * `numLayers=1`: Number of RNN layers.
  * `bidirectional=false`: Create a bidirectional RNN if `true`.
  * `dropout=0`: Dropout probability. Applied to input and between layers.
  * `skipInput=false`: Do not multiply the input with a matrix if `true`.
  * `dataType=Float32`: Data type to use for weights.
  * `algo=0`: Algorithm to use, see CUDNN docs for details.
  * `seed=0`: Random number seed for dropout. Uses `time()` if 0.
  * `winit=xavier`: Weight initialization method for matrices.
  * `binit=zeros`: Weight initialization method for bias vectors.
  * `usegpu=(gpu()>=0)`: GPU used by default if one exists.

**Formulas:** RNNs compute the output h[t] for a given iteration from the recurrent input h[t-1] and the previous layer input x[t] given matrices W, R and biases bW, bR from the following equations:

`:relu` and `:tanh`: Single gate RNN with activation function f:

```
h[t] = f(W * x[t] .+ R * h[t-1] .+ bW .+ bR)
```

`:gru`: Gated recurrent unit:

```
i[t] = sigm(Wi * x[t] .+ Ri * h[t-1] .+ bWi .+ bRi) # input gate
r[t] = sigm(Wr * x[t] .+ Rr * h[t-1] .+ bWr .+ bRr) # reset gate
n[t] = tanh(Wn * x[t] .+ r[t] .* (Rn * h[t-1] .+ bRn) .+ bWn) # new gate
h[t] = (1 - i[t]) .* n[t] .+ i[t] .* h[t-1]
```

`:lstm`: Long short term memory unit with no peephole connections:

```
i[t] = sigm(Wi * x[t] .+ Ri * h[t-1] .+ bWi .+ bRi) # input gate
f[t] = sigm(Wf * x[t] .+ Rf * h[t-1] .+ bWf .+ bRf) # forget gate
o[t] = sigm(Wo * x[t] .+ Ro * h[t-1] .+ bWo .+ bRo) # output gate
n[t] = tanh(Wn * x[t] .+ Rn * h[t-1] .+ bWn .+ bRn) # new gate
c[t] = f[t] .* c[t-1] .+ i[t] .* n[t]               # cell output
h[t] = o[t] .* tanh(c[t])
```


In [648]:
Biaff(rhid::Int, mlph::Int, m::Int) =
    Biaff(MLP(rhid,mlph, m) , MLP(rhid,mlph, m) , param(m,m;atype=Array{Float64}), param0(m;atype=Array{Float64}))

Biaff

In [697]:
fModel(tembed::Int, rnnh::Int, mlp1h::Int, mlp1o::Int; o...) = 
    Chain(RNN(tembed, rnnh; bidirectional=true, rnnType = :lstm, dataType=Float64,o...), Biaff(2*rnnh,mlp1h,mlp1o))

fModel (generic function with 3 methods)

In [650]:
mymodel1 = fModel(50,100,100,50)

Chain((LSTM(input=50,hidden=100,bidirectional), Biaff(Chain((Dense(P(Array{Float64,2}(100,200)), P(Array{Float64,1}(100)), Knet.relu), Dense(P(Array{Float64,2}(50,100)), P(Array{Float64,1}(50)), Knet.relu))), Chain((Dense(P(Array{Float64,2}(100,200)), P(Array{Float64,1}(100)), Knet.relu), Dense(P(Array{Float64,2}(50,100)), P(Array{Float64,1}(50)), Knet.relu))), P(Array{Float64,2}(50,50)), P(Array{Float64,1}(50)))))

In [736]:
mymodel2 = fModel(50,100,100,50)

Chain((LSTM(input=50,hidden=100,bidirectional,Float64), Biaff(Chain((Dense(P(Array{Float64,2}(100,200)), P(Array{Float64,1}(100)), Knet.relu), Dense(P(Array{Float64,2}(50,100)), P(Array{Float64,1}(50)), Knet.relu))), Chain((Dense(P(Array{Float64,2}(100,200)), P(Array{Float64,1}(100)), Knet.relu), Dense(P(Array{Float64,2}(50,100)), P(Array{Float64,1}(50)), Knet.relu))), P(Array{Float64,2}(50,50)), P(Array{Float64,1}(50)))))

In [664]:
mymodel1(wembedmat[:,dat1x],dat1y .+ 1)

1.8001002342890495

In [588]:
data5 = ((wembedmat[:,x],y .+ 1) for (x,y) in data4)

Base.Generator{Base.Generator{Base.Iterators.Zip{Tuple{Base.Generator{Array{Any,1},getfield(Main, Symbol("##128#129"))},Array{Any,1}}},getfield(Main, Symbol("##182#183"))},getfield(Main, Symbol("##192#193"))}(getfield(Main, Symbol("##192#193"))(), Base.Generator{Base.Iterators.Zip{Tuple{Base.Generator{Array{Any,1},getfield(Main, Symbol("##128#129"))},Array{Any,1}}},getfield(Main, Symbol("##182#183"))}(getfield(Main, Symbol("##182#183"))(), Base.Iterators.Zip{Tuple{Base.Generator{Array{Any,1},getfield(Main, Symbol("##128#129"))},Array{Any,1}}}((Base.Generator{Array{Any,1},getfield(Main, Symbol("##128#129"))}(getfield(Main, Symbol("##128#129"))(), Any[Any["Al", "-", "Zaman", ":", "American", "forces", "killed", "Shaikh", "Abdullah", "al"  …  "the", "town", "of", "Qaim", ",", "near", "the", "Syrian", "border", "."], Any["[", "This", "killing", "of", "a", "respected", "cleric", "will", "be", "causing", "us", "trouble", "for", "years", "to", "come", ".", "]"], Any["DPA", ":", "Iraqi", "auth

In [676]:
mymodel2(dtrn[1151]...)

3.3327302102828806

In [592]:
iterate(data5)[1][2]

29-element Array{Int64,1}:
  1
  2
  2
  2
  7
  8
  2
  8
  9
  9
  9
  9
  9
  ⋮
  8
 22
 22
 19
 24
 22
 22
 29
 29
 29
 22
  2

In [548]:
wembedmat[:,dat1x]

50×1×30 Array{Float64,3}:
[:, :, 1] =
 -0.7589799761772156  
 -0.47426000237464905 
  0.47369998693466187 
  0.7724999785423279  
 -0.7806400060653687  
  0.23232999444007874 
  0.0461140014231205  
  0.8401399850845337  
  0.243709996342659   
  0.022978000342845917
  0.5396400094032288  
 -0.36100998520851135 
  0.9419800043106079  
  ⋮                   
  0.035413000732660294
  0.5883399844169617  
  0.4543899893760681  
 -0.8425400257110596  
  0.10649999976158142 
 -0.059397000819444656
  0.09044899791479111 
  0.30581000447273254 
 -0.6142399907112122  
  0.7895399928092957  
 -0.014116000384092331
  0.6448000073432922  

[:, :, 2] =
  0.542140007019043  
  1.0302000045776367 
  0.8689600229263306 
  0.5001400113105774 
  0.9518200159072876 
 -1.3366999626159668 
 -0.4010699987411499 
  0.3922699987888336 
  0.536620020866394  
  0.48791998624801636
 -0.8468700051307678 
 -0.6293799877166748 
 -1.3402999639511108 
  ⋮                  
 -0.7340899705886841 
  1.3209999799728394 

In [546]:
dat1 = iterate(data4)[1]

([399999 319 … 719 3], Any[0, 1, 1, 1, 6, 7, 1, 7, 8, 8  …  21, 18, 23, 21, 21, 28, 28, 28, 21, 1])

In [547]:
dat1x,dat1y = dat1

([399999 319 … 719 3], Any[0, 1, 1, 1, 6, 7, 1, 7, 8, 8  …  21, 18, 23, 21, 21, 28, 28, 28, 21, 1])

In [534]:
dat1y

29-element Array{Any,1}:
  0
  1
  1
  1
  6
  7
  1
  7
  8
  8
  8
  8
  8
  ⋮
  7
 21
 21
 18
 23
 21
 21
 28
 28
 28
 21
  1

In [175]:
rnn1 = RNN(3, 5; bidirectional=true, rnnType = :lstm)

LSTM(input=3,hidden=5,bidirectional)

In [176]:
mlp1 = MLP()

MethodError: MethodError: no method matching MLP()
Closest candidates are:
  MLP(!Matched::Int64, !Matched::Int64, !Matched::Int64) at In[118]:1

In [177]:
input = Knet.xavier(3,2,5)

3×2×5 Array{Float64,3}:
[:, :, 1] =
 0.17894     0.196969 
 0.0921046  -0.0318884
 0.0704419  -0.321619 

[:, :, 2] =
  0.115746   0.398939
  0.331386  -0.34378 
 -0.230788  -0.247579

[:, :, 3] =
  0.0822033  -0.20733 
 -0.201223    0.258837
  0.190816    0.115263

[:, :, 4] =
 -0.150135   -0.123681  
 -0.415456   -0.00392453
  0.0200101   0.193123  

[:, :, 5] =
  0.139295   -0.238648
 -0.174997   -0.106833
  0.0394548   0.35436 

In [178]:
rnn1(input)

10×2×5 Array{Float64,3}:
[:, :, 1] =
 -0.0128885    0.00329075 
  0.0343607   -0.0249696  
  0.00397492  -0.0085281  
 -0.00670756  -0.0269048  
  0.00658488  -0.0289667  
 -0.0148214    0.000935341
 -0.0206585    0.00638483 
  0.014658    -0.0264543  
  0.025681     0.0467773  
 -0.0156895   -0.0252407  

[:, :, 2] =
 -0.0433838    0.0326755 
  0.0460932   -0.0444737 
 -0.012586    -0.00552732
 -0.00820949  -0.0684148 
 -0.032245    -0.0210779 
  0.0128878   -0.00412239
 -0.00738294  -0.00678317
 -0.00307534  -0.019106  
  0.0389119   -0.00107047
 -0.0194554   -0.0142148 

[:, :, 3] =
 -0.00957498    0.00542634
  0.0319245     0.00889233
  0.000888995  -0.0145068 
 -0.0105941    -0.0111612 
  0.011582     -0.0128324 
 -0.0291192     0.0331274 
 -0.0157241    -0.00833054
 -0.023667      0.0269522 
 -0.0583481    -0.0301172 
  0.00321347    0.0250973 

[:, :, 4] =
  0.0285486    0.00450237
 -0.0386955    0.0150998 
  0.00664119  -0.00145946
 -0.00249189   0.0052149 
  0.0123522    0.009

In [179]:
tmodel = fModel(20, 5, 10, 10, 10, 10, 10)

Chain((LSTM(input=20,hidden=10,bidirectional),))

In [180]:
input = Knet.xavier(3,1,5)
[Knet.KnetArray32(x) for x in input]

UndefVarError: UndefVarError: KnetArray32 not defined

In [181]:
arrin = atype(zeros(2, 1, 2))
arrin[:,1, 1] = [1. 2.]
#arrin[:,1, 2] = tirtembed("dfhgjfyk")

1×2 Array{Float64,2}:
 1.0  2.0

In [182]:
tirtembed("dfhgdfjfyk")

20×1 Array{Float32,2}:
  0.8608544  
  0.8492885  
 -0.48142886 
  2.665367   
 -0.050460823
 -0.72113055 
  1.3265398  
  0.6482137  
 -1.4457464  
 -1.255041   
  0.41842613 
 -0.41712037 
 -0.025142353
 -0.18107042 
 -0.16562504 
 -0.19828318 
 -0.25851956 
  1.1739769  
 -1.1310492  
 -0.013492593

In [183]:
tmodel(reshape([tirtembed("dfhgdfjfyk") tirtembed("dfhgdfjfyk")], 20, 1, 2))

20×1×2 Array{Float32,3}:
[:, :, 1] =
  0.20342048 
 -0.23905772 
 -0.096778885
 -0.13804154 
  0.062011335
 -0.19228645 
  0.010738692
  0.05998249 
  0.14119059 
  0.07197984 
 -0.01608018 
 -0.15639299 
 -0.17113964 
  0.14826685 
  0.10379549 
 -0.06747454 
 -0.17891692 
  0.22242771 
 -0.06071747 
 -0.22232723 

[:, :, 2] =
  0.20199183  
 -0.3184071   
 -0.1432167   
 -0.23403578  
  0.10931119  
 -0.23149896  
 -0.004477276 
  0.07848018  
  0.21460247  
  0.10378643  
  0.0033652722
 -0.11604335  
 -0.12570937  
  0.0892244   
  0.0703403   
 -0.03680204  
 -0.118299015 
  0.17206894  
 -0.033298485 
 -0.20176002  

In [184]:
reshape([tirtembed("dfhgdfjfyk") tirtembed("dfasdgsdffjfyk")], 20, 1, 2)

20×1×2 Array{Float32,3}:
[:, :, 1] =
  0.8608544  
  0.8492885  
 -0.48142886 
  2.665367   
 -0.050460823
 -0.72113055 
  1.3265398  
  0.6482137  
 -1.4457464  
 -1.255041   
  0.41842613 
 -0.41712037 
 -0.025142353
 -0.18107042 
 -0.16562504 
 -0.19828318 
 -0.25851956 
  1.1739769  
 -1.1310492  
 -0.013492593

[:, :, 2] =
 -0.98158425 
  0.6077837  
 -0.2735724  
 -1.1433774  
  0.6099511  
  0.074774876
  1.4331263  
  0.7742791  
 -0.89767945 
  0.61602724 
  0.5874392  
  0.23272926 
  0.37268496 
  1.4274784  
 -1.1315342  
  0.8104872  
 -0.76209766 
 -2.426628   
 -0.23896657 
  1.1319513  

In [185]:
@doc Knet.minibatch

```
minibatch(x, [y], batchsize; shuffle, partial, xtype, ytype, xsize, ysize)
```

Return an iterator of minibatches [(xi,yi)...] given data tensors x, y and batchsize.  

The last dimension of x and y give the number of instances and should be equal. `y` is optional, if omitted a sequence of `xi` will be generated rather than `(xi,yi)` tuples.  Use `repeat(d,n)` for multiple epochs, `Iterators.take(d,n)` for a partial epoch, and `Iterators.cycle(d)` to cycle through the data forever (this can be used with `converge`). If you need the iterator to continue from its last position when stopped early (e.g. by a break in a for loop), use `Iterators.Stateful(d)` (by default the iterator would restart from the beginning).

Keyword arguments:

  * `shuffle=false`: Shuffle the instances every epoch.
  * `partial=false`: If true include the last partial minibatch < batchsize.
  * `xtype=typeof(x)`: Convert xi in minibatches to this type.
  * `ytype=typeof(y)`: Convert yi in minibatches to this type.
  * `xsize=size(x)`: Convert xi in minibatches to this shape.
  * `ysize=size(y)`: Convert yi in minibatches to this shape.


In [186]:
linmod = Linear(20,5)

Linear(P(Array{Float32,2}(5,20)), P(Array{Float32,1}(5)))

In [187]:
linmod |> Knet.params

2-element Array{Param,1}:
 P(Array{Float32,2}(5,20))
 P(Array{Float32,1}(5))   

In [188]:
linput = zeros(20,1)

20×1 Array{Float64,2}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0

In [189]:
linput2 = Knet.KnetArray{Float32,2}(linput)

ErrorException: KnetPtr: bad device id -1.

In [190]:
linmod(atype(tirtembed("dfhgdfjfyk")))

5×1 Array{Float32,2}:
 -0.20123664
 -0.5370126 
  0.7677018 
  0.76841515
 -0.33187535

In [191]:
d = []
push!(d, (4, 6))

1-element Array{Any,1}:
 (4, 6)