In [1]:
using  Languages, TextAnalysis, Flux, PyPlot, Statistics, MLDataUtils, Embeddings, MLLabelUtils

# function to return the index of the word in the word dictionary
tk_idx(s) = haskey(word_dict, s) ? i=word_dict[s] : i=0

# Padding the corpus wrt the longest document
function pad_corpus(c, pad_size)
    M=[]
    for doc in 1:length(c)
        tks = tokens(c[doc])
        if length(tks)>=pad_size
            tk_indexes=[tk_idx(w) for w in tks[1:pad_size]]
        end
        if length(tks)<pad_size
            tk_indexes=zeros(Int64,pad_size-length(tks))
            tk_indexes=vcat(tk_indexes, [tk_idx(w) for w in tks])
        end
        doc==1 ? M=tk_indexes' : M=vcat(M, tk_indexes')
    end
    return M
end

accuracy(x, y, model) = mean(Flux.onecold(model(x)) .== Flux.onecold(y))

loss(x, y) = sum(Flux.binarycrossentropy.(m(x), y))

# map_binary_encoding(labels) = [label == "__label__1" ? 0 : 1 for label in labels ] 
optimizer = ADAM(0.1)

ADAM(0.1, (0.9, 0.999), IdDict{Any,Any}())

In [2]:
data_path = "data/amazon_reviews.txt"
τ = 500 # max Iterations
# M = 50 # max_features

f = open(data_path)
doc_array = readlines(f)

labels, texts = [], []
for doc in doc_array
    content = split(doc)
    push!(labels,content[1])
    push!(texts,join(content[2:end]," "))
end

# pushing the text from the files to the string documents
docs=[]
for i in 1:length(texts)
    push!(docs, StringDocument(texts[i]))
end


In [3]:
const embtable = load_embeddings(GloVe{:en},4) # or load_embeddings(FastText_Text) or ...
#Function to return the index of the word in the embedding (returns 0 if the word is not found)
const get_word_index = Dict(word=>ii for (ii,word) in enumerate(embtable.vocab))

function get_embedding(word)
    ind = get_word_index[word]
    emb = embtable.embeddings[:,ind]
    return emb
end

get_embedding (generic function with 1 method)

In [4]:
embeddings = embtable.embeddings
vocab = embtable.vocab
embed_size, max_features = size(embeddings)

(300, 400000)

In [5]:
# Building Flux Embeddings
max_features = 300


300

In [22]:
# building a Corpus
corpus=Corpus(docs)

# updating the lexicon and creating the word dict
update_lexicon!(corpus)
doc_term_matrix=DocumentTermMatrix(corpus)
word_dict = doc_term_matrix.column_indices
# splitting words in the document
word_docs = map(s -> split(s,r"[,. ]",keepempty=false),texts)
# pad size is the number of words in the maximum word document

# Can set a fixed length or the max doc length
# pad_size = maximum(length(word_docs[i]) for i in 1:length(texts)) 
pad_size = 70

# padding the docs
padded_docs = pad_corpus(corpus, pad_size)
# forming the data with the labels
x = padded_docs'
# train_indices = [1:3:150 ; 2:3:150]
# X_train = x[:, train_indices]
# y_train = labels[:, train_indices]
# X_test = x[:, 3:3:150]
# y_test = labels[:, 3:3:150]
(X_train, y_train), (X_test, y_test) = splitobs((x, labels); at = 0.67)
# x_train = Array(transpose(X_train1))
# y_train = Array(y_train1)
# x_test = Array(transpose(X_test1))
# y_test = Array(y_test1)
klasses = sort(unique(labels))
y_train = Flux.onehotbatch(y_train,klasses)
data = [(X_train, y_train)]

# Building Flux Embeddings
N = size(X_train,2)  #Number of documents
# features per word to learn, depends on the size of the corpus, larger corpus will probably need a higher dimension
# max_features = M
# number of words in the vocabulary, should always be higher than the maximum index in our dictionary.
ν = maximum(word_dict)[2] + 1
vocab_size = ν

54077

In [18]:
# Embedding layer for Flux model
# glorot_normal returns an Array of size dims containing random variables taken from a normal distribution with mean 0 and standard deviation (2 / sum(dims)).
embedding_matrix=Flux.glorot_normal(max_features, vocab_size)

function vec_idx(s)
    i=findfirst(x -> x==s, vocab)
    i==nothing ? i=0 : i 
end


for term in doc_term_matrix.terms
    if vec_idx(term)!=0
        embedding_matrix[:,word_dict[term]+1]=get_embedding(term)
    end
end 

In [19]:
# Enabling Flux



m = Chain(x -> embedding_matrix * Flux.onehotbatch(reshape(x, pad_size*N), 0:vocab_size-1),
          x -> reshape(x, max_features, pad_size, N),
          x -> sum(x, dims=2),
          x -> reshape(x, max_features, N),
        LSTM(max_features,100),
        Dense(100,50,relu),
        Dense(50,1,σ)
)


Chain(#25, #26, #27, #28, Recur(LSTMCell(300, 100)), Dense(100, 50, relu), Dense(50, 1, σ))

In [None]:
loss_h=[]
accuracy_train=[]

for epoch in 1:100
    Flux.train!(loss, Flux.params(m), data, optimizer)
    println(loss(X_train, y_train), " ", accuracy(X_train, y_train, m))
    push!(loss_h, loss(X_train, y_train))
    push!(accuracy_train, accuracy(X_train, y_train, m))
end

print(Flux.onecold(m(X_train)), accuracy(X_train, y_train, m))

In [21]:
Flux.onehotbatch(reshape(x, pad_size*N), 0:vocab_size-1)

2607×17500 Flux.OneHotMatrix{Array{Flux.OneHotVector,1}}:
 1  1  1  1  1  1  1  1  1  1  1  1  1  …  0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  1  0  1  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0  …  0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0  …  0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0 

In [23]:
word_dict

Dict{String,Int64} with 2606 entries:
  "1"           => 20
  "Bill"        => 123
  "neither."    => 1740
  "Bateman"     => 118
  "Cape"        => 146
  "doctor"      => 1073
  "enjoy"       => 1124
  "chocolate"   => 882
  "fight"       => 1225
  "spent"       => 2259
  "regular"     => 2021
  "culture."    => 999
  "artisan"     => 703
  "favorites"   => 1209
  "frustrating" => 1285
  "loosely"     => 1618
  "haze"        => 1386
  "par."        => 1840
  "step"        => 2278
  "Many"        => 340
  "download"    => 1088
  "gives"       => 1321
  "irrelevant"  => 1508
  "lean"        => 1572
  "poised"      => 1912
  ⋮             => ⋮

In [77]:
klasses = sort(unique(labels))
ll = labelenc(Flux.onehotbatch(labels,klasses))

MLLabelUtils.LabelEnc.OneOfK{Bool,2}()

In [69]:
labelmap(labels)

Dict{Any,Array{Int64,1}} with 2 entries:
  "__label__2" => [1, 2, 3, 4, 5, 6, 8, 9, 10, 12  …  482, 483, 487, 488, 495, …
  "__label__1" => [7, 11, 14, 15, 16, 20, 21, 23, 26, 27  …  480, 484, 485, 486…

In [84]:
Flux.onehotbatch(labels, klasses)

2×500 Flux.OneHotMatrix{Array{Flux.OneHotVector,1}}:
 0  0  0  0  0  0  1  0  0  0  1  0  0  …  1  1  1  1  1  1  0  0  0  0  0  0
 1  1  1  1  1  1  0  1  1  1  0  1  1     0  0  0  0  0  0  1  1  1  1  1  1

In [18]:
x

70×500 LinearAlgebra.Adjoint{Int64,Array{Int64,2}}:
 1943  2036   214   737  1686  2511  …     0  1047  1674   885   687  1442
 3963  2793     1  1897    13  2335        0  8043  7350  7394  1028  5502
 4274  6965   128   128  1629  5362        0  3726  2525  2856  1484  5583
 7350  3966  2048  1017  2308   128        0     8  2540  7467  1197  7341
 5620  7467  6965  7578  1114  1017        0  7244  6501  2324   128  7469
  128  2549  4968  5199  1502  2497  …   346  4572  4274  8049   705   128
 2048   128  5550  7394  2036  6244     2856  7394  2324  3121  4810  1950
 6961  1017  4132  6965   809  7204     5705    13  6308   128  1510    13
 7518     8  5542  2518   194  2542      705  4404  4845   705  1248  2324
 7828  5299  5680  1017   967  5680     4274  4978   128  4810  4968  4161
 2744  6302  2466  3906  4978  8043  …  7350     1  1017  1510  2534  5680
    1  2324  7444  7758   128  2382     4393   128  7388  1248  4482  7350
 1081  5269    13  4375  1047  7256     6298  20

In [9]:
X_train

70×6700 view(::LinearAlgebra.Adjoint{Int64,Array{Int64,2}}, :, 1:6700) with eltype Int64:
 15788  16415   1531   6144  13827  …      0      0  15190    937      0
 29004  21242      1  15454     31         0      0  31128  10754      0
 30694  47554    903    903  13268         0      0  48458  16295      0
 49788  29022  16463   8403  18254         0      0  44138      1      0
 39074  50409  47554  51096   9041         0      0    903    903      0
   903  19754  34905  36220  12018  …      0      0  16463   1973      0
 16463    903  38572  50034  16415         0      0  34905   5524      0
 47531   8403  29930  47554   6682         0      0  26187  49300      0
 50708     19  38491  19524   1336         0      0  19484  49788      0
 52809  36758  39526   8403   7985         0  11810  19938  51913      0
 20987  43541  19238  28617  34954  …      0  51941  31293  43531      0
     1  18357  50287  52387    903         0  50409   8859  39659      0
  8859  36602     31  31292   8580

In [20]:
reshape(x, pad_size*N)

35000-element reshape(::LinearAlgebra.Adjoint{Int64,Array{Int64,2}}, 35000) with eltype Int64:
 1943
 3963
 4274
 7350
 5620
  128
 2048
 6961
 7518
 7828
 2744
    1
 1081
    ⋮
 7421
 7350
 3927
 6194
 6627
 7242
 2518
    2
 4645
 3839
    2
 7952

In [9]:
EmbeddingLayer(max_features, vocab_size)

MethodError: MethodError: no method matching size(::EmbeddingLayer)
Closest candidates are:
  size(!Matched::BitArray{1}) at bitarray.jl:77
  size(!Matched::BitArray{1}, !Matched::Integer) at bitarray.jl:81
  size(!Matched::Core.Compiler.StmtRange) at show.jl:1598
  ...

In [16]:
typeof(EmbeddingLayer)

DataType

In [16]:
size(X_train,2)*70

469000