In [1]:
using  Languages, TextAnalysis, Flux, PyPlot, Statistics, MLDataUtils, Embeddings

# function to return the index of the word in the word dictionary
tk_idx(s) = haskey(word_dict, s) ? i=word_dict[s] : i=0

# Padding the corpus wrt the longest document
function pad_corpus(c, pad_size)
    M=[]
    for doc in 1:length(c)
        tks = tokens(c[doc])
        if length(tks)>=pad_size
            tk_indexes=[tk_idx(w) for w in tks[1:pad_size]]
        end
        if length(tks)<pad_size
            tk_indexes=zeros(Int64,pad_size-length(tks))
            tk_indexes=vcat(tk_indexes, [tk_idx(w) for w in tks])
        end
        doc==1 ? M=tk_indexes' : M=vcat(M, tk_indexes')
    end
    return M
end

accuracy(x, y) = mean(x .== y)

loss(x, y) = sum(Flux.binarycrossentropy.(m(x), y))

map_binary_encoding(labels) = [label == "__label__1" ? 0 : 1 for label in labels ] 
optimizer = opt = ADAM(0.05)

┌ Info: Precompiling TextAnalysis [a2db99b7-8b79-58f8-94bf-bbc811eef33d]
└ @ Base loading.jl:1273
┌ Info: Precompiling Flux [587475ba-b771-5e3f-ad9e-33799f191a9c]
└ @ Base loading.jl:1273
┌ Info: Precompiling PyPlot [d330b81b-6aea-500a-939a-2ce795aea3ee]
└ @ Base loading.jl:1273
┌ Info: Precompiling MLDataUtils [cc2ba9b6-d476-5e6d-8eaf-a92d5412d41d]
└ @ Base loading.jl:1273


ADAM(0.05, (0.9, 0.999), IdDict{Any,Any}())

In [2]:
data_path = "data/amazon_reviews.txt"
τ = 500 # max Iterations
# M = 50 # max_features

f = open(data_path)
doc_array = readlines(f)[1:500]

labels, texts = [], []
for doc in doc_array
    content = split(doc)
    push!(labels,content[1])
    push!(texts,join(content[2:end]," "))
end

# pushing the text from the files to the string documents
docs=[]
for i in 1:length(texts)
    push!(docs, StringDocument(texts[i]))
end


In [42]:
const embtable = load_embeddings(GloVe{:en},3) # or load_embeddings(FastText_Text) or ...
#Function to return the index of the word in the embedding (returns 0 if the word is not found)
const get_word_index = Dict(word=>ii for (ii,word) in enumerate(embtable.vocab))

function get_embedding(word)
    ind = get_word_index[word]
    emb = embtable.embeddings[:,ind]
    return emb
end



get_embedding (generic function with 1 method)

In [43]:
embeddings = embtable.embeddings
vocab = embtable.vocab
embed_size, max_features = size(embeddings)

(200, 400000)

In [45]:
# Building Flux Embeddings
max_features = 200


200

In [54]:
# building a Corpus
corpus=Corpus(docs)

# updating the lexicon and creating the word dict
update_lexicon!(corpus)
doc_term_matrix=DocumentTermMatrix(corpus)
word_dict = doc_term_matrix.column_indices
# splitting words in the document
word_docs = map(s -> split(s,r"[,. ]",keepempty=false),texts)
# pad size is the number of words in the maximum word document
# Can set a fixed length or the max doc length
# pad_size = maximum(length(word_docs[i]) for i in 1:length(texts)) 
pad_size = 5
# padding the docs
padded_docs = pad_corpus(corpus, pad_size)
# forming the data with the labels
x = padded_docs'
y = map_binary_encoding(labels)
data = [(x, y)]

# Building Flux Embeddings
N = size(padded_docs,1)  #Number of documents
# features per word to learn, depends on the size of the corpus, larger corpus will probably need a higher dimension
# max_features = M
# number of words in the vocabulary, should always be higher than the maximum index in our dictionary.
ν = maximum(word_dict)[2] + 1
vocab_size = ν

8063

In [47]:
embedding_matrix=Flux.glorot_normal(max_features, vocab_size)

function vec_idx(s)
    i=findfirst(x -> x==s, vocab)
    i==nothing ? i=0 : i 
end

for term in doc_term_matrix.terms
    if vec_idx(term)!=0
        embedding_matrix[:,word_dict[term]+1]=get_embedding(term)
    end
end 

In [57]:
# Enabling Flux



m = Chain(x -> embedding_matrix * Flux.onehotbatch(reshape(x, pad_size*N), 0:vocab_size-1),
    x -> reshape(x, max_features, pad_size, N),
    x -> sum(x, dims=2),
    x -> reshape(x, max_features, N),
    Dense(max_features,1,σ)
)

loss_h=[]
accuracy_train=[]

for epoch in 1:100
    Flux.train!(loss, Flux.params(m), data, optimizer)
    println(loss(x, y), " ", accuracy(m(x).>0.5,y))
    push!(loss_h, loss(x, y))
    push!(accuracy_train, accuracy(m(x).>0.5,y))
end

print(m(x).>0.5, accuracy(m(x).>0.5,y))

511667.3 0.49412
271282.0 0.494312
240539.44 0.505544
341568.88 0.505832
288060.3 0.505784
188451.64 0.504224
219217.62 0.49448
274076.56 0.494288
249651.9 0.494336
189548.7 0.495608
190446.5 0.505256
231436.03 0.505712
231554.95 0.505712
194984.1 0.505448
177951.2 0.497672
202784.88 0.49436
214996.75 0.494312
195465.36 0.49448
176498.28 0.499064
187605.66 0.505496
201259.0 0.505736
192641.28 0.505664
177095.52 0.503768
180446.47 0.495296
191549.31 0.49448
188113.36 0.494504
176994.94 0.496352
177863.28 0.504248
185630.16 0.505496
183855.78 0.50552
176164.38 0.503984
176709.19 0.4964
182119.61 0.494744
180494.78 0.494792
175262.92 0.497336
176358.27 0.504344
179839.1 0.505256
177879.97 0.50492
174636.6 0.501728
176262.9 0.495968
178129.86 0.49508
175960.55 0.496088
174455.44 0.501464
176210.03 0.504752
176618.28 0.504896
174731.78 0.503264
174632.34 0.497264
175936.47 0.495512
175310.88 0.49592
174191.33 0.500096
174880.47 0.504272
175328.81 0.504608
174389.55 0.503192
174215.72 0.4981

In [58]:
m(x)'

500×1 LinearAlgebra.Adjoint{Float32,Array{Float32,2}}:
 0.50189704
 0.5186824 
 0.500922  
 0.49305755
 0.49638537
 0.49440265
 0.49492964
 0.50299513
 0.5069653 
 0.50857854
 0.491336  
 0.5063892 
 0.4781044 
 ⋮         
 0.5273178 
 0.5022925 
 0.51441485
 0.5055788 
 0.5116832 
 0.5591339 
 0.50550133
 0.4748863 
 0.4912404 
 0.5156409 
 0.4975297 
 0.48050123

In [21]:
Flux.onehotbatch(reshape(x, pad_size*N), 0:vocab_size-1)

2607×17500 Flux.OneHotMatrix{Array{Flux.OneHotVector,1}}:
 1  1  1  1  1  1  1  1  1  1  1  1  1  …  0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  1  0  1  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0  …  0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0  …  0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0 

In [23]:
word_dict

Dict{String,Int64} with 2606 entries:
  "1"           => 20
  "Bill"        => 123
  "neither."    => 1740
  "Bateman"     => 118
  "Cape"        => 146
  "doctor"      => 1073
  "enjoy"       => 1124
  "chocolate"   => 882
  "fight"       => 1225
  "spent"       => 2259
  "regular"     => 2021
  "culture."    => 999
  "artisan"     => 703
  "favorites"   => 1209
  "frustrating" => 1285
  "loosely"     => 1618
  "haze"        => 1386
  "par."        => 1840
  "step"        => 2278
  "Many"        => 340
  "download"    => 1088
  "gives"       => 1321
  "irrelevant"  => 1508
  "lean"        => 1572
  "poised"      => 1912
  ⋮             => ⋮

In [11]:
labelenc(labels,LabelEnc.TrueFalse)

MethodError: MethodError: no method matching labelenc(::MLLabelUtils.LabelEnc.NativeLabels{Any,2,typeof(identity)}, ::Type{MLLabelUtils.LabelEnc.TrueFalse})
Closest candidates are:
  labelenc(::Any) at /Users/asharma19/.julia/packages/MLLabelUtils/g0wUZ/src/labelencoding.jl:385

In [12]:
labels

MLLabelUtils.LabelEnc.NativeLabels{Any,2,typeof(identity)}(identity, Any["__label__2", "__label__1"], Dict{Any,Int64}("__label__2" => 1,"__label__1" => 2))