In [1]:
using Distances, Statistics
using MultivariateStats
using PyPlot
using WordTokenizers
using TextAnalysis
using DelimitedFiles
using Embeddings

In [2]:
const embtable = load_embeddings(GloVe{:en},1) # or load_embeddings(FastText_Text) or ...
#Function to return the index of the word in the embedding (returns 0 if the word is not found)
const get_word_index = Dict(word=>ii for (ii,word) in enumerate(embtable.vocab))


Dict{String,Int64} with 400000 entries:
  "newdigate"   => 164100
  "daufuskie"   => 254784
  "single-arm"  => 192007
  "titration"   => 117879
  "qajar"       => 66399
  "pinheiro"    => 38763
  "hospitalet"  => 282158
  "kennedale"   => 223560
  "tetracyclic" => 353804
  "moher"       => 167242
  "entomb"      => 205770
  "vanderwerff" => 103628
  "whiz"        => 27921
  "hi5"         => 281945
  "johnswort"   => 373288
  "11-storey"   => 214529
  "clapboards"  => 108701
  "saïd"        => 121097
  "nóg"         => 199967
  "re-connect"  => 356829
  "raghunathan" => 257153
  "13-michael"  => 314161
  "droeshout"   => 360036
  "sobre"       => 33750
  "cavens"      => 309349
  ⋮             => ⋮

In [3]:
embeddings = embtable.embeddings
vocab = embtable.vocab
vec_size, vocab_size = size(embeddings)
println("Loaded embeddings, each word is represented by a vector with $vec_size features. The vocab size is $vocab_size")

Loaded embeddings, each word is represented by a vector with 50 features. The vocab size is 400000


In [4]:
vec_idx(s) = findfirst(x -> x==s, vocab)
function vec(s) 
    if vec_idx(s)!=nothing
        embeddings[:, vec_idx(s)]
    end    
end

vec (generic function with 1 method)

In [5]:
vec_idx("cheese"); vec("cheese")

50-element Array{Float32,1}:
 -0.053903
 -0.30871 
 -1.3285  
 -0.43342 
  0.31779 
  1.5224  
 -0.6965  
 -0.037086
 -0.83784 
  0.074107
 -0.30532 
 -0.1783  
  1.2337  
  ⋮       
  1.9502  
 -0.53274 
  1.1359  
  0.20027 
  0.02245 
 -0.39379 
  1.0609  
  1.585   
  0.17889 
  0.43556 
  0.68161 
  0.066202

In [6]:
# To compare the similarity of document vectors
cosine(x,y)=1-cosine_dist(x, y)

cosine (generic function with 1 method)

In [7]:
cosine(vec("dog"), vec("puppy")), cosine(vec("man"),vec("boy"))

(0.775492f0, 0.8564431f0)

In [8]:
function closest(v, n=20)
    list=[(x,cosine(embeddings'[x,:], v)) for x in 1:size(embeddings)[2]]
    topn_idx=sort(list, by = x -> x[2], rev=true)[1:n]
    return [vocab[a] for (a,_) in topn_idx]
end

closest (generic function with 2 methods)

In [9]:
closest(vec("elephant"))

20-element Array{String,1}:
 "elephant" 
 "elephants"
 "bird"     
 "crocodile"
 "deer"     
 "sheep"    
 "herd"     
 "pig"      
 "dolphin"  
 "boar"     
 "rabbit"   
 "whale"    
 "cat"      
 "monkey"   
 "lion"     
 "goats"    
 "animals"  
 "animal"   
 "ape"      
 "owl"      

In [10]:
closest(mean([vec("day"), vec("night")]))

20-element Array{String,1}:
 "night"    
 "day"      
 "days"     
 "weekend"  
 "morning"  
 "sunday"   
 "afternoon"
 "saturday" 
 "came"     
 "week"     
 "evening"  
 "coming"   
 "next"     
 "on"       
 "before"   
 "hours"    
 "weeks"    
 "went"     
 "hour"     
 "time"     

In [11]:
blue_to_sky = vec("blue") - vec("sky")
closest(blue_to_sky + vec("grass"))

20-element Array{String,1}:
 "grass"   
 "green"   
 "leaf"    
 "cane"    
 "bamboo"  
 "trees"   
 "grasses" 
 "tree"    
 "yellow"  
 "lawn"    
 "cotton"  
 "lawns"   
 "red"     
 "pink"    
 "farm"    
 "turf"    
 "vine"    
 "rubber"  
 "soft"    
 "chestnut"

In [12]:
closest(vec("man") - vec("woman") + vec("queen"))

20-element Array{String,1}:
 "queen"     
 "king"      
 "prince"    
 "crown"     
 "coronation"
 "royal"     
 "knight"    
 "lord"      
 "lady"      
 "ii"        
 "great"     
 "majesty"   
 "honour"    
 "name"      
 "palace"    
 "crowned"   
 "famous"    
 "throne"    
 "dragon"    
 "named"     

In [13]:
txt = open("pg345.txt") do file
    read(file, String)
end
println("Loaded Dracula, length=$(length(txt)) characters")

Loaded Dracula, length=883114 characters


In [14]:
txt = replace(txt, r"\n|\r|_|,|—" => " ")
txt = replace(txt, r"[\"”“*();!]" => "")
sd=StringDocument(txt)
prepare!(sd, strip_whitespace)
sentences = split_sentences(sd.text)
i=1
for s in 1:length(sentences)
    if length(split(sentences[s]))>3
        sentences[i]=lowercase(replace(sentences[s], "."=>""))
        i+=1
    end
end


In [15]:
sentences[1000:1010]

11-element Array{SubString{String},1}:
 "i can fancy what a wonderful power he must have over his patients"                                                          
 "he has a curious habit of looking one straight in the face as if trying to read one's thoughts"                             
 "he tries this on very much with me but i flatter myself he has got a tough nut to crack"                                    
 "i know that from my glass"                                                                                                  
 "do you ever try to read your own face?"                                                                                     
 "i do and i can tell you it is not a bad study and gives you more trouble than you can well fancy if you have never tried it"
 "he says that i afford him a curious psychological study and i humbly think i do"                                            
 "i do not as you know take sufficient interest in dress to be able to d

In [16]:
# Function to get a mean sentence vector
function sentvec(s) 
    local arr=[]
    for w in split(sentences[s])
        if vec(w)!=nothing
            push!(arr, vec(w))
        end
    end
    if length(arr)==0
        ones(Float32, (50,1))*999
    else
        mean(arr)
    end
end

sentvec (generic function with 1 method)

In [22]:
function closest_sent(input_str, n=20)
    mean_vec_input=mean([vec(w) for w in split(input_str)])
    list=[(x,cosine(mean_vec_input, sentvec(x))) for x in 1:length(sentences)]
    topn_idx=sort(list, by = x -> x[2], rev=true)[1:n]
    return [sentences[a] for (a,_) in topn_idx]
end

closest_sent (generic function with 2 methods)

In [19]:
sentences[100]

"there was everywhere a bewildering mass of fruit blossom--apple plum pear cherry and as we drove by i could see the green grass under the trees spangled with the fallen petals"

In [21]:
sentvec(99)

50-element Array{Float32,1}:
  0.3447293   
  0.39965677  
 -0.054723457 
 -0.07291292  
  0.21394199  
  0.15642972  
 -0.49596983  
 -0.24674776  
 -0.23787305  
 -0.4288543   
 -0.314565    
 -0.18126178  
 -0.15339927  
  ⋮           
  0.08461739  
 -0.20704514  
 -0.22955278  
 -0.011368492 
  0.03529108  
  0.057512715 
 -0.0074529666
  0.02252327  
  0.037329756 
 -0.52179056  
 -0.076994695 
 -0.49725753  

In [23]:
closest_sent("my favorite food is strawberry ice cream")

20-element Array{SubString{String},1}:
 "We get hot soup or coffee or tea and off we go."                                                                                                                                                                                                                                                      
 "we get hot soup or coffee or tea and off we go"                                                                                                                                                                                                                                                       
 "drink it off like a good child"                                                                                                                                                                                                                                                                       
 "this with some cheese and a salad and a bottle of old tokay of which

In [24]:
drac_sent_vecs=[]
for s in 1:length(sentences)
    i==1 ? drac_sent_vecs=sentvec(s) : push!(drac_sent_vecs,sentvec(s))
end

In [25]:
writedlm( "drac_sent_vec.csv",  drac_sent_vecs, ',')
writedlm( "drac_sentences.csv",  sentences, ',')

In [26]:
sentences=readdlm("drac_sentences.csv", '!', String, header=false)
drac_sent_vecs=readdlm("drac_sent_vec.csv", ',', Float32, header=false)


8145×50 Array{Float32,2}:
   0.395886     0.136462     0.0393325   …   -0.00172208   -0.094155  
   0.105341     0.298508    -0.108769        -0.11237       0.108809  
   0.306499     0.372668     0.0499599        0.011585     -0.0269931 
   0.439134     0.237768    -0.157471        -0.047655     -0.206138  
   0.479465     0.0339237    0.0574679       -0.0110334    -0.0810052 
   0.305005     0.236101    -0.167058    …   -0.161612     -0.481633  
   0.274253    -0.103281    -0.0939105       -0.0443089    -0.0691436 
   0.454941     0.308015    -0.376682         0.118407     -0.017146  
   0.280243     0.0355603   -0.371213        -0.054871      0.0895917 
   0.303624     0.24452     -0.259576        -0.0073874     0.372042  
   0.292713     0.0700706   -0.128396    …   -0.0598984     0.0768687 
   0.427364     0.0626689   -0.00844564      -0.0528361     0.20124   
   0.42247      0.139159    -0.134028        -0.109309     -0.322777  
   ⋮                                     ⋱         

In [27]:
function closest_sent_pretrained(pretrained_arr, input_str, n=20)
    mean_vec_input=mean([vec(w) for w in split(input_str)])
    list=[(x,cosine(mean_vec_input, pretrained_arr[x,:])) for x in 1:length(sentences)]
    topn_idx=sort(list, by = x -> x[2], rev=true)[1:n]
    return [sentences[a] for (a,_) in topn_idx]
end

closest_sent_pretrained (generic function with 2 methods)

In [28]:
closest_sent_pretrained(drac_sent_vecs, "i walked into a door")

20-element Array{String,1}:
 "with a glad heart i opened my door and ran down to the hall"                                                                                                                                                                                   
 "i held my door open as he went away and watched him go into his room and close the door"                                                                                                                                                       
 "again a shock: my door was fastened on the outside"                                                                                                                                                                                            
 "suddenly he called out:-- look madam mina look look i sprang up and stood beside him on the rock he handed me his glasses and pointed"                                                                                                         
 "th

MethodError: MethodError: no method matching vec(::SubString{String})
Closest candidates are:
  vec(!Matched::LinearAlgebra.Transpose{T,#s627} where #s627<:(AbstractArray{T,1} where T) where T) at /Users/julia/buildbot/worker/package_macos64/build/usr/share/julia/stdlib/v1.3/LinearAlgebra/src/adjtrans.jl:201
  vec(!Matched::SparseArrays.AbstractSparseArray{Tv,Ti,1} where Ti where Tv) at /Users/julia/buildbot/worker/package_macos64/build/usr/share/julia/stdlib/v1.3/SparseArrays/src/sparsevector.jl:913
  vec(!Matched::AbstractArray{T,1} where T) at abstractarraymath.jl:42
  ...