In [1]:
import spacy

In [2]:
# Load the spaCy pipeline named "en_core_web_lg"; every later cell uses `nlp`
# both to parse text and to reach the vocabulary (`nlp.vocab`).
nlp = spacy.load("en_core_web_lg")

In [133]:
# Category words the query is scored against.
# (Alternative list kept for reference: "location", "site", "store", "business", "retail".)
thesaurus = [
    "groups",
    "demographics",
    "financial",
    "permissions",
    "stores",
    "location",
    "city",
]

# Term to look up; other candidate queries noted by the author: spend, cost.
query_word = "Boston"

# Parse the query with the loaded spaCy pipeline.
doc = nlp(query_word)

In [134]:
#Match with the words in the thesaurus
# Score the whole query Doc against each thesaurus entry. The previous version
# ran `for d in doc: pass` and then used the leaked loop variable `d`, which
# silently compared only the LAST token of a multi-token query and left `d`
# undefined (or stale from an earlier run) when the query produced no tokens.
# For a single-token query the Doc vector and the token vector coincide, so the
# ranking for "Boston" is unchanged.
similarity = [(t, doc.similarity(nlp(t))) for t in thesaurus]

# Rank from most to least similar and show every (word, score) pair.
sorted_similarity = sorted(similarity, key = lambda x: x[1], reverse = True)
for s in sorted_similarity:
    print (s)

('city', 0.33055230018131787)
('stores', 0.20863909035544678)
('financial', 0.20336634107401494)
('demographics', 0.19947616966692344)
('location', 0.16831890841798733)
('groups', 0.1422418962597447)
('permissions', -0.001401980061050915)


In [135]:
#Find similar words from Spacy Vocab
for d in doc:
    pass
similarity = [(w, d.similarity(w)) for w in d.vocab if w.is_lower == d.is_lower and w.prob >= -15 and d.similarity(w) >= 0.6]
sorted_similarity = sorted(similarity, key = lambda x: x[1], reverse = True)
for s in sorted_similarity[:5]:
    print ((s[0].text,s[1]))

('Boston', 1.0)
('Chicago', 0.8021085)
('Seattle', 0.75405705)
('Baltimore', 0.7518922)
('Philadelphia', 0.75111264)


In [113]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [126]:
# Query text for the vector-based cells below, parsed with the spaCy pipeline.
query_word = "Boston"
doc = nlp(query_word)

In [127]:
#vectorize the query
# Stack the token vectors into one array with a row per token. Building it in a
# single call avoids re-copying the array on every loop iteration, and the row
# width now follows the model's vector size instead of a hard-coded 300.
query_vector = np.array([token.vector for token in doc])

# Average across tokens (axis 0) to get one vector representing the whole query.
query_vector_mean = query_vector.mean(axis = 0)
print(query_vector_mean)

[-2.1798e-01 -2.9048e-01 -5.1610e-02 -2.4831e-01  9.4785e-01  1.2750e-01
 -1.0110e-01  1.5643e-01  7.9737e-01  8.4553e-01 -9.3753e-01 -3.0159e-01
 -2.8020e-01  5.1649e-02 -4.4898e-01  1.6674e-01  2.3522e-01  1.1031e+00
  7.9818e-02 -1.6015e-01  6.0658e-01 -1.3770e-01  2.4351e-01 -5.6038e-01
 -2.6251e-01 -1.0581e-01 -5.8782e-01  3.0618e-01  7.2418e-02  1.1522e-01
  3.9131e-02  3.2026e-01  2.1002e-01  5.8079e-01  3.4478e-02 -2.8583e-01
  7.7341e-02 -5.4439e-02 -5.3835e-02  1.1094e-01 -1.4411e-01  2.1762e-01
  4.2380e-02  1.5384e-01 -6.6202e-02 -1.7764e-02  6.4653e-02  5.1204e-02
 -4.9649e-04 -9.5064e-02 -4.8792e-01  1.3271e-01  1.0807e-01 -2.0808e-01
 -3.9541e-01  7.7592e-02  2.6452e-01  3.2337e-01  6.0399e-02 -2.2518e-01
 -7.2460e-01 -4.7022e-01 -2.1260e-01 -9.4484e-02 -2.1484e-01 -9.0608e-02
 -5.2059e-01  5.2425e-01 -4.8971e-01  5.8252e-02 -3.7937e-01  1.7053e-01
 -1.7469e-01 -1.4227e-01  6.5785e-01 -4.4157e-02 -4.2401e-01  1.6576e-01
 -1.6306e-01  5.0224e-01 -1.9844e-01  9.7174e-01  2

In [128]:
#Match query vector with vectors of thesaurus
# The query row is the same for every comparison, so reshape it once.
# reshape(1, -1) keeps the vector width generic instead of assuming 300.
query_row = query_vector_mean.reshape(1, -1)

# cosine_similarity returns a 1x1 array here; .item() extracts the Python
# scalar. (np.asscalar, used previously, is deprecated and no longer available
# in current NumPy releases.)
similarity = [(word
                  , cosine_similarity(query_row, nlp(word).vector.reshape(1, -1)).item())
                 for word in thesaurus]

# Rank from most to least similar and show every (word, score) pair.
sorted_similarity = sorted(similarity, key = lambda x: x[1], reverse = True)
for s in sorted_similarity:
    print (s)

('stores', 0.20863908529281616)
('financial', 0.20336630940437317)
('demographics', 0.19947616755962372)
('groups', 0.14224185049533844)
('permissions', -0.0014019889058545232)


In [129]:
#Generate similar words from spacy vocab
# Candidate lexemes: `prob` of at least -15 and not flagged as stop words.
candidates = [word for word in nlp.vocab if word.prob >= -15 and not word.is_stop]

similarity = []
if candidates:  # guard: an empty candidate list would yield an unusable matrix
    # Score all candidates with ONE cosine_similarity call (query row vs. a
    # matrix with one row per candidate) instead of one call per lexeme.
    # This also removes np.asscalar, which is deprecated and no longer
    # available in current NumPy releases.
    candidate_matrix = np.array([word.vector for word in candidates])
    scores = cosine_similarity(query_vector_mean.reshape(1, -1), candidate_matrix)[0]
    similarity = [(word.text.lower(), score)
                  for word, score in zip(candidates, scores.tolist())]

# set() drops exact duplicate (lower-cased text, score) pairs before ranking.
sorted_similarity = sorted(set(similarity), key = lambda x: x[1], reverse = True)
for s in sorted_similarity[:5]:
    print (s)

('boston', 1.0000001192092896)
('chicago', 0.8021084070205688)
('seattle', 0.7540571093559265)
('baltimore', 0.7518922090530396)
('philadelphia', 0.7511126399040222)


In [137]:
# Interactive check of the Python type of `query_vector`; the result is only displayed.
type(query_vector)

numpy.ndarray

In [146]:
# Element-wise division of `query_vector` by 5; the result is displayed, not stored.
# NOTE(review): the divisor 5 is unexplained here — confirm what it is meant to represent.
np.divide(query_vector,np.asarray(5))

array([[-4.35959995e-02, -5.80959991e-02, -1.03219999e-02,
        -4.96620014e-02,  1.89569995e-01,  2.54999995e-02,
        -2.02200003e-02,  3.12860012e-02,  1.59474000e-01,
         1.69105992e-01, -1.87505990e-01, -6.03180006e-02,
        -5.60400002e-02,  1.03297997e-02, -8.97959992e-02,
         3.33480015e-02,  4.70440015e-02,  2.20619991e-01,
         1.59636009e-02, -3.20300013e-02,  1.21316001e-01,
        -2.75400020e-02,  4.87019978e-02, -1.12075999e-01,
        -5.25019988e-02, -2.11619996e-02, -1.17564000e-01,
         6.12360016e-02,  1.44835990e-02,  2.30440013e-02,
         7.82619976e-03,  6.40520006e-02,  4.20040004e-02,
         1.16157994e-01,  6.89560035e-03, -5.71659990e-02,
         1.54681997e-02, -1.08877998e-02, -1.07669998e-02,
         2.21880004e-02, -2.88219992e-02,  4.35240008e-02,
         8.47600028e-03,  3.07680015e-02, -1.32403998e-02,
        -3.55280004e-03,  1.29306000e-02,  1.02407997e-02,
        -9.92979985e-05, -1.90127995e-02, -9.75839943e-0