In [1]:
import spacy

In [2]:
# Load the 'en_core_web_lg' spaCy pipeline once; every later cell goes through `nlp`.
nlp = spacy.load('en_core_web_lg')

In [4]:
# Vector attached to the whole parsed phrase (displayed as the cell output).
nlp('The quick brown fox jumped').vector

array([-2.09217995e-01, -2.78227981e-02, -3.57064009e-02,  1.55218393e-01,
       -1.28050027e-02,  1.31627038e-01, -1.99465990e-01,  4.75811996e-02,
        1.26798794e-01,  1.64792800e+00, -3.57592016e-01, -1.39875397e-01,
       -1.26122087e-02, -2.02728346e-01, -2.25237608e-01,  2.15431936e-02,
        7.78958052e-02,  9.29676056e-01, -2.75549982e-02, -3.71005982e-01,
       -1.42800003e-01, -3.66641544e-02, -1.07376035e-02, -1.84352830e-01,
        2.29006782e-02, -5.17717972e-02, -2.78652012e-01, -1.19738199e-01,
        5.10960072e-03, -2.85990000e-01, -1.58261746e-01,  2.96241999e-01,
        1.09597601e-01, -4.18331996e-02,  1.87256075e-02, -1.03439607e-01,
       -5.10879979e-02, -3.51091917e-03, -6.81461841e-02, -2.05657601e-01,
        1.66347414e-01, -9.31599736e-03, -4.61134054e-02, -1.05457589e-01,
        2.31313989e-01,  1.80005193e-01, -2.06444815e-01, -1.37050152e-02,
        1.70106202e-01, -2.19812002e-02, -2.14003205e-01,  1.07415602e-01,
       -2.80592032e-02, -

In [5]:
nlp('The quick brown fox jumped').vector.shape  # dimensionality of the phrase-level vector

(300,)

In [6]:
# Same check for a single word, for comparison with the phrase-level shape.
nlp('fox').vector.shape

(300,)

In [7]:
# Three nouns to compare against each other token by token.
tokens = nlp('lion cat pet')

In [8]:
# Print the similarity of every ordered pair of tokens (self-pairs included),
# in the same row-major order as two nested loops over `tokens`.
for token1, token2 in ((left, right) for left in tokens for right in tokens):
    print(token1.text, token2.text, token1.similarity(token2))

lion lion 1.0
lion cat 0.5265437
lion pet 0.39923772
cat lion 0.5265437
cat cat 1.0
cat pet 0.7505456
pet lion 0.39923772
pet cat 0.7505456
pet pet 1.0


In [9]:
# Words with different (even opposite) meanings can still score as similar,
# presumably because they occur in similar contexts -- compare love / hate below.
tokens = nlp('like love hate')

In [11]:
# Similarity of every ordered pair of tokens (self-pairs included), row by row.
for token1, token2 in ((left, right) for left in tokens for right in tokens):
    print(token1.text, token2.text, token1.similarity(token2))

like like 1.0
like love 0.65790397
like hate 0.6574652
love like 0.65790397
love love 1.0
love hate 0.6393099
hate like 0.6574652
hate love 0.6393099
hate hate 1.0


In [12]:
nlp.vocab.vectors.shape # (rows in the vector table, dimensions per vector)

(684831, 300)

In [20]:
# Two common nouns plus a proper name, used for the vocabulary checks in the next cell.
tokens = nlp('dog cat Abel')

In [21]:
# Per-token report: text, has_vector, vector_norm, is_oov (OOV = out of vocabulary).
for token in tokens:
    fields = (token.text, token.has_vector, token.vector_norm, token.is_oov)
    print(*fields)

dog True 7.0336733 False
cat True 6.6808186 False
Abel True 5.740055 False


In [22]:
# calculate cosine similarity
from scipy import spatial


def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two 1-D vectors.

    SciPy exposes cosine *distance*; similarity is its complement.

    Args:
        vec1: first vector (any 1-D array-like accepted by SciPy).
        vec2: second vector, same length as ``vec1``.

    Returns:
        ``1 - spatial.distance.cosine(vec1, vec2)``.
    """
    return 1 - spatial.distance.cosine(vec1, vec2)

In [23]:
# Look up the vocabulary vectors for the three words used in the analogy.
king, man, woman = (
    nlp.vocab[term].vector for term in ('king', 'man', 'woman')
)

In [24]:
# king - man + woman ---> NEW_VECTOR, expected to be similar to "queen", "princess", "highness"

In [25]:
# Analogy arithmetic on the raw vectors: subtract `man` from `king`, then add `woman`.
new_vector = king - man + woman

In [26]:
# Collect (lexeme, similarity to new_vector) pairs for the whole vocabulary.
computed_similarities = []

for word in nlp.vocab:
    # Skip entries that fail any of the has_vector / is_lower / is_alpha flags.
    if not (word.has_vector and word.is_lower and word.is_alpha):
        continue
    similarity = cosine_similarity(new_vector, word.vector)
    computed_similarities.append((word, similarity))

In [27]:
# Order the pairs from most to least similar: the key is the negated
# similarity (second element of each pair), so an ascending sort yields descending scores.
computed_similarities.sort(key=lambda pair: -pair[1])

In [28]:
# Show the text of the ten highest-scoring vocabulary entries.
print([lexeme.text for lexeme, _ in computed_similarities[:10]])

['king', 'queen', 'prince', 'kings', 'princess', 'royal', 'throne', 'queens', 'monarch', 'kingdom']
