In [1]:
## Based on material at:
# - https://jonreeve.com/2019/02/workshop-word-embeddings/
# - https://spacy.io/usage/linguistic-features#vectors-similarity

In [2]:
# Import libs
import spacy
from numpy import dot
from numpy.linalg import norm

In [3]:
# Vector representations are available in medium and large packages
nlp = spacy.load('en_core_web_lg')

In [4]:
# A function to define similarity
def similarity(vecA, vecB):
    return dot(vecA, vecB) / (norm(vecA, ord=2) * norm(vecB, ord=2))

In [5]:
# A function to find similar words greedily. Starting with a base threshold
def mostSimilar(vec):
    highestSimilarities = [0.4]
    highestWords = [""]
    for w in nlp.vocab:
        sim = similarity(vec, w.vector)
        if sim > highestSimilarities[-1]:
            highestSimilarities.append(sim)
            highestWords.append(w.text.lower())
    return list(zip(highestWords, highestSimilarities))[-10:]

In [6]:
# Find vector representations
doc = nlp('queen king woman man')
queen, king, woman, man = doc[0].vector, doc[1].vector, doc[2].vector, doc[3].vector

In [7]:
# Verify
mostSimilar(king)

  This is separate from the ipykernel package so we can avoid doing imports until


[('', 0.4), ('man', 0.40884617), ('king', 1.0000001)]

In [8]:
# Verify
mostSimilar(queen)

  This is separate from the ipykernel package so we can avoid doing imports until


[('', 0.4), ('she', 0.40245172), ('king', 0.7252611), ('queen', 1.0000001)]

In [9]:
# Word embedding analogies. See latest results at:
# - https://kawine.github.io/blog/nlp/2019/06/21/word-analogies.html

In [10]:
# answer: queen - woman + man
x, a, b, y = [nlp(w).vector for w in ['queen', 'woman', 'man', 'king']]
answer = x - a + b
mostSimilar(answer)

  This is separate from the ipykernel package so we can avoid doing imports until


[('', 0.4), ('man', 0.41803107), ('king', 0.7716142), ('queen', 0.77542514)]

In [11]:
# We can also compare analogy with direct result  
dist = similarity(answer, y)
print (f" Distance between two pairs is {dist}.")

 Distance between two pairs is 0.7716141939163208.


In [12]:
# We do for country and capitals
# y = x - a + b
x, a, b, y = [nlp(w).vector for w in ['india', 'france', 'paris', 'new delhi']]
answer = x - a + b
mostSimilar(answer)

  This is separate from the ipykernel package so we can avoid doing imports until


[('', 0.4), ('delhi', 0.7410936), ('india', 0.8412354)]

In [13]:
# We can also compare analogy with direct result  
dist = similarity(answer, y)
print (f" Distance between two pairs is {dist}.")

 Distance between two pairs is 0.6739897727966309.


In [14]:
# We do for religion and places of worship
# y = x - a + b
x, a, b, y = [nlp(w).vector for w in ['christian', 'hindu', 'temple', 'church']]
answer = x - a + b
mostSimilar(answer)

  This is separate from the ipykernel package so we can avoid doing imports until


[('', 0.4), ('christian', 0.61638075), ('temple', 0.65175486)]

In [15]:
# We can also compare analogy with direct result  
dist = similarity(answer, y)
print (f" Distance between two pairs is {dist}.")

 Distance between two pairs is 0.6455529928207397.


In [16]:
# Stats about Spacy's vocab
len(nlp.vocab)

502

In [17]:
# Another way
nlp.meta['vectors']

{'width': 300,
 'vectors': 684831,
 'keys': 684830,
 'name': 'en_core_web_lg.vectors'}