In [1]:
import spacy
from sklearn.metrics.pairwise import cosine_similarity as cosine_similarity_sklearn 
from scipy import spatial

In [2]:
# Load the 'en_core_web_lg' spaCy pipeline once; later cells read word vectors
# through `nlp.vocab`.
nlp = spacy.load('en_core_web_lg')

In [3]:
def cosine_similarity_custom(x, y):
    """Return the cosine similarity between two vectors.

    SciPy's ``spatial.distance.cosine`` returns the cosine *distance*
    (``1 - similarity``), so the similarity is recovered by subtracting
    that distance from 1.

    Args:
        x: First vector, passed straight to ``spatial.distance.cosine``.
        y: Second vector, passed straight to ``spatial.distance.cosine``.

    Returns:
        ``1 - spatial.distance.cosine(x, y)``.
    """
    return 1 - spatial.distance.cosine(x, y)

In [4]:
def word_vector_arithmetic(words_list, n):
    """Print the ``n`` vocabulary words most similar to an analogy vector.

    The analogy vector is ``vec(words_list[0]) - vec(words_list[1]) +
    vec(words_list[2])``. Every entry of ``nlp.vocab`` that has a vector, is
    lower-case and is alphabetic is scored against it twice: once with
    scikit-learn's ``cosine_similarity`` and once with
    ``cosine_similarity_custom``. Both rankings are printed so the two
    implementations can be compared side by side.

    Args:
        words_list: Sequence of words. Only the first three take part in the
            arithmetic; any further entries are looked up but otherwise
            ignored. No check is made that the words have vectors.
        n: Number of top-ranked words to print for each similarity measure.

    Returns:
        None. The results are written to stdout.

    Raises:
        IndexError: If ``words_list`` has fewer than three entries.
    """
    query_vectors = [nlp.vocab[word].vector for word in words_list]
    new_vector = query_vectors[0] - query_vectors[1] + query_vectors[2]
    # sklearn's cosine_similarity is called with 2-D inputs; the query side
    # never changes, so reshape it once rather than on every iteration.
    new_vector_2d = new_vector.reshape(1, -1)

    similarities_sklearn = []
    similarities_custom = []
    # NOTE(review): this visits only the entries `nlp.vocab` yields when
    # iterated; whether that covers every word that has a vector may depend
    # on the installed spaCy / model version -- confirm.
    for lexeme in nlp.vocab:
        if lexeme.has_vector and lexeme.is_lower and lexeme.is_alpha:
            candidate = lexeme.vector
            # The sklearn result is a 1x1 matrix for a single pair; pull the
            # scalar out now so the sort below compares plain numbers.
            cs_sklearn = cosine_similarity_sklearn(new_vector_2d, candidate.reshape(1, -1))[0][0]
            cs_custom = cosine_similarity_custom(new_vector, candidate)
            similarities_sklearn.append((lexeme, cs_sklearn))
            similarities_custom.append((lexeme, cs_custom))

    # Highest similarity first.
    similarities_sklearn.sort(key=lambda item: item[1], reverse=True)
    similarities_custom.sort(key=lambda item: item[1], reverse=True)

    most_similar_sklearn = [(lexeme.text, score) for lexeme, score in similarities_sklearn[:n]]
    most_similar_custom = [(lexeme.text, score) for lexeme, score in similarities_custom[:n]]

    print(f'The {n} most similar words based on sklearn cosine similarities are:\n\n{most_similar_sklearn}')
    print('\n-----------------------------------------\n')
    print(f'The {n} most similar words based on the custom built cosine similarities are:\n\n{most_similar_custom}')

In [5]:
# Analogy demo: 'actor' - 'man' + 'woman', showing the 10 closest words.
word_vector_arithmetic(words_list=['actor', 'man', 'woman'], n=10)

The 10 most similar words based on sklearn cosine similarities are:

[('actress', 0.87436426), ('actor', 0.82521105), ('actresses', 0.70501995), ('actors', 0.68155503), ('starred', 0.62244534), ('starring', 0.6142798), ('portrayed', 0.5630437), ('woman', 0.56105983), ('celebrity', 0.55670214), ('star', 0.5485102)]

-----------------------------------------

The 10 most similar words based on the custom built cosine similarities are:

[('actress', 0.8743642568588257), ('actor', 0.8252111077308655), ('actresses', 0.7050201296806335), ('actors', 0.6815550327301025), ('starred', 0.6224454641342163), ('starring', 0.6142799258232117), ('portrayed', 0.5630437731742859), ('woman', 0.5610598921775818), ('celebrity', 0.5567022562026978), ('star', 0.5485102534294128)]
