In [4]:
#You must have spaCy and its 'en_core_web_lg' model installed to use this notebook.
#Note: we use the largest spaCy model for improved accuracy and the largest available set of vectors.

import spacy
import numpy as np
from scipy.spatial import distance

# Name of the spaCy pipeline to load; `nlp` is the object every later cell queries.
SPACY_MODEL_NAME = 'en_core_web_lg'
nlp = spacy.load(SPACY_MODEL_NAME)

In [431]:
#This script prints the pairwise similarity scores, as a matrix, of the tokens you have inserted.
#You can use as many tokens as you would like. For the similarity scores used in the analysis, we only used two.
#Tokens are just words
tokens = nlp('necklace jewelry')

#Visit every ordered pair in row-major order, so each token is also compared with itself.
for row_token, col_token in ((a, b) for a in tokens for b in tokens):
    print(row_token.text, col_token.text, row_token.similarity(col_token))

necklace necklace 1.0
necklace jewelry 0.709219217300415
jewelry necklace 0.709219217300415
jewelry jewelry 1.0


In [525]:
#This script presents the 50 closest words within the token's vector space. This is how we derive semantic neighbors.
#To derive the 50 closest words, we only used words that had complete vectors. Words with empty vectors are low frequency and were not used.
#We also did not use 'compound words'. These are defined as words that appear as a neighbor but are just concatenations of real English words, and are not themselves real English words.
#For example, stepmother is not a compound word. However, myfriend is. Both appear within the 50 closest neighbors for the word 'wife'.
#Similarly, compound words frequently have empty vectors.
#We also avoided proper nouns, except where the word has become a proprietary eponym. An example is the neighbor list for aspirin, where words like Tylenol and Motrin appear.
#Lastly, the first 50 words returned frequently did not all meet these requirements, so the number of neighbors requested (n) had to be increased.
#This is likely due, in part, to the size of the spaCy model used. By using the largest model, MANY more words are contained in the vector table.
#However, we did not want to introduce a confound by changing the model size when searching for neighbors.

your_word = "necklace"
#Number of neighbors to request. Increase it when too few of the returned strings
#survive the manual filtering described in the notes at the top of this cell.
n_neighbors = 50

#Fail early with a readable message: indexing the vector table with a key it does
#not contain raises a KeyError that only shows the word's hash.
word_key = nlp.vocab.strings[your_word]
if word_key not in nlp.vocab.vectors:
    raise KeyError("{!r} has no entry in the vector table of the loaded model".format(your_word))

#most_similar takes a 2-D array of query vectors, so the single vector becomes one row.
query = nlp.vocab.vectors[word_key].reshape(1, -1)

#most_similar returns a (keys, best_rows, scores) tuple with one row per query vector.
ms = nlp.vocab.vectors.most_similar(query, n=n_neighbors)
words = [nlp.vocab.strings[w] for w in ms[0][0]]
#ms[2] holds similarity scores (larger means more similar), not distances.
similarities = ms[2]
#Legacy name kept so existing references keep working; the values are similarities.
distances = similarities
print(words)

['wife', 'mywife', 'wife-', 'husband', 'wifey', 'wife--', 'daughter', "husband'll", 'daughter--', 'daughter-', 'husband-', 'goddaughter', 'Alewife', 'stepdaughter', 'daughtersinlaw', 'husbandman', 'granddaughter', 'alewife', 'grandaughter', 'wifely', 'husband--', 'hotwife', 'daughterinlaw', 'Stepdaughter', 'husbandly', 'Hotwife', 'Dibnah', 'widow', 'husbandmen', 'cousin', 'girlfriend', 'Granddaughter', 'sister', 'Younghusband', "wife'll", 'fiancée', 'Kroeber', 'wifes', 'granddaughters', "father'll", 'daughters', 'mother', 'father-', 'fiancées', 'fiancés', 'father`s', 'son', "father'd", 'niece', 'boyfriend', 'father', 'fiancé', 'sister--', 'fatherwas', 'sister-', 'stepdaughters', 'fiancè', 'fatherinlaw', 'grandmother', 'fianc', 'ladyfriend', 'father--', 'myfriend', 'girlfriend-', 'husbands', 'fiancee', 'girlfriend--', 'stepfather', 'daugher', 'fiancees', 'aunt', 'father"s', 'stepmother', 'househusband', 'grandchild', 'Husband', 'grandmotherly', 'daughterCredit', 'fianceé', 'Befriend', "

In [10]:
#This script will find the closest neighbor (by Euclidean distance) to a given word. It is nearly always the word itself.
#So, you can use this script to just print the word 'frog' in a complicated way, if you'd like.
input_word = "frog"
#The query is wrapped in a list so that cdist receives a 2-D array with a single row.
p = np.array([nlp.vocab[input_word].vector])
vector_table = nlp.vocab.vectors
ids = list(vector_table.keys())
#Gather every key's row with one indexing operation instead of one vector-table
#lookup per key; row i of `vectors` still belongs to ids[i].
rows = [vector_table.key2row[key] for key in ids]
vectors = vector_table.data[rows]
#cdist uses the Euclidean metric by default; argmin returns the first index on ties.
closest_index = distance.cdist(p, vectors).argmin()
word_id = ids[closest_index]
output_word = nlp.vocab[word_id].text
print(output_word)

frog


In [12]:
#This script is very similar to the one above that lists the 50 closest words, in that it finds a word's closest neighbors.
#The only difference is that the query vector is taken from a processed Doc (doc.vector) instead of being read directly from the vector table.
word = 'jewelry'
nwords = 10


def vect2word(idx):
    """Return the string that nlp.vocab.strings stores for the key `idx`."""
    return nlp.vocab.strings[idx]


doc = nlp(word)
vector = doc.vector
#The vector is reshaped into a single-row 2-D query; [0][0] selects the keys found for that row.
neighbor_keys = nlp.vocab.vectors.most_similar(vector.reshape(1, -1), n=nwords)[0][0]
print([vect2word(key) for key in neighbor_keys])

['jewelry', 'jewellery', 'jewellry', 'jewelery', 'jewelries', 'jeweils', 'jeweller', 'jeweler', 'jewellers', 'jeweled']
