In [1]:
# The R version of this exercise used GoogleNews-vectors-negative300-SLIM.bin,
# a file we had to download by hand and keep locally.
# Here we use pretrained GloVe vectors instead, requested by name through
# gensim's downloader rather than managed as a manual file.

# Fetch and load the GloVe vectors by name.
# NOTE: gensim's downloader docs describe a local cache (~/gensim-data by
# default), so the actual download should only happen on the first call --
# confirm on your own setup.
from gensim import downloader as api

model = api.load("glove-wiki-gigaword-300")

In [3]:
# Basic facts about the loaded embeddings
vocab_size = len(model.key_to_index)  # number of words in the model (400000 here)
print(vocab_size)
print(model.vector_size)  # numbers per word vector (300 here)

# Peek at the vector for "man" -- only its first 10 components
man_vector = model["man"]
print(man_vector[:10])

# The three words closest to "king", printed as (word, score) pairs
king_neighbours = model.most_similar("king", topn=3)
print(king_neighbours)

400000
300
[-0.29784   -0.13255   -0.14505   -0.22752   -0.027429   0.11005
 -0.039245  -0.0089607 -0.18866   -1.1213   ]
[('queen', 0.6336469054222107), ('prince', 0.6196622252464294), ('monarch', 0.5899620652198792)]


In [4]:
# For each of several related terms, list its five nearest neighbours
words = [
    "democracy",
    "freedom",
    "liberty",
]
for word in words:
    neighbours = model.most_similar(word, topn=5)
    print(f"Most similar to {word}:", neighbours)

Most similar to democracy: [('freedom', 0.6147063970565796), ('democratic', 0.547559916973114), ('dictatorship', 0.5458731055259705), ('freedoms', 0.5321065783500671), ('democratization', 0.5320093631744385)]
Most similar to freedom: [('freedoms', 0.6644782423973083), ('liberty', 0.6439749002456665), ('democracy', 0.6147063970565796), ('rights', 0.609053373336792), ('liberties', 0.5701642036437988)]
Most similar to liberty: [('freedom', 0.6439749002456665), ('equality', 0.4726463258266449), ('rights', 0.4273204207420349), ('freedoms', 0.42315995693206787), ('democracy', 0.41124075651168823)]


In [None]:
# Word analogies through vector arithmetic:
#   king - man + woman ≈ ?
# Words in `positive` are the "+" terms, words in `negative` the "-" terms;
# topn=1 keeps only the best candidate.
royal_analogy = model.most_similar(
    positive=["king", "woman"],
    negative=["man"],
    topn=1,
)
print(royal_analogy)
# The model gives the expected answer.

[('queen', 0.6713277101516724)]


In [None]:
# Word analogies through vector arithmetic:
#   china - beijing + tokyo ≈ ?
# Words in `positive` are the "+" terms, words in `negative` the "-" terms;
# topn=1 keeps only the best candidate.
country_analogy = model.most_similar(
    positive=["china", "tokyo"],
    negative=["beijing"],
    topn=1,
)
print(country_analogy)
# The model gives the expected answer.

[('japan', 0.7686553597450256)]


In [None]:
# Word analogies through vector arithmetic:
#   tokyo - japan + germany ≈ ?
# Words in `positive` are the "+" terms, words in `negative` the "-" terms;
# topn=1 keeps only the best candidate.
city_analogy = model.most_similar(
    positive=["tokyo", "germany"],
    negative=["japan"],
    topn=1,
)
print(city_analogy)
# Hmmm... whether this is "right" depends on the relationship you had in mind:
#   - "Tokyo is the capital of Japan, Berlin is the capital of Germany"
#     -> the model's answer is wrong.
#   - "Tokyo is the financial hub of Japan, Frankfurt is the financial hub of Germany"
#     -> the model's answer is correct.
# Takeaway: the same pair of words can stand for more than one relationship,
# and the analogy result reflects whichever one the vectors happen to capture.

[('frankfurt', 0.739791989326477)]


In [12]:
# Compare "beijing" with a closely related word and with an unrelated one.

# Related pair: "beijing" vs "china"
similarity = model.similarity("beijing", "china")
print(f"Cosine similarity between Beijing and China: {similarity:.3f}")

# Unrelated pair: "beijing" vs "chair"
similarity = model.similarity("beijing", "chair")
print(f"Cosine similarity between Beijing and Chair: {similarity:.3f}")

Cosine similarity between Beijing and China: 0.773
Cosine similarity between Beijing and Chair: 0.104


In [5]:
# Membership test: is this token (an emoji here) one of the model's words?
is_known = "🧪" in model
print(is_known)

False
