In [None]:
import gensim.downloader as api
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt




In [None]:
# 1. Load the pretrained word vectors.
# The archive is downloaded on first use and served from the local cache
# afterwards; "glove-wiki-gigaword-100" is roughly 128 MB.
MODEL_NAME = "glove-wiki-gigaword-100"
model = api.load(MODEL_NAME)

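# Pretrained Models in gensim-data

Any of the model names below can be passed to `api.load(...)`. Each entry is listed as `name → approximate download size, vector dimensionality`.

## Word2Vec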

word2vec-google-news-300 → ~1.6 GB, 300d

word2vec-ruscorpora-300 → ~198 MB, 300d (Russian)

## GloVe (Wikipedia + Gigaword)

glove-wiki-gigaword-50 → ~66 MB, 50d

glove-wiki-gigaword-100 → ~128 MB, 100d

glove-wiki-gigaword-200 → ~252 MB, 200d

glove-wiki-gigaword-300 → ~376 MB, 300d

## GloVe (Twitter)

glove-twitter-25 → ~104 MB, 25d

glove-twitter-50 → ~199 MB, 50d

glove-twitter-100 → ~387 MB, 100d

glove-twitter-200 → ~758 MB, 200d

## FastText

fasttext-wiki-news-subwords-300 → ~1.0 GB, 300d

## ConceptNet

conceptnet-numberbatch-17-06-300 → ~1.4 GB, 300d

In [None]:
# Words whose vectors are most similar to 'obama'.
model.most_similar(positive=['obama'])

In [None]:
# Words whose vectors are most similar to 'merkel'.
model.most_similar(positive=['merkel'])

In [None]:
# Words whose vectors are most similar to 'banana'.
model.most_similar(positive=['banana'])

In [None]:
# Words whose vectors are most similar to 'turkey'.
model.most_similar(positive=['turkey'])

In [None]:
# Words whose vectors are most similar to 'germany'.
model.most_similar(positive=['germany'])

In [None]:
# Words *least* like 'banana': the word is supplied only as a negative example.
# It is wrapped in a list because older gensim 4.x releases iterate over a bare
# `negative` string character by character ('b', 'a', 'n', ...), which silently
# queries the single letters instead of the word.
model.most_similar(negative=['banana'])

In [None]:
# Classic analogy query: 'woman' and 'king' pull the result towards them,
# 'man' pushes it away. Only the best-ranked (word, score) pair is printed.
result = model.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
)
top_word, top_score = result[0]
print("{}: {:.4f}".format(top_word, top_score))

In [None]:
def analogy(x1, x2, y1, vectors=None):
    """Complete the analogy "x1 is to x2 as y1 is to ?".

    The query uses ``y1`` and ``x2`` as positive terms and ``x1`` as the
    negative term, and returns the best-ranked word.

    Args:
        x1: Left word of the example pair (e.g. 'japan').
        x2: Right word of the example pair (e.g. 'japanese').
        y1: Word whose counterpart is wanted (e.g. 'australia').
        vectors: Object providing ``most_similar(positive=..., negative=...)``.
            When omitted, the notebook-level ``model`` is used, so existing
            three-argument calls behave exactly as before.

    Returns:
        The first element of the top-ranked entry returned by
        ``most_similar`` (the word itself, without its score).
    """
    if vectors is None:
        # Looked up at call time so the function can be defined before (or
        # after) the cell that loads `model`.
        vectors = model
    ranked = vectors.most_similar(positive=[y1, x2], negative=[x1])
    return ranked[0][0]

In [None]:
# japan : japanese :: australia : ?
analogy(x1='japan', x2='japanese', y1='australia')

In [None]:
# japan : japanese :: turkey : ?
analogy(x1='japan', x2='japanese', y1='turkey')

In [None]:
# japan : japanese :: germany : ?
analogy(x1='japan', x2='japanese', y1='germany')

In [None]:
# australia : beer :: france : ?
analogy(x1='australia', x2='beer', y1='france')

In [None]:
# australia : beer :: turkey : ?
analogy(x1='australia', x2='beer', y1='turkey')

In [None]:
# australia : beer :: germany : ?
analogy(x1='australia', x2='beer', y1='germany')

In [None]:
# obama : clinton :: reagan : ?
analogy(x1='obama', x2='clinton', y1='reagan')

In [None]:
# tall : tallest :: long : ?
analogy(x1='tall', x2='tallest', y1='long')

In [None]:
# good : fantastic :: bad : ?
analogy(x1='good', x2='fantastic', y1='bad')

In [None]:
# Ask the model which of the four words does not belong with the others.
meal_words = "breakfast cereal dinner lunch".split()
odd_one_out = model.doesnt_match(meal_words)
print(odd_one_out)

In [None]:
import random

In [None]:
# Draw 10 distinct words at random from the model's vocabulary.
# `index_to_key` is already a list of every key, so it can be sampled directly;
# sorting all keys first only to obtain a sequence is unnecessary work.
# The draw is unseeded and therefore differs between runs; call
# random.seed(...) beforehand if a repeatable selection is needed.
random_words = random.sample(model.index_to_key, 10)


In [None]:
# Display the 10 sampled words as the cell output.
random_words

In [None]:
# Pick a single word at random from the model's vocabulary.
# `index_to_key` is already a list of every key, so there is no need to sort
# the whole vocabulary just to index into it. The pick is unseeded and
# therefore differs between runs.
random_word = random.choice(model.index_to_key)

In [None]:
# Display the randomly chosen word as the cell output.
random_word

In [None]:
# 2. Choose the words to visualize, grouped by theme.
words = [
    # drinks
    "coffee", "tea", "beer", "wine", "brandy", "rum", "champagne", "water",
    # dishes
    "spaghetti", "borscht", "hamburger", "pizza", "falafel", "sushi",
    "meatballs",
    # animals
    "animal", "ape", "cat", "dog", "frog", "horse", "kangaroo", "koala",
    "lizard", "monkey", "parrot", "toad", "wolf", "wombat",
    # countries
    "france", "germany", "hungary", "luxembourg", "australia", "fiji",
    "china",
    # coursework
    "homework", "assignment", "problem", "exam", "test", "class",
    # educational institutions
    "school", "college", "university", "institute",
    # royalty / gender
    "king", "queen", "man", "woman",
    # fruit
    "apple", "orange", "fruit",
]


In [None]:
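# Optional: uncomment the next line to visualize the randomly sampled words
# instead of the hand-picked list above.
# words = random_words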

In [None]:
# Look up the embedding vector for each selected word, keeping the same order
# as `words`, then print the shape of one vector as a quick sanity check.
X = [model[w] for w in words]
first_vector = X[0]
print(first_vector.shape)

In [None]:


# 3. Reduce dimensions with PCA: project every word vector onto two principal
# components so the words can be drawn in a plane.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# 4. Plot with the explicit Figure/Axes interface instead of pyplot's implicit
# "current axes" state.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_pca[:, 0], X_pca[:, 1])

# Label every point. The label offset is given in points (screen units) so the
# gap between marker and text does not depend on the scale of the PCA axes.
for word, (pc1, pc2) in zip(words, X_pca):
    ax.annotate(word, (pc1, pc2), xytext=(3, 3), textcoords="offset points")

ax.set_title("Word Embeddings visualized with PCA")
# Axis labels so the figure can be read on its own.
ax.set_xlabel("Principal component 1")
ax.set_ylabel("Principal component 2")
plt.show()
