# Lecture 6: Gensim

In [2]:
import os

import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.decomposition import PCA

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

Stanford's offering is the GloVe word vectors. 
Gensim doesn't give them first-class support, but it lets you convert a file of GloVe vectors into word2vec format. 
You can download the GloVe vectors from [the GloVe page](https://nlp.stanford.edu/projects/glove/). They're inside [this zip file](https://nlp.stanford.edu/data/glove.6B.zip).

- We use the 100d vectors below as a compromise between speed and small size on the one hand and quality on the other. 
- If you try out the 50d vectors, they basically work for similarity but clearly aren't as good for analogy problems. 
- If you load the 300d vectors, they're even better than the 100d vectors.

In [None]:
# Directory that holds the unzipped GloVe text files (trailing slash included).
path = "/Users/yeabinmoon/Documents/glove/"

In [3]:
# Directory containing the unzipped GloVe text files. Set the GLOVE_DIR
# environment variable to point at your own copy; the fallback is the
# original author's local directory.
path = os.environ.get('GLOVE_DIR', '/Users/yeabinmoon/Documents/glove/')

# Build the input path directly rather than through gensim's `datapath`,
# which is meant for gensim's bundled test data and would re-root a relative
# path inside that directory.
glove_file = os.path.join(path, 'glove.6B.100d.txt')

# Convert the GloVe text file to word2vec text format in a temporary file.
# The call returns (vocab_size, vector_dim), which is shown as the cell output.
# NOTE(review): Gensim 4.x marks glove2word2vec as deprecated in favour of
# KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True);
# it is kept here because the converted file is what gets loaded afterwards.
word2vec_glove_file = get_tmpfile("glove.6B.100d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)

  glove2word2vec(glove_file, word2vec_glove_file)


(400000, 100)

In [4]:
# Load the converted vectors; the file is word2vec *text* format, so
# binary=False (the default) is spelled out for clarity.
model = KeyedVectors.load_word2vec_format(
    word2vec_glove_file,
    binary=False,
)

In [5]:
# Nearest neighbours of "obama" in the embedding space (default top-N).
model.most_similar(positive=['obama'])

[('barack', 0.937216579914093),
 ('bush', 0.9272855520248413),
 ('clinton', 0.896000325679779),
 ('mccain', 0.8875634074211121),
 ('gore', 0.8000320196151733),
 ('hillary', 0.7933663129806519),
 ('dole', 0.7851963639259338),
 ('rodham', 0.7518897652626038),
 ('romney', 0.7488929629325867),
 ('kerry', 0.7472624182701111)]

In [6]:
# Nearest neighbours of "banana" in the embedding space (default top-N).
model.most_similar(positive=['banana'])

[('coconut', 0.7097253799438477),
 ('mango', 0.7054824829101562),
 ('bananas', 0.6887733936309814),
 ('potato', 0.6629636883735657),
 ('pineapple', 0.6534532308578491),
 ('fruit', 0.6519854068756104),
 ('peanut', 0.6420576572418213),
 ('pecan', 0.6349173188209534),
 ('cashew', 0.6294420957565308),
 ('papaya', 0.6246591210365295)]

In [7]:
# Words whose vectors point most nearly opposite to "banana".
# The word is wrapped in a list because a bare string passed as `negative`
# is not treated as a single key by every Gensim release.
model.most_similar(negative=['banana'])

[('shunichi', 0.49618104100227356),
 ('ieronymos', 0.4736502170562744),
 ('pengrowth', 0.4668096601963043),
 ('höss', 0.4636845886707306),
 ('damaskinos', 0.46178486943244934),
 ('yadin', 0.4617375135421753),
 ('hundertwasser', 0.458895742893219),
 ('ncpa', 0.4577339291572571),
 ('maccormac', 0.45661094784736633),
 ('rothfeld', 0.4523947536945343)]

In [8]:
# Classic analogy: king - man + woman -> ?
result = model.most_similar(
    negative=['man'],
    positive=['woman', 'king'],
)
# Show only the top hit as "word: score" with four decimals.
print("{}: {:.4f}".format(result[0][0], result[0][1]))

queen: 0.7699


In [9]:
def analogy(x1, x2, y1):
    """Complete the analogy "x1 is to x2 as y1 is to ?".

    Queries the notebook-level ``model`` with ``x2`` and ``y1`` as positive
    terms and ``x1`` as the negative term, then returns the word of the
    top-ranked match (its similarity score is discarded).
    """
    ranked = model.most_similar(negative=[x1], positive=[y1, x2])
    top_hit = ranked[0]
    return top_hit[0]

In [10]:
# japan : japanese :: australia : ?
analogy(x1='japan', x2='japanese', y1='australia')

'australian'

In [11]:
# australia : beer :: france : ?
analogy(x1='australia', x2='beer', y1='france')

'champagne'

In [12]:
# obama : clinton :: reagan : ?
analogy(x1='obama', x2='clinton', y1='reagan')

'nixon'

In [13]:
# tall : tallest :: long : ?
analogy(x1='tall', x2='tallest', y1='long')

'longest'

In [14]:
# good : fantastic :: bad : ?
analogy(x1='good', x2='fantastic', y1='bad')

'terrible'

In [15]:
# Ask the model which of the four whitespace-separated words is the odd one out
# and print it.
print(model.doesnt_match("breakfast cereal dinner lunch".split()))

cereal
