In [1]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn.decomposition import PCA
import gensim.downloader as api
from gensim.models import KeyedVectors

To look at word vectors, we use Gensim. It's a package (not a deep learning package) for word and text similarity modelling, which started with LDA-style topic models and grew into SVD and neural word representations. It's efficient and scalable, and it provides several sets of pretrained word vectors that we can easily load.

In [2]:
# Fetch the pretrained "glove-wiki-gigaword-100" vector set through gensim's
# downloader API (imported above as `api`) and keep it in `model`, which the
# following cells query.
model = api.load("glove-wiki-gigaword-100")
# Show the concrete class of the loaded object.
print(type(model))

<class 'gensim.models.keyedvectors.KeyedVectors'>


In [3]:
# Look up the stored vector for the word 'bread'; as the cell's last expression
# it is displayed as a NumPy array.
model['bread']

array([-0.66146  ,  0.94335  , -0.72214  ,  0.17403  , -0.42524  ,
        0.36303  ,  1.0135   , -0.14802  ,  0.25817  , -0.20326  ,
       -0.64338  ,  0.16632  ,  0.61518  ,  1.397    , -0.094506 ,
        0.0041843, -0.18976  , -0.55421  , -0.39371  , -0.22501  ,
       -0.34643  ,  0.32076  ,  0.34395  , -0.7034   ,  0.23932  ,
        0.69951  , -0.16461  , -0.31819  , -0.34034  , -0.44906  ,
       -0.069667 ,  0.35348  ,  0.17498  , -0.95057  , -0.2209   ,
        1.0647   ,  0.23231  ,  0.32569  ,  0.47662  , -1.1206   ,
        0.28168  , -0.75172  , -0.54654  , -0.66337  ,  0.34804  ,
       -0.69058  , -0.77092  , -0.40167  , -0.069351 , -0.049238 ,
       -0.39351  ,  0.16735  , -0.14512  ,  1.0083   , -1.0608   ,
       -0.87314  , -0.29339  ,  0.68278  ,  0.61634  , -0.088844 ,
        0.88094  ,  0.099809 , -0.27161  , -0.58026  ,  0.50364  ,
       -0.93814  ,  0.67576  , -0.43124  , -0.10517  , -1.2404   ,
       -0.74353  ,  0.28637  ,  0.29012  ,  0.89377  ,  0.6740

In [4]:
# Look up the stored vector for 'croissant', for comparison with other words.
model['croissant']

array([-0.25144  ,  0.52157  , -0.75452  ,  0.28039  , -0.31388  ,
        0.274    ,  1.1971   , -0.10519  ,  0.82544  , -0.33398  ,
       -0.21417  ,  0.22216  ,  0.14982  ,  0.47384  ,  0.41984  ,
        0.69397  , -0.25999  , -0.44414  ,  0.58296  , -0.30851  ,
       -0.076455 ,  0.33468  ,  0.28055  , -0.99012  ,  0.30349  ,
        0.39128  ,  0.031526 , -0.095395 , -0.004745 , -0.81347  ,
        0.27869  , -0.1812   ,  0.14632  , -0.42186  ,  0.13857  ,
        1.139    ,  0.14925  , -0.051459 ,  0.37875  , -0.2613   ,
        0.011081 , -0.28881  , -0.38662  , -0.3135   , -0.1954   ,
        0.19248  , -0.52995  , -0.40674  , -0.25159  ,  0.06272  ,
       -0.32724  ,  0.28374  , -0.2155   , -0.061832 , -0.50134  ,
        0.0093959,  0.30715  ,  0.3873   , -0.74554  , -0.45947  ,
        0.40032  , -0.1378   , -0.26968  , -0.3946   , -0.64876  ,
       -0.47149  , -0.085536 ,  0.092795 , -0.034018 , -0.61906  ,
        0.19123  ,  0.20563  ,  0.29056  , -0.010908 ,  0.1531

In [5]:
# List the words ranked most similar to 'bread' as (word, similarity score) pairs.
model.most_similar('bread')

[('flour', 0.7654520869255066),
 ('baked', 0.7607272267341614),
 ('cake', 0.7605516910552979),
 ('loaf', 0.7457114458084106),
 ('toast', 0.7397798299789429),
 ('cheese', 0.7374635338783264),
 ('potato', 0.7367485165596008),
 ('butter', 0.7279618978500366),
 ('potatoes', 0.7085272669792175),
 ('pasta', 0.7071877717971802)]

In [6]:
# List the words ranked most similar to 'cake' as (word, similarity score) pairs.
model.most_similar('cake')

[('cakes', 0.8047241568565369),
 ('chocolate', 0.7893112301826477),
 ('pie', 0.7703179717063904),
 ('dessert', 0.7649706602096558),
 ('bread', 0.7605515718460083),
 ('frosting', 0.7601189017295837),
 ('cookie', 0.7544960975646973),
 ('cookies', 0.7488499283790588),
 ('baked', 0.7466464638710022),
 ('pudding', 0.7463829517364502)]

In [7]:
# Rank words by similarity to the *negated* 'school' vector, i.e. the words the
# model considers least like 'school'.
# `negative` is documented as a list of keys, so pass a one-element list: the
# bare-string shorthand is only reliably supported for `positive`, and a bare
# string here risks being iterated character by character on some gensim
# versions (NOTE(review): confirm against the installed release).
model.most_similar(negative=['school'])

[('ileus', 0.6114288568496704),
 ('suparman', 0.6036075949668884),
 ('agcaoili', 0.5940601825714111),
 ('ionita', 0.5873614549636841),
 ('nassef', 0.5831325054168701),
 ('bashirov', 0.5810243487358093),
 ('rozana', 0.5782657265663147),
 ('iddrisu', 0.5752978920936584),
 ('zety', 0.5747663378715515),
 ('warraich', 0.5742456912994385)]

In [8]:
# Analogy query: add the 'women' and 'king' vectors, subtract 'man', and report
# the top-ranked word together with its similarity score.
# NOTE(review): 'women' (plural) is paired with 'man' (singular); 'woman' may
# have been intended -- changing it would change the printed score.
result = model.most_similar(positive=['women', 'king'], negative=['man'])
best_word, best_score = result[0]
print("{}: {:.4f}".format(best_word, best_score))

queen: 0.5805


In [9]:
def analogy(x1, x2, y1, vectors=None):
    """Complete the analogy "x1 is to x2 as y1 is to ?".

    The query adds the vectors for ``y1`` and ``x2`` and subtracts the vector
    for ``x1`` (roughly ``x2 - x1 + y1``), then returns the word ranked first
    by ``most_similar``.

    Parameters
    ----------
    x1, x2 : str
        The known pair, e.g. ``'man'`` and ``'woman'``.
    y1 : str
        The word whose counterpart is wanted, e.g. ``'king'``.
    vectors : object, optional
        Any object exposing ``most_similar(positive=..., negative=...)``.
        Defaults to the notebook-level ``model`` so existing three-argument
        calls keep working.

    Returns
    -------
    str
        The first element of the top-ranked ``(word, score)`` pair.

    Raises
    ------
    IndexError
        If ``most_similar`` returns an empty result. Any error raised by
        ``most_similar`` itself propagates unchanged.
    """
    if vectors is None:
        # Fall back to the embeddings loaded earlier in the notebook.
        vectors = model
    result = vectors.most_similar(positive=[y1, x2], negative=[x1])
    # Each entry is a (word, score) pair; only the best word is returned.
    return result[0][0]

# Example
# Starting from the 'king' vector, subtract 'man' and add 'women' to arrive near 'queen':
# (king - man) + women ≈ queen

In [10]:
# man : women :: king : ?
# NOTE(review): 'women' is plural while 'man' is singular; 'woman' may have
# been intended.
analogy('man','women','king')

'queen'

In [11]:
# japan : japanese :: australia : ?
analogy('japan', 'japanese', 'australia')

'australian'

In [13]:
# australia : beer :: france : ?
analogy('australia', 'beer', 'france')

'champagne'

In [14]:
# tall : tallest :: long : ?
analogy('tall', 'tallest', 'long')

'longest'

In [12]:
# Ask the model which word in the group fits least well with the others.
candidates = "breakfast cereal dinner lunch".split()
odd_one_out = model.doesnt_match(candidates)
print(odd_one_out)

cereal
