In [33]:
import gensim, logging
from nltk.corpus import stopwords

In [8]:
# Toy corpus: nine short document titles, one string per document.
raw_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [9]:
# Echo the raw documents to confirm the corpus was defined as intended.
raw_corpus

['Human machine interface for lab abc computer applications',
 'A survey of user opinion of computer system response time',
 'The EPS user interface management system',
 'System and human system engineering testing of EPS',
 'Relation of user perceived response time to error measurement',
 'The generation of random binary unordered trees',
 'The intersection graph of paths in trees',
 'Graph minors IV Widths of trees and well quasi ordering',
 'Graph minors A survey']

In [13]:
# English stop words from NLTK (the NLTK "stopwords" corpus must be available locally).
stoplist = stopwords.words('english')
# `stoplist` is a plain list, so every `in` test would scan it linearly;
# build a set once for the per-token membership checks below.
stopword_set = set(stoplist)
# Lowercase each document, split it on whitespace and filter out stop words.
texts = [[word for word in document.lower().split() if word not in stopword_set]
         for document in raw_corpus]

In [15]:
# Show the tokenised, stop-word-filtered documents.
texts

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]

In [24]:
from collections import defaultdict

# Tally how many times each token occurs across the whole tokenised corpus.
frequency = defaultdict(int)
for token in (tok for doc in texts for tok in doc):
    frequency[token] += 1

# Keep only tokens seen more than once; document order and the order of
# tokens inside each document are preserved.
processed_corpus = []
for text in texts:
    processed_corpus.append([tok for tok in text if frequency[tok] > 1])
processed_corpus

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [25]:
from gensim import corpora

# Build a gensim Dictionary from the filtered corpus; printing it gives a
# short summary (number of unique tokens and a sample of them).
dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)

Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


In [27]:
# Show the token -> integer id mapping held by the dictionary.
print(dictionary.token2id)

{'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}


In [28]:
# Convert every filtered document to its bag-of-words form: a list of
# (token_id, count) pairs produced by the dictionary.
bow_corpus = list(map(dictionary.doc2bow, processed_corpus))
bow_corpus

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]

In [34]:
# Train a Word2Vec model directly on the filtered corpus. min_count=1 is passed
# so that no token is discarded for being rare (the printed model reports
# vocab=12, the same number of tokens as the dictionary).
# NOTE(review): no seed/worker settings are given, so the similarity scores
# shown further down may not reproduce exactly between runs — confirm.
model = gensim.models.Word2Vec(processed_corpus, min_count=1)

In [37]:
# Print the model summary, then the vocabulary mapping (token -> Vocab object).
# NOTE(review): `wv.vocab` is the gensim 3.x API; it is gone in gensim 4.x
# (use `wv.key_to_index`) — confirm the installed version before upgrading.
print(model)
print(model.wv.vocab)

Word2Vec(vocab=12, size=100, alpha=0.025)
{'human': <gensim.models.keyedvectors.Vocab object at 0x7f6f0314b2e8>, 'interface': <gensim.models.keyedvectors.Vocab object at 0x7f6eff8ba390>, 'computer': <gensim.models.keyedvectors.Vocab object at 0x7f6eff8ba0b8>, 'survey': <gensim.models.keyedvectors.Vocab object at 0x7f6eff8ba0f0>, 'user': <gensim.models.keyedvectors.Vocab object at 0x7f6eff8ba278>, 'system': <gensim.models.keyedvectors.Vocab object at 0x7f6eff8ba048>, 'response': <gensim.models.keyedvectors.Vocab object at 0x7f6eff8ba240>, 'time': <gensim.models.keyedvectors.Vocab object at 0x7f6eff8ba438>, 'eps': <gensim.models.keyedvectors.Vocab object at 0x7f6eff8ba160>, 'trees': <gensim.models.keyedvectors.Vocab object at 0x7f6eff8ba198>, 'graph': <gensim.models.keyedvectors.Vocab object at 0x7f6eff8ba400>, 'minors': <gensim.models.keyedvectors.Vocab object at 0x7f6eff8ba5f8>}


In [66]:
# Build the same model again, making the two steps (vocabulary scan, then
# training) explicit instead of letting the constructor do both.
new_model = gensim.models.Word2Vec(min_count=1, size=100, workers=8)  # an empty model, no training yet
# Multiple workers only speed things up when gensim's Cython routines are available.
# The vocabulary scan reads the corpus once, so a one-pass generator would be acceptable here.
new_model.build_vocab(processed_corpus)
# Training walks the corpus once per epoch, so with epochs > 1 the corpus must be
# a restartable iterable (such as this list), not a one-shot generator.
new_model.train(processed_corpus, total_examples=new_model.corpus_count, epochs=new_model.epochs)
print(new_model)
# Inspect the vocabulary of the model built in THIS cell (the original printed
# `model.wv.vocab`, i.e. the vocabulary of the earlier model).
print(new_model.wv.vocab)

Word2Vec(vocab=12, size=100, alpha=0.025)
{'human': <gensim.models.keyedvectors.Vocab object at 0x7f6f0314b2e8>, 'interface': <gensim.models.keyedvectors.Vocab object at 0x7f6eff8ba390>, 'computer': <gensim.models.keyedvectors.Vocab object at 0x7f6eff8ba0b8>, 'survey': <gensim.models.keyedvectors.Vocab object at 0x7f6eff8ba0f0>, 'user': <gensim.models.keyedvectors.Vocab object at 0x7f6eff8ba278>, 'system': <gensim.models.keyedvectors.Vocab object at 0x7f6eff8ba048>, 'response': <gensim.models.keyedvectors.Vocab object at 0x7f6eff8ba240>, 'time': <gensim.models.keyedvectors.Vocab object at 0x7f6eff8ba438>, 'eps': <gensim.models.keyedvectors.Vocab object at 0x7f6eff8ba160>, 'trees': <gensim.models.keyedvectors.Vocab object at 0x7f6eff8ba198>, 'graph': <gensim.models.keyedvectors.Vocab object at 0x7f6eff8ba400>, 'minors': <gensim.models.keyedvectors.Vocab object at 0x7f6eff8ba5f8>}


In [None]:
# (Stray, never-executed fragment "gensi" removed: it raised NameError on Restart & Run All.)

In [50]:
# Re-display the filtered corpus the Word2Vec models were trained on.
processed_corpus

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [59]:
# Analogy-style query on the toy model: the single token closest to
# 'user' + 'interface' - 'system'.
# NOTE(review): with only nine tiny documents the score is very low and the
# answer is unlikely to be meaningful.
model.wv.most_similar(positive=['user', 'interface'], negative=['system'], topn=1)

[('graph', 0.11549252271652222)]

In [65]:
# Ask the toy model which of the four tokens fits least well with the others.
model.wv.doesnt_match(['human', 'computer', 'interface', 'eps'])

'eps'

In [63]:
# Similarity score between the vectors of 'human' and 'computer' in the toy model.
model.wv.similarity('human', 'computer')

0.18686399

In [64]:
# Look up the learned vector for 'human'. Indexing with a list of tokens
# returns a 2-D array with one row per requested token.
model.wv[['human']]

array([[-4.0587825e-03,  1.1249031e-03, -2.9031246e-03, -1.8892077e-03,
         4.3165316e-03,  1.1051432e-03, -3.1905172e-03,  4.4847623e-04,
        -4.8446646e-03, -4.3259334e-04,  4.9787569e-03,  3.4899055e-03,
        -3.4545094e-03,  9.2180556e-04,  3.9607901e-04, -1.1231720e-03,
        -2.0145506e-03,  4.1078241e-03,  4.2786216e-03,  4.5254658e-04,
        -8.2720554e-04, -1.5779483e-03, -3.6928593e-03, -4.7285808e-03,
         2.4421031e-03,  2.7082621e-03,  1.3680877e-03, -3.8699103e-03,
        -3.8043386e-04, -5.3821155e-04,  2.4720361e-03,  4.7600213e-03,
        -4.8120287e-03, -1.1803948e-03, -3.8195967e-03, -2.8486869e-03,
        -2.0567380e-04,  3.2563363e-03,  4.3201186e-03,  1.0261493e-03,
         7.2132825e-04,  4.1255248e-03, -2.4629647e-03, -3.8154440e-03,
        -3.9499160e-03, -2.6114751e-03, -3.2222935e-03,  1.5364909e-03,
        -5.4176070e-04,  3.5467469e-03, -3.0257443e-03,  4.2660404e-03,
        -2.7679051e-03, -4.0529710e-03, -2.7858064e-04,  1.77315

In [68]:
import gensim.downloader as api

In [70]:
# Load the pre-trained 'glove-twitter-25' vectors through the gensim downloader
# (presumably fetched over the network on first use — confirm).
# NOTE(review): this rebinds `model`, which until now held the Word2Vec model
# trained on the toy corpus; the cells below query the pre-trained vectors.
model = api.load('glove-twitter-25')



In [82]:
# Analogy query on the pre-trained vectors: 'australia' + 'beer' - 'france'.
# `model` is the object returned by api.load, which is queried directly; going
# through `.wv` on it is deprecated and emitted the warning seen in the output.
model.most_similar(positive=['australia', 'beer'], negative=['france'], topn=5)

  """Entry point for launching an IPython kernel.


[('coffee', 0.7858877778053284),
 ('wine', 0.755984902381897),
 ('food', 0.7507694363594055),
 ('drinks', 0.7365005612373352),
 ('ice', 0.7315662503242493)]

In [84]:
# Analogy query on the pre-trained vectors: 'asia' + 'russia' - 'europe'.
# Query the loaded vectors directly; the `.wv` indirection on this object is deprecated.
model.most_similar(positive=['asia', 'russia'], negative=['europe'], topn=5)

  """Entry point for launching an IPython kernel.


[('australia', 0.8525853157043457),
 ('vietnam', 0.8469770550727844),
 ('india', 0.8334696888923645),
 ('thailand', 0.8263822197914124),
 ('china', 0.8161506652832031)]

In [79]:
# Ask the pre-trained vectors which drink fits least well with the others.
# Query the loaded vectors directly; the `.wv` indirection on this object is deprecated.
model.doesnt_match(['beer', 'wine', 'coffee', 'whisky'])

  """Entry point for launching an IPython kernel.


'whisky'

In [80]:
# Similarity score between 'asia' and 'nepal' in the pre-trained vectors.
# Query the loaded vectors directly; the `.wv` indirection on this object is deprecated.
model.similarity('asia', 'nepal')

  """Entry point for launching an IPython kernel.


0.7246317

In [81]:
# Fetch the pre-trained vector for 'nepal'. Indexing with a list of tokens
# keeps the 2-D (one row per token) result of the original call, while
# avoiding both the deprecated `.wv` indirection and the direct dunder call.
model[['nepal']]

  """Entry point for launching an IPython kernel.


array([[-0.43584  , -0.33878  , -0.34588  , -0.98574  , -1.3136   ,
        -0.53908  , -0.11976  , -0.48904  ,  1.1536   ,  0.51499  ,
        -0.14065  , -0.33521  , -1.1437   ,  0.65969  ,  0.62388  ,
        -0.06944  ,  0.10791  , -0.0072232,  0.49554  ,  0.47091  ,
        -0.37258  , -0.30288  , -0.50675  , -1.3789   , -0.34021  ]],
      dtype=float32)