# Explore the Gensim implementation
> Mikolov, T., Grave, E., Bojanowski, P., Puhrsch, C., & Joulin, A. (2017). Advances in pre-training distributed word representations. arXiv preprint arXiv:1712.09405.

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
from gensim.models import Word2Vec, KeyedVectors
from gensim.test.utils import datapath

In [4]:
wv = KeyedVectors.load_word2vec_format(datapath("/Users/flint/Data/word2vec/GoogleNews-vectors-negative300.bin"), 
                                       binary=True)

## Similarity

In [None]:
pairs = [
    ('car', 'minivan'),   # a minivan is a kind of car
    ('car', 'bicycle'),   # still a wheeled vehicle
    ('car', 'airplane'),  # ok, no wheels, but still a vehicle
    ('car', 'cereal'),    # ... and so on
    ('car', 'communism'),
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))

In [None]:
for x, y in wv.most_similar('car'):
    print(x, y)

In [None]:
vectors = []
for word in ['car', 'minivan', 'bicycle', 'airplane']:
    vectors.append(wv.get_vector(word))
V = np.array(vectors)

In [None]:
v = V.mean(axis=0)
v = v - wv.get_vector('car')

In [None]:
wv.similar_by_vector(v)

## Analogy

FRANCE : PARIS = ITALY : ?

PARIS - FRANCE + ITALY

In [None]:
wv.most_similar(positive=['King', 'woman'], negative=['man'])

## Not matching

In [None]:
wv.doesnt_match("school professor apple student".split())

## Mean

In [None]:
vp = wv['school']
vr = wv['professor']
vx = wv['student']
m = (vp + vr + vx) / 3

In [None]:
wv.similar_by_vector(m)

In [None]:
pairs = [
    ('lecturer', 'school'),
    ('lecturer', 'professor'),
    ('lecturer', 'student'),
    ('lecturer', 'teacher'),
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))

## Context

In [None]:
wv.most_similar('buy')

In [None]:
wv.similarity('buy', 'money')

## Train a custom model

In [5]:
import gensim.models

In [None]:
sentences = _ # assume there's one document per line, tokens separated by whitespace
model = gensim.models.Word2Vec(sentences=sentences)

## Update an existing model

In [6]:
import pymongo
import nltk
from string import punctuation
import copy

In [7]:
MO = gensim.models.Word2Vec.load('/Users/flint/Playground/MeaningSpread/w2v-global.model')

In [20]:
MO.wv.most_similar('pandemic')

[('influenza', 0.7046176195144653),
 ('h1n1', 0.6910238265991211),
 ('outbreak', 0.6785882711410522),
 ('avian', 0.6601735949516296),
 ('flu', 0.6578388214111328),
 ('outbreaks', 0.5860258936882019),
 ('swine', 0.5697133541107178),
 ('pandemics', 0.5689476728439331),
 ('epidemic', 0.552070677280426),
 ('cholera', 0.5491413474082947)]

In [11]:
db = pymongo.MongoClient()['twitter']['tweets']

In [12]:
tweets = list(db.find())

In [13]:
corpus = dict([(tweet['id'], tweet['text']) for tweet in tweets])

In [14]:
nltk_tokenize = lambda text: [x.lower() for x in nltk.word_tokenize(text) if x not in punctuation]

In [15]:
data = [nltk_tokenize(y) for x, y in corpus.items()]

In [16]:
M1 = copy.deepcopy(MO)

In [17]:
M1.train(data, total_examples=MO.corpus_count, epochs=MO.epochs)

(4694460, 6477630)

In [21]:
M1.wv.most_similar('pandemic')

[('h1n1', 0.6190395355224609),
 ('influenza', 0.5918642282485962),
 ('outbreak', 0.5802384614944458),
 ('avian', 0.5463380813598633),
 ('flu', 0.5393428206443787),
 ('swine', 0.4824046492576599),
 ('epidemic', 0.47371941804885864),
 ('outbreaks', 0.46408894658088684),
 ('pandemics', 0.46319717168807983),
 ('h5n1', 0.45786356925964355)]