# Explore the Gensim implementation

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
import warnings
# Silence every warning from here on so cell output stays compact.
# NOTE: this also hides deprecation notices and training warnings.
warnings.filterwarnings("ignore")

In [3]:
from gensim.models import Word2Vec, KeyedVectors
from gensim.test.utils import datapath

In [4]:
# Load the pretrained GoogleNews embeddings (binary word2vec format) from disk.
# The absolute path is passed directly: `datapath` resolves file names inside
# gensim's bundled test-data directory, so wrapping an absolute path in it only
# worked because the underlying path join discards the prefix.
wv = KeyedVectors.load_word2vec_format(
    "/Users/flint/Data/word2vec/GoogleNews-vectors-negative300.bin",
    binary=True,
)

## Similarity

In [None]:
# Compare 'car' with words that drift progressively further away from it.
others = (
    'minivan',    # a minivan is a kind of car
    'bicycle',    # still a wheeled vehicle
    'airplane',   # ok, no wheels, but still a vehicle
    'cereal',     # ... and so on
    'communism',
)
pairs = [('car', other) for other in others]
for left, right in pairs:
    score = wv.similarity(left, right)
    print(f'{left!r}\t{right!r}\t{score:.2f}')

In [None]:
# Print each nearest neighbour of 'car' next to its similarity score.
for neighbour, score in wv.most_similar('car'):
    print(neighbour, score)

## Analogy

FRANCE : PARIS = ITALY : ?

The answer is the word whose vector lies closest to PARIS - FRANCE + ITALY.

In [None]:
# Analogy by vector arithmetic: king - man + woman.
# The vocabulary keys are case-sensitive, so the common noun has to be spelled
# 'king'; the capitalised 'King' is a different token.
wv.most_similar(positive=['king', 'woman'], negative=['man'])

## Not matching

In [None]:
# Ask which word fits least with the rest of the group.
wv.doesnt_match(["school", "professor", "apple", "student"])

## Mean

In [None]:
# Look up the three word vectors and average them into a single vector.
vp, vr, vx = (wv[word] for word in ('school', 'professor', 'student'))
m = (vp + vr + vx) / 3

In [None]:
# Find the vocabulary words whose vectors are closest to the averaged vector m.
wv.similar_by_vector(m)

In [None]:
# Score 'lecturer' against 'school', 'professor' and 'student' in turn.
pairs = [('lecturer', other) for other in ('school', 'professor', 'student')]
for left, right in pairs:
    score = wv.similarity(left, right)
    print(f'{left!r}\t{right!r}\t{score:.2f}')

## Context

In [None]:
# Nearest neighbours of 'buy' in the embedding space.
wv.most_similar('buy')

In [None]:
# Similarity score between 'buy' and 'money'.
wv.similarity('buy', 'money')

## Train a custom model

In [None]:
import gensim.models

In [None]:
# NOTE(review): `_` is only a placeholder here. Substitute the real corpus
# (an iterable of token lists) before running this cell. For a file in the
# layout described below, gensim.models.word2vec.LineSentence(path) is the
# usual reader; confirm against the actual data source.
sentences = _ # assume there's one document per line, tokens separated by whitespace
# Train a Word2Vec model on `sentences`, leaving every hyperparameter at its default.
model = gensim.models.Word2Vec(sentences=sentences)

## Exercise: train a model from WordNet

In [None]:
from nltk.corpus import wordnet as wn
import nltk

In [None]:
# Seed words; each one is expanded into pseudo-sentences further down.
words = ['cat', 'dog', 'bird', 'fish']

In [None]:
def h(s):
    """Return the direct hypernyms of synset *s*."""
    return s.hypernyms()


def p(s):
    """Return the direct hyponyms of synset *s*."""
    return s.hyponyms()


def get_pseudo_sentences(word, context=3):
    """Build two-token pseudo-sentences from the WordNet neighbourhood of *word*.

    For every synset of *word* the result contains, in this order:

    1. one ``[lemma name, synset name]`` pair per lemma of the synset;
    2. for each synset reached through the hypernym closure, a
       ``[synset name, hypernym name]`` pair followed by the
       ``[lemma name, hypernym name]`` pairs of that hypernym;
    3. for each synset reached through the hyponym closure, a
       ``[hyponym name, synset name]`` pair followed by the
       ``[lemma name, hyponym name]`` pairs of that hyponym.

    Args:
        word: The word whose synsets are expanded.
        context: Index of the last closure entry to visit in each direction.

    Returns:
        A list of two-item lists of strings.
    """
    def lemma_pairs(synset):
        # Pair every lemma of the synset with the synset's own name.
        return [[lemma.name(), synset.name()] for lemma in synset.lemmas()]

    sentences = []
    for synset in wn.synsets(word):
        sentences.extend(lemma_pairs(synset))
        # Hypernyms put the starting synset first; hyponyms put it second.
        for relation, synset_first in ((h, True), (p, False)):
            for index, related in enumerate(synset.closure(relation)):
                if synset_first:
                    link = [synset.name(), related.name()]
                else:
                    link = [related.name(), synset.name()]
                sentences.append(link)
                sentences.extend(lemma_pairs(related))
                # NOTE(review): the check runs after the entry is processed, so
                # up to context + 1 related synsets are visited per direction.
                # Confirm whether exactly `context` was intended.
                if index == context:
                    break
    return sentences

In [None]:
# Concatenate the pseudo-sentences of all seed words into one flat corpus.
sentences = [
    sentence
    for seed in words
    for sentence in get_pseudo_sentences(seed)
]

In [None]:
# Train on the pseudo-sentences; min_count=1 keeps tokens that occur only once.
model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [None]:
# Nearest neighbours of 'fish' according to the freshly trained model.
model.wv.most_similar('fish')

## Update an existing model

In [None]:
import pymongo
import nltk
from string import punctuation
import copy

In [None]:
# Load a previously saved Word2Vec model from disk.
MO = Word2Vec.load('/Users/flint/Playground/MeaningSpread/w2v-global.model')

In [None]:
# Neighbours of 'fear' in the loaded model, before any further training.
MO.wv.most_similar('fear')

In [None]:
# Connect with the default client settings and pick the 'tweets' collection
# of the 'twitter' database.
mongo_client = pymongo.MongoClient()
db = mongo_client['twitter']['tweets']

In [None]:
# Fetch every document of the collection (find() without a filter) into a list.
tweets = list(db.find())

In [None]:
# Map each tweet id to its text; a repeated id keeps the last text seen.
corpus = {tweet['id']: tweet['text'] for tweet in tweets}

In [None]:
def nltk_tokenize(text):
    """Tokenize *text* with NLTK, lowercase it and drop punctuation tokens.

    A token is dropped when every one of its characters is in
    ``string.punctuation``. The check is per character because
    ``punctuation`` is a ``str``: testing ``token not in punctuation`` is a
    substring test, which lets multi-character punctuation tokens such as
    ``...``, ``--`` or the quote tokens produced by the tokenizer slip through.

    Args:
        text: The raw text to tokenize.

    Returns:
        A list of lowercased tokens with punctuation-only tokens removed.
    """
    return [
        token.lower()
        for token in nltk.word_tokenize(text)
        if not all(char in punctuation for char in token)
    ]

In [None]:
# Tokenize every tweet text; the ids (dict keys) are not needed here.
data = [nltk_tokenize(text) for text in corpus.values()]

In [None]:
# Work on a deep copy so that training M1 leaves MO untouched.
M1 = copy.deepcopy(MO)

In [None]:
# Continue training the copied model on the tweet corpus.
# 1) Extend the vocabulary first: without update=True, words that occur only
#    in the new data have no vector and are skipped during training.
M1.build_vocab(data, update=True)
# 2) total_examples must describe the corpus actually passed in, not the corpus
#    the original model was trained on; otherwise the learning-rate schedule
#    is computed against the wrong corpus size.
M1.train(data, total_examples=len(data), epochs=M1.epochs)

In [None]:
# Neighbours of 'pandemic' in the model that was trained further on the tweets.
M1.wv.most_similar('pandemic')