# Explore the Gensim implementation
> Mikolov, T., Grave, E., Bojanowski, P., Puhrsch, C., & Joulin, A. (2017). Advances in pre-training distributed word representations. arXiv preprint arXiv:1712.09405.

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
import warnings
# Suppress every warning for the rest of the session. This keeps cell output
# short, but it also hides deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

In [3]:
from gensim.models import Word2Vec, KeyedVectors
from gensim.test.utils import datapath

In [4]:
# Location of the pre-trained GoogleNews word2vec binary; adjust to your copy.
# The path goes straight to the loader: gensim's `datapath` helper is meant
# for files inside gensim's own bundled test-data directory and only worked
# here because the argument was already absolute.
GOOGLE_NEWS_PATH = "/Users/flint/Data/word2vec/GoogleNews-vectors-negative300.bin"

# Load the vectors in the original word2vec C binary format.
wv = KeyedVectors.load_word2vec_format(GOOGLE_NEWS_PATH, binary=True)

In [5]:
# Fetch the embedding of a single word (indexing is shorthand for get_vector).
v = wv['peach']

In [8]:
# Nearest neighbours of 'school' as (word, similarity score) pairs.
wv.most_similar(positive=['school'])

[('elementary', 0.7868632078170776),
 ('schools', 0.7411909103393555),
 ('shool', 0.6692329049110413),
 ('elementary_schools', 0.6597153544425964),
 ('kindergarten', 0.6529811024665833),
 ('eighth_grade', 0.6488089561462402),
 ('School', 0.6477997303009033),
 ('teacher', 0.63824063539505),
 ('students', 0.6301522850990295),
 ('classroom', 0.6281620264053345)]

## Similarity

In [9]:
# Probe how similarity to 'car' falls off as the second word drifts away.
pairs = [('car', other) for other in (
    'minivan',    # a minivan is a kind of car
    'bicycle',    # still a wheeled vehicle
    'airplane',   # no wheels, but still a vehicle
    'cereal',     # ... and so on
    'communism',
)]
for first, second in pairs:
    score = wv.similarity(first, second)
    print('%r\t%r\t%.2f' % (first, second, score))

'car'	'minivan'	0.69
'car'	'bicycle'	0.54
'car'	'airplane'	0.42
'car'	'cereal'	0.14
'car'	'communism'	0.06


In [10]:
# Print each neighbour of 'car' as "<word> <score>", one per line.
for entry in wv.most_similar('car'):
    print(*entry)

vehicle 0.7821096181869507
cars 0.7423831224441528
SUV 0.7160962224006653
minivan 0.6907036900520325
truck 0.6735789775848389
Car 0.6677608489990234
Ford_Focus 0.667320191860199
Honda_Civic 0.6626849174499512
Jeep 0.651133120059967
pickup_truck 0.6441438794136047


In [11]:
# Stack the embeddings of a few food words into one matrix, one row per word.
foods = ['peach', 'apricot', 'strawberry', 'fish', 'meat', 'vegetables', 'milk']
vectors = [wv.get_vector(food) for food in foods]
V = np.array(vectors)

In [12]:
# Sanity check: one row per word, one column per embedding dimension.
V.shape

(7, 300)

In [13]:
# Centroid of the word vectors: average the rows of V column by column.
v = np.mean(V, axis=0)
# Optional experiment: uncomment to move the centroid away from 'car'.
# v = v - wv.get_vector('car')

In [14]:
# Look up the 20 vocabulary words closest to the averaged vector `v`.
wv.similar_by_vector(v, topn=20)

[('strawberry', 0.7836589813232422),
 ('vegetables', 0.7646373510360718),
 ('strawberries', 0.7575767040252686),
 ('peaches', 0.754625141620636),
 ('berries', 0.741344153881073),
 ('tomato', 0.7389938831329346),
 ('apricots', 0.7348976135253906),
 ('tomatoes', 0.7201975584030151),
 ('cherries', 0.719590961933136),
 ('asparagus', 0.7166165709495544),
 ('fruit', 0.7139172554016113),
 ('peach', 0.7131540179252625),
 ('pears', 0.7125092148780823),
 ('Bing_cherries', 0.7114951014518738),
 ('sweet_potatoes', 0.7094361782073975),
 ('vegetable', 0.7081252932548523),
 ('apricot', 0.7071393132209778),
 ('blueberries', 0.7059823870658875),
 ('blueberry', 0.703717052936554),
 ('fresh_figs', 0.7034894227981567)]

## Analogy

FRANCE : PARIS = ITALY : ?

The answer is the word whose vector is closest to PARIS - FRANCE + ITALY

In [16]:
# Analogy query: 'Paris' and 'Italy' contribute positively, 'France' negatively.
wv.most_similar(
    positive=['Paris', 'Italy'],
    negative=['France'],
)

[('Milan', 0.7222141623497009),
 ('Rome', 0.702830970287323),
 ('Palermo_Sicily', 0.5967570543289185),
 ('Italian', 0.5911272764205933),
 ('Tuscany', 0.5632812976837158),
 ('Bologna', 0.5608358383178711),
 ('Sicily', 0.5596384406089783),
 ('Bologna_Italy', 0.5470058917999268),
 ('Berna_Milan', 0.5464027523994446),
 ('Genoa', 0.5308900475502014)]

## Not matching

In [None]:
# Ask the model which of these words fits least well with the others.
candidates = "school professor car student".split()
wv.doesnt_match(candidates)

## Mean

In [None]:
# Embeddings of three education-related words ...
vp, vr, vx = (wv[word] for word in ('school', 'professor', 'student'))
# ... and their element-wise arithmetic mean.
m = (vp + vr + vx) / 3

In [None]:
# Which vocabulary words lie closest to the mean vector `m`?
wv.similar_by_vector(m)

In [None]:
# Compare 'lecturer' with each of the education-related words in turn.
pairs = [
    ('lecturer', other)
    for other in ('school', 'professor', 'student', 'teacher')
]
for first, second in pairs:
    score = wv.similarity(first, second)
    print('%r\t%r\t%.2f' % (first, second, score))

## Context

In [None]:
# Nearest neighbours of the verb 'buy'.
wv.most_similar('buy')

In [None]:
# Similarity between 'buy' and a word that is related to it by context
# ('money') rather than by being interchangeable with it.
wv.similarity('buy', 'money')

## Train a custom model

In [None]:
import gensim.models

In [None]:
# `_` is a stand-in for the training corpus, not a real value: replace it with
# an iterable of tokenised sentences before running this cell.
sentences = _ # assume there's one document per line, tokens separated by whitespace
# NOTE(review): for a file in that layout, gensim's LineSentence reader looks
# like the intended source — confirm.
# Train with gensim's default hyper-parameters.
model = gensim.models.Word2Vec(sentences=sentences)

## Exercise: train a model from your data

In [None]:
import pymongo

In [None]:
# Select documents with 'subdata' equal to 'train' that have a
# 'box.occupation' field.
query = {
    'subdata': 'train',
    'box.occupation': {'$exists': True},
}
# Despite its name, `db` is the 'rawdata' collection of the 'wikibio' database,
# reached through a client created with default connection settings.
db = pymongo.MongoClient()['wikibio']['rawdata']
# Materialise the cursor so the documents can be traversed more than once.
data = list(db.find(query))

In [None]:
# Flatten every document's sentences into one list of whitespace-split tokens.
sentences = []
for doc in data:
    sentences.extend(sentence.split() for sentence in doc['sentences'])

In [None]:
# Number of tokenised sentences available for training.
len(sentences)

In [None]:
# Train a word2vec model on the tokenised sentences.
model = Word2Vec(
    sentences=sentences,
    vector_size=300,  # embedding dimensionality
    window=6,         # context window size
    min_count=5,      # minimum word frequency to keep a word
    workers=8,        # number of training threads
)

In [None]:
# Quick qualitative check of the custom model: neighbours of 'music'.
model.wv.most_similar('music')

## Clustering

In [None]:
from sklearn.cluster import KMeans
from collections import defaultdict

In [None]:
# Mapping from the trained model's vocabulary; its keys are the words.
keys = model.wv.key_to_index

In [None]:
# Iterating a mapping yields its keys, so this lists every vocabulary word.
vocabulary = list(keys)

In [None]:
# Keep only the first 10,000 vocabulary entries (presumably to keep the
# clustering fast).
# NOTE(review): presumably these are the most frequent words — confirm the
# ordering of the vocabulary.
voc = vocabulary[:10000]

In [None]:
# Matrix of embeddings: row i is the vector of voc[i].
M = np.array([model.wv.get_vector(token) for token in voc])

In [None]:
# Partition the word vectors into 100 clusters.
kmeans = KMeans(n_clusters=100)
# After fitting, labels_ holds the cluster id assigned to each row of M.
y_pred = kmeans.fit(M).labels_

In [None]:
# Group the words by the cluster label each one received.
clusters = defaultdict(list)
for token, label in zip(voc, y_pred):
    clusters[label].append(token)

In [None]:
# Inspect the words that were assigned cluster label 1.
clusters[1]