# Explore the Gensim implementation
> Mikolov, T., Grave, E., Bojanowski, P., Puhrsch, C., & Joulin, A. (2017). Advances in pre-training distributed word representations. arXiv preprint arXiv:1712.09405.

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE: this also hides deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

In [3]:
from gensim.models import Word2Vec, KeyedVectors
from gensim.test.utils import datapath

In [4]:
# Location of the pre-trained Google News vectors; adjust to your machine.
# `gensim.test.utils.datapath` is intentionally not used here: it resolves
# names inside gensim's bundled test-data folder, and only returned this
# absolute path unchanged as a side effect of how paths are joined.
w2v_path = "/Users/flint/Data/word2vec/GoogleNews-vectors-negative300.bin"

# binary=True because the file is stored in the binary word2vec format.
wv = KeyedVectors.load_word2vec_format(w2v_path, binary=True)

In [5]:
# Look up the embedding vector stored for the word 'peach'.
v = wv.get_vector('peach')

In [7]:
# List the nearest neighbours of 'peach'; each result pairs a word with its
# similarity score, highest first.
wv.most_similar('peach')

[('peaches', 0.7132657766342163),
 ('strawberry', 0.6729940176010132),
 ('apricot', 0.6522811651229858),
 ('melon', 0.6460169553756714),
 ('plums', 0.6374126076698303),
 ('pear', 0.6346296072006226),
 ('berry', 0.6229482889175415),
 ('mandarin_orange', 0.617423951625824),
 ('nectarine', 0.6131626963615417),
 ('strawberries', 0.6110720634460449)]

## Similarity

In [8]:
# Score 'car' against words that drift further and further away from it.
others = [
    'minivan',    # a minivan is a kind of car
    'bicycle',    # still a wheeled vehicle
    'airplane',   # ok, no wheels, but still a vehicle
    'cereal',     # ... and so on
    'communism',
]
pairs = [('car', other) for other in others]
for left, right in pairs:
    score = wv.similarity(left, right)
    print('%r\t%r\t%.2f' % (left, right, score))

'car'	'minivan'	0.69
'car'	'bicycle'	0.54
'car'	'airplane'	0.42
'car'	'cereal'	0.14
'car'	'communism'	0.06


In [9]:
# Print each neighbour of 'car' next to its similarity score, one per line.
for neighbour, score in wv.most_similar('car'):
    print(neighbour, score)

vehicle 0.7821096181869507
cars 0.7423831224441528
SUV 0.7160962224006653
minivan 0.6907036900520325
truck 0.6735789775848389
Car 0.6677608489990234
Ford_Focus 0.667320191860199
Honda_Civic 0.6626849174499512
Jeep 0.651133120059967
pickup_truck 0.6441438794136047


In [32]:
# Stack the embeddings of a handful of food words into one matrix,
# one row per word.
food_words = ['peach', 'apricot', 'strawberry', 'fish', 'meat', 'vegetables', 'milk']
vectors = [wv.get_vector(word) for word in food_words]
V = np.array(vectors)

In [33]:
# Sanity check: one row per word, one column per embedding dimension.
V.shape

(7, 300)

In [34]:
# Average the rows of V (axis=0) into a single vector: the centroid of the
# food words stacked above.
v = V.mean(axis=0)
# Optional experiment, left disabled: subtract the 'car' vector from the centroid.
#v = v - wv.get_vector('car')

In [35]:
# Show the 20 vocabulary words closest to the centroid vector v.
wv.similar_by_vector(v, topn=20)

[('strawberry', 0.7836589813232422),
 ('vegetables', 0.7646373510360718),
 ('strawberries', 0.7575767040252686),
 ('peaches', 0.754625141620636),
 ('berries', 0.741344153881073),
 ('tomato', 0.7389938831329346),
 ('apricots', 0.7348976135253906),
 ('tomatoes', 0.7201975584030151),
 ('cherries', 0.719590961933136),
 ('asparagus', 0.7166165709495544),
 ('fruit', 0.7139172554016113),
 ('peach', 0.7131540179252625),
 ('pears', 0.7125092148780823),
 ('Bing_cherries', 0.7114951014518738),
 ('sweet_potatoes', 0.7094361782073975),
 ('vegetable', 0.7081252932548523),
 ('apricot', 0.7071393132209778),
 ('blueberries', 0.7059823870658875),
 ('blueberry', 0.703717052936554),
 ('fresh_figs', 0.7034894227981567)]

## Analogy

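FRANCE : PARIS = ITALY : ?

PARIS - FRANCE + ITALY

The cell below applies the same vector arithmetic to a different analogy, MAN : KING = WOMAN : ?, i.e. KING - MAN + WOMAN.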

In [37]:
# Analogy query: 'King' and 'woman' contribute positively, 'man' negatively
# (roughly King - man + woman).
# NOTE(review): 'King' is capitalized here and the recorded neighbours are
# mostly proper names; lowercase 'king' may have been intended — confirm.
wv.most_similar(positive=['King', 'woman'], negative=['man'])

[('Queen', 0.5515626668930054),
 ('Oprah_BFF_Gayle', 0.47597548365592957),
 ('Geoffrey_Rush_Exit', 0.46460166573524475),
 ('Princess', 0.4533674716949463),
 ('Yvonne_Stickney', 0.4507041573524475),
 ('L._Bonauto', 0.4422135353088379),
 ('gal_pal_Gayle', 0.4408389925956726),
 ('Alveda_C.', 0.4402790665626526),
 ('Tupou_V.', 0.4373864233493805),
 ('K._Letourneau', 0.4351031482219696)]

## Not matching

In [40]:
# Split the phrase into four words and ask which one does not belong with
# the others.
wv.doesnt_match("school professor car student".split())

'car'

## Mean

In [41]:
# Fetch the three word vectors in one go, then take their arithmetic mean.
vp, vr, vx = (wv[word] for word in ('school', 'professor', 'student'))
m = (vp + vr + vx) / 3

In [42]:
# Find the vocabulary words closest to the mean vector m.
wv.similar_by_vector(m)

[('student', 0.8481254577636719),
 ('professor', 0.7627506852149963),
 ('teacher', 0.6942789554595947),
 ('school', 0.6849855780601501),
 ('students', 0.6768636703491211),
 ('lecturer', 0.6700003147125244),
 ('faculty', 0.645453155040741),
 ('university', 0.6376535892486572),
 ('professors', 0.6346085667610168),
 ('associate_professor', 0.6325882077217102)]

In [43]:
# Compare 'lecturer' with the three averaged words and with 'teacher'.
pairs = [
    ('lecturer', other)
    for other in ('school', 'professor', 'student', 'teacher')
]
for left, right in pairs:
    score = wv.similarity(left, right)
    print('%r\t%r\t%.2f' % (left, right, score))

'lecturer'	'school'	0.18
'lecturer'	'professor'	0.80
'lecturer'	'student'	0.43
'lecturer'	'teacher'	0.48


## Context

In [44]:
# Nearest neighbours of 'buy'; in the recorded output the top hit is 'sell',
# a word used in the same contexts rather than a synonym.
wv.most_similar('buy')

[('sell', 0.8308461308479309),
 ('purchase', 0.7639904618263245),
 ('buying', 0.7209187746047974),
 ('bought', 0.7087081074714661),
 ('buys', 0.6617438197135925),
 ('Buy', 0.5850198864936829),
 ('tobuy', 0.5843992829322815),
 ('purchased', 0.582695484161377),
 ('Buying', 0.578020453453064),
 ('acquire', 0.5730165839195251)]

In [45]:
# Similarity between 'buy' and 'money', for comparison with the neighbours above.
wv.similarity('buy', 'money')

0.31760776

## Train a custom model

In [None]:
import gensim.models

In [None]:
# Template cell (never executed): `_` is presumably a placeholder for your
# own corpus — replace it before running.
# NOTE(review): the exercise below passes a list of token lists (each
# sentence split on whitespace); presumably the same format is expected
# here — confirm.
sentences = _ # assume there's one document per line, tokens separated by whitespace
# Train a Word2Vec model on the corpus with the library's default settings.
model = gensim.models.Word2Vec(sentences=sentences)

## Exercise: train a model from your data

In [46]:
import pymongo

In [47]:
# Select the training documents that carry an occupation field in their box.
query = {'subdata': 'train', 'box.occupation': {'$exists': True}}

# `db` is the 'rawdata' collection of the 'wikibio' database on the default
# MongoDB client.
db = pymongo.MongoClient()['wikibio']['rawdata']

# Materialize the cursor so the documents can be traversed more than once.
data = list(db.find(query))

In [48]:
# Flatten every document's sentences into one list, tokenizing each
# sentence on whitespace.
sentences = [
    sentence.split()
    for document in data
    for sentence in document['sentences']
]

In [49]:
# Number of tokenized sentences collected for training.
len(sentences)

647383

In [50]:
# Train a Word2Vec model on the tokenized sentences.
# NOTE(review): per the parameter names — 300-dimensional vectors, a context
# window of 6, words seen fewer than 5 times dropped, 8 worker threads;
# confirm against the gensim Word2Vec documentation.
model = Word2Vec(sentences=sentences, vector_size=300, window=6, min_count=5, workers=8)

In [53]:
# Quick check of the custom model: nearest neighbours of 'music'.
model.wv.most_similar('music')

[('dance', 0.5563018918037415),
 ('tunes', 0.5537866950035095),
 ('hip-hop', 0.5531975030899048),
 ('rhythms', 0.5528717637062073),
 ('electronica', 0.5446825623512268),
 ('techno', 0.5336247086524963),
 ('sounds', 0.5252417922019958),
 ('orchestral', 0.5198730826377869),
 ('jazz', 0.5157392024993896),
 ('beats', 0.5152225494384766)]

## Clustering

In [66]:
from sklearn.cluster import KMeans
from collections import defaultdict

In [57]:
# Mapping whose keys are the words of the trained model's vocabulary.
keys = model.wv.key_to_index

In [58]:
# Iterating a mapping yields its keys, so this lists every vocabulary word.
vocabulary = list(keys)

In [62]:
# Keep only the first 10,000 vocabulary entries for clustering.
# NOTE(review): presumably these are the most frequent words (gensim's
# default vocabulary ordering) — confirm.
voc = vocabulary[:10000]

In [63]:
# Build the matrix of word vectors, one row per word in voc (same order).
M = np.array([model.wv.get_vector(word) for word in voc])

In [65]:
# Partition the word vectors into 100 clusters with k-means.
# random_state is fixed so that re-running the cell yields the same cluster
# labels; without it the cluster inspected below changes from run to run.
kmeans = KMeans(n_clusters=100, random_state=42)

# y_pred[i] is the cluster label assigned to row i of M.
y_pred = kmeans.fit_predict(M)

In [69]:
# Group the words by the cluster label predicted for their vector.
clusters = defaultdict(list)
for word, label in zip(voc, y_pred):
    clusters[label].append(word)

In [71]:
# Inspect the words that were assigned to the cluster labelled 1.
clusters[1]

['name',
 'family',
 'children',
 'married',
 'son',
 'father',
 'wife',
 'brother',
 'daughter',
 'mother',
 'child',
 'brothers',
 'husband',
 'sister',
 'whom',
 'partner',
 'friend',
 'parents',
 'friends',
 'younger',
 'sons',
 'marriage',
 'daughters',
 'relationship',
 'older',
 'uncle',
 'nickname',
 'cousin',
 'sisters',
 'grandfather',
 'grandson',
 'nephew',
 'widow',
 'siblings',
 'grandmother']