# Explore the Gensim implementation
> Mikolov, T., Grave, E., Bojanowski, P., Puhrsch, C., & Joulin, A. (2017). Advances in pre-training distributed word representations. arXiv preprint arXiv:1712.09405.

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
import warnings
# Ignore every warning category for the rest of the session
# (presumably to keep the cell outputs free of library warnings).
warnings.filterwarnings("ignore")

In [3]:
from gensim.models import Word2Vec, KeyedVectors
from gensim.test.utils import datapath

In [4]:
# Location of the pretrained GoogleNews vectors (binary word2vec format).
# Adjust this path to wherever the file lives on your machine.
# `datapath` is deliberately not used: it resolves names relative to gensim's
# bundled test data, and this external file is not part of it.
GOOGLE_NEWS_VECTORS = "/Users/flint/Data/word2vec/GoogleNews-vectors-negative300.bin"

# Load the vectors as a read-only KeyedVectors instance.
wv = KeyedVectors.load_word2vec_format(GOOGLE_NEWS_VECTORS, binary=True)

## Similarity

In [5]:
# Word pairs listed from closely related to unrelated.
pairs = [
    ('car', 'minivan'),   # a minivan is a kind of car
    ('car', 'bicycle'),   # still a wheeled vehicle
    ('car', 'airplane'),  # ok, no wheels, but still a vehicle
    ('car', 'cereal'),    # ... and so on
    ('car', 'communism'),
]
# Print each pair with its similarity score, rounded to two decimals.
for left, right in pairs:
    score = wv.similarity(left, right)
    print('%r\t%r\t%.2f' % (left, right, score))

'car'	'minivan'	0.69
'car'	'bicycle'	0.54
'car'	'airplane'	0.42
'car'	'cereal'	0.14
'car'	'communism'	0.06


In [7]:
# Nearest neighbours of 'beautiful', printed as "<word> <similarity>".
neighbours = wv.most_similar('beautiful')
for term, score in neighbours:
    print(term, score)

gorgeous 0.8353005051612854
lovely 0.8106936812400818
stunningly_beautiful 0.7329413294792175
breathtakingly_beautiful 0.7231340408325195
wonderful 0.6854086518287659
fabulous 0.6700063943862915
loveliest 0.6612576246261597
prettiest 0.6595001816749573
beatiful 0.6593326330184937
magnificent 0.6591402888298035


In [8]:
# Fetch the embedding of each vehicle word and stack them row-wise into V.
vectors = [wv.get_vector(term) for term in ['car', 'minivan', 'bicycle', 'airplane']]
V = np.array(vectors)

In [10]:
# Centroid of the stacked word vectors (mean over the rows of V).
v = np.mean(V, axis=0)
# Optional experiment: subtract the 'car' vector from the centroid.
#v = v - wv.get_vector('car')

In [11]:
# List the vocabulary entries closest to the centroid vector v.
wv.similar_by_vector(v)

[('car', 0.852258026599884),
 ('minivan', 0.8156529664993286),
 ('vehicle', 0.7754934430122375),
 ('SUV', 0.7660486698150635),
 ('bicycle', 0.7264742255210876),
 ('pickup_truck', 0.723552942276001),
 ('scooter', 0.7198848724365234),
 ('truck', 0.7041884064674377),
 ('Jeep', 0.7000145316123962),
 ('motorcycle', 0.6802986264228821)]

## Analogy

FRANCE : PARIS = ITALY : ?

PARIS - FRANCE + ITALY

In [12]:
# FRANCE : PARIS = ITALY : ?  ->  query the neighbourhood of Paris - France + Italy.
wv.most_similar(positive=['Paris', 'Italy'], negative=['France'])

[('Milan', 0.7222141623497009),
 ('Rome', 0.702830970287323),
 ('Palermo_Sicily', 0.5967570543289185),
 ('Italian', 0.5911272764205933),
 ('Tuscany', 0.5632812976837158),
 ('Bologna', 0.5608358383178711),
 ('Sicily', 0.5596384406089783),
 ('Bologna_Italy', 0.5470058917999268),
 ('Berna_Milan', 0.5464027523994446),
 ('Genoa', 0.5308900475502014)]

In [13]:
# Same analogy pattern: King - Man + Woman.
# NOTE(review): the tokens are capitalised here; if the vocabulary is
# case-sensitive, the lowercase 'king', 'woman', 'man' may be what was
# intended -- confirm.
wv.most_similar(positive=['King', 'Woman'], negative=['Man'])

[('Queen', 0.4929387867450714),
 ('Tupou_V.', 0.45174285769462585),
 ('Oprah_BFF_Gayle', 0.4422132968902588),
 ('Jackson', 0.440250426530838),
 ('NECN_Alison', 0.4331282675266266),
 ('Whitfield', 0.42834725975990295),
 ('Ida_Vandross', 0.42084527015686035),
 ('prosecutor_Dan_Satterberg', 0.420758992433548),
 ('martin_Luther_King', 0.42059651017189026),
 ('Coretta_King', 0.4202733635902405)]

## Not matching

In [None]:
# Ask the model which of the four words does not belong with the others.
wv.doesnt_match("school professor apple student".split())

## Mean

In [None]:
# Look up the three word vectors, then take their arithmetic mean.
vp, vr, vx = (wv[term] for term in ('school', 'professor', 'student'))
m = (vp + vr + vx) / 3

In [None]:
# List the vocabulary entries closest to the mean vector m.
wv.similar_by_vector(m)

In [None]:
# Compare 'lecturer' against each of the education-related words.
pairs = [
    ('lecturer', 'school'),
    ('lecturer', 'professor'),
    ('lecturer', 'student'),
    ('lecturer', 'teacher'),
]
# Print each pair with its similarity score, rounded to two decimals.
for left, right in pairs:
    score = wv.similarity(left, right)
    print('%r\t%r\t%.2f' % (left, right, score))

## Context

In [None]:
# Nearest neighbours of 'buy' in the embedding space.
wv.most_similar('buy')

In [None]:
# Similarity score between 'buy' and 'money'.
wv.similarity('buy', 'money')

## Train a custom model

In [None]:
import gensim.models

### Generate a global model for Yelp reviews

In [None]:
import json
from nltk.tokenize import word_tokenize
from string import punctuation

In [None]:
# Path of the Yelp review sample, relative to the notebook's working directory.
review_data_file = '../lexicon/data/yelp_sample.json'
# Read the file explicitly as UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open(review_data_file, 'r', encoding='utf-8') as infile:
    R = json.load(infile)

In [None]:
# Inspect the first loaded review record.
R[0]

In [None]:
# Tokenise every review into a list of lowercase tokens.
# A token is dropped when *all* of its characters are punctuation. Testing
# `tok in punctuation` would be a substring check against the punctuation
# string, which lets multi-character tokens such as '...', '``' or "''"
# through.
data = [
    [
        tok.lower()
        for tok in word_tokenize(doc['content'])
        if not all(ch in punctuation for ch in tok)
    ]
    for doc in R
]

In [None]:
# First six tokens of the first tokenised review.
data[0][:6]

In [None]:
# Train the global Word2Vec model on the tokenised reviews:
# 100-dimensional vectors, a context window of 6 and 25 training epochs.
R0 = gensim.models.Word2Vec(sentences=data, epochs=25, window=6, vector_size=100)

In [None]:
# Nearest neighbours of 'car' according to the review-trained model.
R0.wv.most_similar('car')

### Application example: use graph community detection to find aspects

In [None]:
import networkx as nx

In [None]:
# Minimum similarity a neighbour needs to be linked in the graph.
min_sim = 0.7
G = nx.Graph()
# For every vocabulary word, connect it to those of its nearest neighbours
# whose similarity reaches min_sim; the score is stored on the edge as 'sim'.
for term in tqdm(R0.wv.index_to_key):
    strong_links = [
        (neighbour, score)
        for neighbour, score in R0.wv.most_similar(term)
        if score >= min_sim
    ]
    for neighbour, score in strong_links:
        G.add_edge(term, neighbour, sim=score)

In [None]:
# Print one edge (both endpoints and its attribute dict) as a sanity check.
for source, target, attributes in G.edges(data=True):
    print(source, target, attributes)
    break

### Visualize

In [None]:
from pyvis.network import Network

In [None]:
# Only the first 100 nodes of G are rendered.
preview_nodes = list(G.nodes)[:100]
preview_graph = G.subgraph(preview_nodes)

# Build the pyvis network from the sub-graph and write it to an HTML page.
nt = Network('1500px', '1500px')
nt.from_nx(preview_graph)
nt.show('word2vec.html')

### Community detection

In [None]:
from networkx.algorithms.community import greedy_modularity_communities

In [None]:
# Partition the similarity graph into communities, then print up to ten
# members of each one.
communities = greedy_modularity_communities(G)
for members in communities:
    sample = list(members)[:10]
    print(sample)

## Update an existing model
Let's create a collection for each category

In [None]:
from collections import defaultdict

In [None]:
# Map each category name to the list of tokenised reviews tagged with it.
corpora = defaultdict(list)
for review in R:
    content, categories = review['content'], review['categories']
    # Lowercase the tokens and drop those made up entirely of punctuation.
    # `tok in punctuation` would be a substring test against the punctuation
    # string and would let tokens such as '...', '``' or "''" through.
    tokens = [
        tok.lower()
        for tok in word_tokenize(content)
        if not all(ch in punctuation for ch in tok)
    ]
    # A review contributes the same token list to every category it carries.
    for category in categories:
        corpora[category].append(tokens)

In [None]:
# First ten tokens of the first review filed under 'RV Rental'.
corpora['RV Rental'][0][:10]

### Update the global model with the local information (i.e., create a model for each category)

In [None]:
import copy

In [None]:
# Categories for which a specialised copy of the global model is trained.
selected_categories = ['Burgers', 'Indian', 'Italian', 'Seafood']
models = {}
# (category, tokenised reviews) pairs restricted to the selected categories.
runs = [(c, d) for c, d in corpora.items() if c in selected_categories]
# The loop variable is named `docs` so it does not overwrite the
# notebook-level `data` corpus that R0 was trained on.
for category, docs in tqdm(runs):
    # Start from an independent copy so the global model R0 stays untouched.
    models[category] = copy.deepcopy(R0)
    # total_examples must be the number of sentences actually passed to
    # train(); gensim uses it to schedule the learning-rate decay. The
    # category corpus is smaller than the global one, so R0.corpus_count
    # would be the wrong value here.
    models[category].train(docs, total_examples=len(docs), epochs=R0.epochs)

In [None]:
# Show which categories ended up with a trained model.
print(list(models.keys()))

In [None]:
# Probe word whose neighbourhood is compared across the category models.
word = 'food'
# For every category model, fetch the 20 nearest neighbours and show the top 5.
for category, category_model in models.items():
    neighbours = category_model.wv.most_similar(word, topn=20)
    terms = [term for term, score in neighbours]
    print(category, terms[:5])

#### Filter common words

In [None]:
# The 20 nearest neighbours of `word` in each category model.
top_terms = {}
for cat, m in models.items():
    top_terms[cat] = [term for term, score in m.wv.most_similar(word, topn=20)]

# Count in how many category neighbour lists each term appears.
counter = defaultdict(int)
for terms in top_terms.values():
    for term in terms:
        counter[term] += 1

# Per category, keep only the neighbours that appear in fewer than three lists.
for cat, terms in top_terms.items():
    print(cat, [term for term in terms if counter[term] < 3])

## Exercise: train a model from wordnet

How `word2vec` can be used to disambiguate words and look up synsets

In [None]:
from nltk.corpus import wordnet as wn
import nltk

# Seed words whose WordNet neighbourhood becomes the training corpus.
words = ['cat', 'dog', 'bird', 'fish']


def h(s):
    """Return the hypernyms of synset *s*."""
    return s.hypernyms()


def p(s):
    """Return the hyponyms of synset *s*."""
    return s.hyponyms()

def get_pseudo_sentences(word, context=3):
    """Build two-token pseudo-sentences from the WordNet neighbourhood of *word*.

    For every synset of *word* the result contains:

    * ``[lemma, synset]`` for each lemma of the synset;
    * ``[synset, related]`` plus ``[lemma, related]`` pairs for the synsets
      produced by ``closure(h)`` (the hypernym relation);
    * ``[related, synset]`` plus ``[lemma, related]`` pairs for the synsets
      produced by ``closure(p)`` (the hyponym relation).

    Each closure walk stops after the item at index ``context``, so at most
    ``context + 1`` related synsets are visited per direction.

    Returns a list of two-element lists of names.
    """
    def lemma_pairs(synset):
        # One [lemma name, synset name] pair per lemma of the synset.
        label = synset.name()
        return [[lemma.name(), label] for lemma in synset.lemmas()]

    collected = []
    for synset in wn.synsets(word):
        own_name = synset.name()
        collected.extend(lemma_pairs(synset))

        # Walk the hypernym closure of the synset.
        for position, ancestor in enumerate(synset.closure(h)):
            collected.append([own_name, ancestor.name()])
            collected.extend(lemma_pairs(ancestor))
            if position == context:
                break

        # Walk the hyponym closure of the synset.
        for position, descendant in enumerate(synset.closure(p)):
            collected.append([descendant.name(), own_name])
            collected.extend(lemma_pairs(descendant))
            if position == context:
                break
    return collected

# Gather the pseudo-sentences of every seed word into one training corpus.
sentences = []
for w in words:
    sentences.extend(get_pseudo_sentences(w))

# Peek at the first pseudo-sentence.
print(sentences[0])

# Train on the pseudo-sentences; min_count=1 is passed so that tokens seen
# only once are not discarded.
model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Nearest neighbours of 'fish' in the WordNet-derived embedding space.
model.wv.most_similar('fish')