# Explore the Gensim implementation
> Mikolov, T., Grave, E., Bojanowski, P., Puhrsch, C., & Joulin, A. (2017). Advances in pre-training distributed word representations. arXiv preprint arXiv:1712.09405.

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
from gensim.models import Word2Vec, KeyedVectors
from gensim.test.utils import datapath

In [4]:
wv = KeyedVectors.load_word2vec_format(datapath("/Users/flint/Data/word2vec/GoogleNews-vectors-negative300.bin"), 
                                       binary=True)

## Similarity

In [5]:
pairs = [
    ('car', 'minivan'),   # a minivan is a kind of car
    ('car', 'bicycle'),   # still a wheeled vehicle
    ('car', 'airplane'),  # ok, no wheels, but still a vehicle
    ('car', 'cereal'),    # ... and so on
    ('car', 'communism'),
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))

'car'	'minivan'	0.69
'car'	'bicycle'	0.54
'car'	'airplane'	0.42
'car'	'cereal'	0.14
'car'	'communism'	0.06


In [7]:
for x, y in wv.most_similar('car'):
    print(x, y)

vehicle 0.7821096181869507
cars 0.7423831224441528
SUV 0.7160962224006653
minivan 0.6907036900520325
truck 0.6735789775848389
Car 0.6677608489990234
Ford_Focus 0.667320191860199
Honda_Civic 0.6626849174499512
Jeep 0.651133120059967
pickup_truck 0.6441438794136047


In [10]:
vectors = []
for word in ['car', 'minivan', 'bicycle', 'airplane']:
    vectors.append(wv.get_vector(word))
V = np.array(vectors)

In [13]:
v = V.mean(axis=0)
v = v - wv.get_vector('car')

In [14]:
wv.similar_by_vector(v)

[('LightHawk', 0.3567371666431427),
 ('Beaver_floatplane', 0.3410896956920624),
 ('Bluebills', 0.3352811932563782),
 ('airplane', 0.32490819692611694),
 ('Volk_Field', 0.307051420211792),
 ('Andersland', 0.30294421315193176),
 ('Expedia_Expedia.com', 0.30243098735809326),
 ('NASA_Weightless_Wonder', 0.30234652757644653),
 ('Cessna_###B', 0.2979174852371216),
 ('propeller_plane', 0.2970975339412689)]

## Analogy

FRANCE : PARIS = ITALY : ?

PARIS - FRANCE + ITALY

In [16]:
wv.most_similar(positive=['Paris', 'Italy'], negative=['France'])

[('Milan', 0.7222141623497009),
 ('Rome', 0.702830970287323),
 ('Palermo_Sicily', 0.5967570543289185),
 ('Italian', 0.5911272764205933),
 ('Tuscany', 0.5632812976837158),
 ('Bologna', 0.5608358383178711),
 ('Sicily', 0.5596384406089783),
 ('Bologna_Italy', 0.5470058917999268),
 ('Berna_Milan', 0.5464027523994446),
 ('Genoa', 0.5308900475502014)]

## Not matching

In [17]:
wv.doesnt_match("school professor apple student".split())

'apple'

## Mean

In [18]:
vp = wv['school']
vr = wv['professor']
vx = wv['student']
m = (vp + vr + vx) / 3

In [19]:
wv.similar_by_vector(m)

[('student', 0.8481254577636719),
 ('professor', 0.7627506852149963),
 ('teacher', 0.6942789554595947),
 ('school', 0.6849855780601501),
 ('students', 0.6768636703491211),
 ('lecturer', 0.6700003147125244),
 ('faculty', 0.645453155040741),
 ('university', 0.6376535892486572),
 ('professors', 0.6346085667610168),
 ('associate_professor', 0.6325882077217102)]

In [20]:
pairs = [
    ('lecturer', 'school'),
    ('lecturer', 'professor'),
    ('lecturer', 'student'),
    ('lecturer', 'teacher'),
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))

'lecturer'	'school'	0.18
'lecturer'	'professor'	0.80
'lecturer'	'student'	0.43
'lecturer'	'teacher'	0.48


## Context

In [21]:
wv.most_similar('buy')

[('sell', 0.8308461308479309),
 ('purchase', 0.7639904618263245),
 ('buying', 0.7209187746047974),
 ('bought', 0.7087081074714661),
 ('buys', 0.6617438197135925),
 ('Buy', 0.5850198864936829),
 ('tobuy', 0.5843992829322815),
 ('purchased', 0.582695484161377),
 ('Buying', 0.578020453453064),
 ('acquire', 0.5730165839195251)]

In [22]:
wv.similarity('buy', 'money')

0.31760776

## Train a custom model

In [23]:
import gensim.models

### Generate a global model for YELP reviews

In [24]:
import json
from nltk.tokenize import word_tokenize
from string import punctuation

In [25]:
review_data_file = '../lexicon/data/yelp_sample.json'
with open(review_data_file, 'r') as infile:
    R = json.load(infile)

In [26]:
R[0]

{'content': "Red, white and bleu salad was super yum and a great addition to the menu! This location was clean with great service and food served at just the right temps! Kids pizza is always a hit too with lots of great side dish options for the kiddos! When I'm on this side of town, this will definitely be a spot I'll hit up again!",
 'date': '2014-02-17',
 'stars': 4,
 'useful': 1,
 'funny': 0,
 'cool': 0,
 'business': 'Ue6-WhXvI-_1xUIuapl0zQ',
 'id': '----X0BIDP9tA49U3RvdSQ',
 'categories': ['American (Traditional)', 'Burgers', 'Restaurants']}

In [27]:
data = [[x.lower() for x in word_tokenize(doc['content']) if x not in punctuation] for doc in R]

In [28]:
data[0][:6]

['red', 'white', 'and', 'bleu', 'salad', 'was']

In [29]:
R0 = gensim.models.Word2Vec(sentences=data, epochs=25, window=6, vector_size=100)

In [30]:
R0.wv.most_similar('car')

[('vehicle', 0.7438929080963135),
 ('battery', 0.6146064400672913),
 ('bike', 0.6076012849807739),
 ('dealership', 0.5746821165084839),
 ('suit', 0.5642380118370056),
 ('teeth', 0.5638452172279358),
 ('contract', 0.5471097826957703),
 ('bank', 0.5426754951477051),
 ('tire', 0.5422413349151611),
 ('property', 0.5409041047096252)]

### Application example: use graph community detection to find aspects

In [31]:
import networkx as nx

In [32]:
min_sim = 0.7
G = nx.Graph()
for word in tqdm(R0.wv.index_to_key):
    for match, sim in R0.wv.most_similar(word):
        if sim >= min_sim:
            G.add_edge(word, match, sim=sim)

  0%|          | 0/8837 [00:00<?, ?it/s]

In [33]:
for a, b, c in G.edges(data=True):
    print(a, b, c)
    break

you u {'sim': 0.778407633304596}


### Visualize

In [34]:
from pyvis.network import Network

In [35]:
nt = Network('1500px', '1500px')
nt.from_nx(G.subgraph(list(G.nodes)[:100]))
nt.show('word2vec.html')

### Community detection

In [36]:
from networkx.algorithms.community import greedy_modularity_communities

In [37]:
communities = greedy_modularity_communities(G)
for community in communities:
    print(list(community)[:10])

['chocolate', 'spiced', 'salami', 'masala', 'mahi', 'flatbread', 'seed', 'risotto', 'skins', 'peppers']
["j'ai", 'belle', 'sans', 'mes', 'suis', 'pu', 'même', 'vous', 'et', 'cette']
['der', 'alles', 'hatte', 'nur', 'ganz', 'empfehlung', 'es', 'waren', 'über', 'das']
['cheesy', 'tender', 'flavour', 'buttery', 'hints', 'thick', 'juicy', 'doughy', 'sugary', 'sticky']
['yu', 'attentive', 'polite', 'talented', 'dom', 'demeanor', 'skilled', 'respectful', 'gentle', 'professional']
['hero', 'dumplings', 'bone-in', 'kabob', 'chops', 'kung', 'pao', 'siu', 'szechuan', 'corned']
['sexy', 'decorated', 'spacious', 'atmosphere', 'interior', 'quiet', 'chic', 'upbeat', 'elegant', 'cozy']
['screwed', 'messed', 'grown', 'hyped', 'wrapping', 'sums', 'popping', 'hung', 'summed', 'mt']
['sister', 'husband', 'coworker', 'brother', 'son', 'hubby', 'wife', 'father', 'friend', 'boyfriend']
['forty', '15-20', '50', 'fifteen', '90', '20', '35', 'thirty', '40', '45']
['unassuming', 'ultrasound', 'institution', 'as

## Update an existing model
Let's create a collection for each category

In [38]:
from collections import defaultdict

In [39]:
corpora = defaultdict(list)
for review in R:
    content, categories = review['content'], review['categories']
    tokens = [x.lower() for x in word_tokenize(content) if x not in punctuation]
    for category in categories:
        corpora[category].append(tokens)

In [40]:
corpora['RV Rental'][0][:10]

['unbelievable',
 'how',
 'the',
 'employees',
 'that',
 'work',
 'in',
 'the',
 'front',
 'part']

### Update the global model with the local information (i.e., create a model for each category)

In [41]:
import copy

In [42]:
selected_categories = ['Burgers', 'Indian', 'Italian', 'Seafood']
models = {}
runs = [(c, d) for c, d in corpora.items() if c in selected_categories]
for category, data in tqdm(runs):
    models[category] = copy.deepcopy(R0)
    models[category].train(data, total_examples=R0.corpus_count, epochs=R0.epochs)

  0%|          | 0/4 [00:00<?, ?it/s]

In [43]:
print(list(models.keys()))

['Burgers', 'Indian', 'Italian', 'Seafood']


In [44]:
word = 'food'
for cat, m in models.items():
    print(cat, [x for x, y in m.wv.most_similar(word, topn=20)][:5])

Burgers ['sushi', 'sangria', 'ambiance', 'breakfasts', 'meal']
Indian ['sushi', 'meal', 'pizza', 'it', 'drinks']
Italian ['sushi', 'consistently', 'breakfasts', 'meal', 'ambience']
Seafood ['sushi', 'sangria', 'presentation', 'fare', 'foods']


#### Filter common words

In [45]:
counter = defaultdict(lambda: 0)
for cat, m in models.items():
    for x, y in m.wv.most_similar(word, topn=20):
        counter[x] += 1
for cat, m in models.items():
    print(cat, [x for x, y in m.wv.most_similar(word, topn=20) if counter[x] < 3])

Burgers ['atmosphere', 'pricing', 'insanely', 'environment', 'pho']
Indian ['it', 'drinks', 'attentive', 'meals', 'restaurant']
Italian ['atmosphere', 'margaritas', 'value', 'drinks', 'pricing', 'attentive', 'relatively', 'coffee', 'desserts', 'environment']
Seafood ['foods', 'coffee', 'meals', 'schnitzel', 'sandwiches', 'insanely']


## Exercise: train a model from wordnet

How `word2vec` may be used to disambiguate words and lookup for synsets

In [46]:
from nltk.corpus import wordnet as wn
import nltk

words = ['cat', 'dog', 'bird', 'fish']

h = lambda s: s.hypernyms()
p = lambda s: s.hyponyms()

def get_pseudo_sentences(word, context=3):
    sentences = []
    for s in wn.synsets(word):
        for lemma in s.lemmas():
            sentences.append([lemma.name(), s.name()])
        for i, j in enumerate(s.closure(h)):
            sentences.append([s.name(), j.name()])
            for lemma in j.lemmas():
                sentences.append([lemma.name(), j.name()])
            if i == context:
                break
        for i, j in enumerate(s.closure(p)):
            sentences.append([j.name(), s.name()])
            for lemma in j.lemmas():
                sentences.append([lemma.name(), j.name()])
            if i == context:
                break
    return sentences

sentences = []
for w in words:
    sentences += get_pseudo_sentences(w)
    
print(sentences[0])

model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)

model.wv.most_similar('fish')

['cat', 'cat.n.01']


[('fish.v.01', 0.33891671895980835),
 ('shuttle', 0.3145788609981537),
 ('run_down', 0.29708608984947205),
 ('solid_food', 0.28112900257110596),
 ('brail', 0.27431318163871765),
 ('grownup', 0.27283111214637756),
 ('pictorial_representation', 0.209917813539505),
 ('eel.n.01', 0.20881067216396332),
 ('weenie', 0.2070414274930954),
 ('hot_dog', 0.19912771880626678)]