# Explore the Gensim implementation

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
from gensim.models import Word2Vec, KeyedVectors
from gensim.test.utils import datapath

In [7]:
# Location of the pretrained GoogleNews word2vec binary.
# Adjust this single constant to wherever the file lives on your machine.
GOOGLE_NEWS_VECTORS = "/Users/flint/Data/word2vec/GoogleNews-vectors-negative300.bin"

# Pass the path straight to the loader: gensim's datapath() helper is meant for
# files bundled with gensim's own test data, not for arbitrary user files.
# binary=True because the file is in the binary word2vec format (.bin).
wv = KeyedVectors.load_word2vec_format(GOOGLE_NEWS_VECTORS, binary=True)

## Similarity

In [8]:
# Compare one anchor word against a list of other words and print the
# cosine similarity of each pair, one pair per line.
anchor = 'car'
comparisons = [
    'minivan',    # a minivan is a kind of car
    'bicycle',    # still a wheeled vehicle
    'airplane',   # ok, no wheels, but still a vehicle
    'cereal',     # ... and so on
    'communism',
]
pairs = [(anchor, other) for other in comparisons]
for w1, w2 in pairs:
    score = wv.similarity(w1, w2)
    print('%r\t%r\t%.2f' % (w1, w2, score))

'car'	'minivan'	0.69
'car'	'bicycle'	0.54
'car'	'airplane'	0.42
'car'	'cereal'	0.14
'car'	'communism'	0.06


In [20]:
# Print the nearest neighbours of 'car' as "word score", one per line.
for neighbour, score in wv.most_similar('car'):
    print(neighbour, score)

vehicle 0.7821096181869507
cars 0.7423831224441528
SUV 0.7160962224006653
minivan 0.6907036900520325
truck 0.6735789775848389
Car 0.6677608489990234
Ford_Focus 0.667320191860199
Honda_Civic 0.6626849174499512
Jeep 0.651133120059967
pickup_truck 0.6441438794136047


## Analogy

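FRANCE : PARIS = ITALY : ?

The missing term is the word whose vector is closest to PARIS - FRANCE + ITALY.

The cell below applies the same vector arithmetic to a different analogy, man : King = woman : ?, i.e. King - man + woman.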

In [21]:
# Analogy by vector arithmetic: add the `positive` words, subtract the
# `negative` ones, and list the words closest to the result.
# NOTE(review): the vocabulary looks case-sensitive ('car' and 'Car' both appear
# as separate entries in the neighbour list earlier), so 'King' is probably not
# the same entry as 'king' — confirm the capitalisation is intended.
wv.most_similar(
    positive=['King', 'woman'],
    negative=['man'],
)

[('Queen', 0.5515626668930054),
 ('Oprah_BFF_Gayle', 0.47597548365592957),
 ('Geoffrey_Rush_Exit', 0.46460166573524475),
 ('Princess', 0.4533674716949463),
 ('Yvonne_Stickney', 0.4507041573524475),
 ('L._Bonauto', 0.4422135353088379),
 ('gal_pal_Gayle', 0.4408389925956726),
 ('Alveda_C.', 0.4402790665626526),
 ('Tupou_V.', 0.4373864233493805),
 ('K._Letourneau', 0.4351031482219696)]

## Not matching

In [22]:
# Ask the model which of the four words fits least well with the others.
candidates = "school professor apple student".split()
wv.doesnt_match(candidates)

'apple'

## Mean

In [30]:
# Look up the three word vectors in one pass, then take their element-wise mean.
vp, vr, vx = (wv[word] for word in ('school', 'professor', 'student'))
m = (vp + vr + vx) / 3

In [31]:
# Words whose vectors lie closest to the mean vector `m` (top 10, the default).
wv.similar_by_vector(m, topn=10)

[('student', 0.8481254577636719),
 ('professor', 0.7627506852149963),
 ('teacher', 0.6942789554595947),
 ('school', 0.6849855780601501),
 ('students', 0.6768636703491211),
 ('lecturer', 0.6700003147125244),
 ('faculty', 0.645453155040741),
 ('university', 0.6376535892486572),
 ('professors', 0.6346085667610168),
 ('associate_professor', 0.6325882077217102)]

In [34]:
# Compare 'lecturer' with each of the three words that were averaged into `m`.
target = 'lecturer'
pairs = [(target, component) for component in ('school', 'professor', 'student')]
for w1, w2 in pairs:
    score = wv.similarity(w1, w2)
    print('%r\t%r\t%.2f' % (w1, w2, score))

'lecturer'	'school'	0.18
'lecturer'	'professor'	0.80
'lecturer'	'student'	0.43


## Context

In [None]:
# Nearest neighbours of 'buy' (top 10, the default).
wv.most_similar(positive=['buy'], topn=10)

In [None]:
# Cosine similarity between the vectors of 'buy' and 'money'.
wv.similarity(w1='buy', w2='money')

## Train a custom model

In [None]:
import gensim.models

In [None]:
# Path to the training corpus: a plain-text file with one document per line and
# tokens separated by whitespace. Placeholder — point it at your own file.
CORPUS_PATH = "corpus.txt"

# LineSentence streams the file lazily and yields one list of tokens per line,
# which is the iterable-of-token-lists that Word2Vec expects. This replaces the
# previous `sentences = _`, which read IPython's "last output" variable and so
# did not survive a kernel restart.
sentences = gensim.models.word2vec.LineSentence(CORPUS_PATH)
model = gensim.models.Word2Vec(sentences=sentences)

## Exercise: train a model from WordNet

In [11]:
from nltk.corpus import wordnet as wn
import nltk

In [12]:
# Seed words whose WordNet neighbourhoods make up the training corpus.
words = 'cat dog bird fish'.split()

In [13]:
def h(s):
    """Return the direct hypernyms of synset ``s``."""
    return s.hypernyms()


def p(s):
    """Return the direct hyponyms of synset ``s``."""
    return s.hyponyms()


def get_pseudo_sentences(word, context=3):
    """Build two-token pseudo-sentences from the WordNet neighbourhood of ``word``.

    For every synset of ``word`` the result contains, in this order:

    * ``[lemma, synset]`` for each lemma of the synset;
    * for each of the first ``context + 1`` synsets in the hypernym closure:
      ``[synset, hypernym]`` followed by ``[lemma, hypernym]`` for each of the
      hypernym's lemmas;
    * for each of the first ``context + 1`` synsets in the hyponym closure:
      ``[hyponym, synset]`` followed by ``[lemma, hyponym]`` for each of the
      hyponym's lemmas.

    Args:
        word: The word to look up with ``wn.synsets``.
        context: Index of the last closure element to use, so up to
            ``context + 1`` related synsets are taken per direction. A negative
            value never triggers the cut-off, so the whole closure is used.

    Returns:
        A list of two-element lists of names (strings).
    """
    sentences = []

    def add_lemma_pairs(synset):
        # One [lemma name, synset name] pair per lemma of the synset.
        sentences.extend(
            [lemma.name(), synset.name()] for lemma in synset.lemmas()
        )

    for s in wn.synsets(word):
        add_lemma_pairs(s)
        # Hypernyms first, then hyponyms; the flag says whether the related
        # synset goes first in the pair (true only for hyponyms).
        for relation, related_first in ((h, False), (p, True)):
            for position, related in enumerate(s.closure(relation)):
                if related_first:
                    sentences.append([related.name(), s.name()])
                else:
                    sentences.append([s.name(), related.name()])
                add_lemma_pairs(related)
                # Stop after processing the element at index `context`.
                if position == context:
                    break
    return sentences

In [14]:
# Concatenate the pseudo-sentences of every seed word into one training corpus.
sentences = []
for w in words:
    sentences.extend(get_pseudo_sentences(w))

In [17]:
# Train Word2Vec on the WordNet pseudo-sentences.
#
# workers=1: with several worker threads the order in which examples are
# processed varies between runs, so the learned vectors (and the neighbours
# shown below) differ on every Restart & Run All. The corpus is tiny, so a
# single thread costs nothing.
# seed=1 is gensim's documented default, spelled out to make the intent explicit.
# NOTE: bit-for-bit reproducibility across interpreter launches additionally
# requires a fixed PYTHONHASHSEED (see the gensim Word2Vec documentation).
model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,  # keep every token, even those seen only once
    seed=1,
    workers=1,
)

In [18]:
# Nearest neighbours of 'fish' in the model trained on WordNet pseudo-sentences.
model.wv.most_similar(positive=['fish'], topn=10)

[('fish.v.01', 0.33891671895980835),
 ('shuttle', 0.3145788609981537),
 ('run_down', 0.29708608984947205),
 ('solid_food', 0.28112900257110596),
 ('brail', 0.27431321144104004),
 ('grownup', 0.27283117175102234),
 ('pictorial_representation', 0.2099177986383438),
 ('eel.n.01', 0.20881065726280212),
 ('weenie', 0.2070414274930954),
 ('hot_dog', 0.1991277039051056)]