# Explore the Gensim implementation

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

## Similarity

In [4]:
pairs = [
    ('car', 'minivan'),   # a minivan is a kind of car
    ('car', 'bicycle'),   # still a wheeled vehicle
    ('car', 'airplane'),  # ok, no wheels, but still a vehicle
    ('car', 'cereal'),    # ... and so on
    ('car', 'communism'),
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))

'car'	'minivan'	0.69
'car'	'bicycle'	0.54
'car'	'airplane'	0.42
'car'	'cereal'	0.14
'car'	'communism'	0.06


## Analogy

In [5]:
wv.most_similar(positive=['man','king'], negative=['woman'])

[('kings', 0.6490574479103088),
 ('clown_prince', 0.5009064674377441),
 ('prince', 0.4854173958301544),
 ('crown_prince', 0.4816294312477112),
 ('King', 0.4721395969390869),
 ('ruler', 0.4700629711151123),
 ('sultan', 0.46399134397506714),
 ('undisputed_king', 0.463204026222229),
 ('princes', 0.4552575349807739),
 ('monarch', 0.45388489961624146)]

## Not matching

In [6]:
wv.doesnt_match("school professor apple student".split())

'apple'

## Context

In [7]:
wv.most_similar('buy')

[('sell', 0.8308461308479309),
 ('purchase', 0.7639905214309692),
 ('buying', 0.7209187150001526),
 ('bought', 0.7087080478668213),
 ('buys', 0.6617438793182373),
 ('Buy', 0.5850198268890381),
 ('tobuy', 0.5843993425369263),
 ('purchased', 0.5826955437660217),
 ('Buying', 0.578020453453064),
 ('acquire', 0.5730166435241699)]

In [11]:
wv.similarity('buy', 'money')

0.31760776

## Train a custom model

In [None]:
import gensim.models

In [None]:
sentences = _ # assume there's one document per line, tokens separated by whitespace
model = gensim.models.Word2Vec(sentences=sentences)

## Exercise: train a model from wordnet

In [None]:
from nltk.corpus import wordnet as wn
import nltk

In [None]:
words = ['cat', 'dog', 'bird', 'fish']

In [None]:
h = lambda s: s.hypernyms()
p = lambda s: s.hyponyms()

def get_pseudo_sentences(word, context=3):
    sentences = []
    for s in wn.synsets(word):
        for lemma in s.lemmas():
            sentences.append([lemma.name(), s.name()])
        for i, j in enumerate(s.closure(h)):
            sentences.append([s.name(), j.name()])
            for lemma in j.lemmas():
                sentences.append([lemma.name(), j.name()])
            if i == context:
                break
        for i, j in enumerate(s.closure(p)):
            sentences.append([j.name(), s.name()])
            for lemma in j.lemmas():
                sentences.append([lemma.name(), j.name()])
            if i == context:
                break
    return sentences

In [None]:
sentences = []
for w in words:
    sentences += get_pseudo_sentences(w)

In [None]:
model = gensim.models.Word2Vec(sentences=sentences, size=100, window=5, min_count=1, workers=4)

In [None]:
model.wv.most_similar('fish')