In [13]:
import nltk
# Make sure the WordNet corpus is available locally before the corpus reader is imported.
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /Users/ajithj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
# Synonym sets (synsets) of "dark"

# Human-readable labels for the part-of-speech tags returned by synset.pos().
poses = { 'n': 'noun', 'v': 'verb', 's':'adj (s)', 'a': 'adj', 'r': 'adv' }

# Flat list of every lemma name seen across all synsets (duplicates are kept).
tokens = []
for synset in wn.synsets("dark"):
    # Compute the lemma names once and reuse them for both the running token
    # list and the printed line.
    lemma_names = [lemma.name() for lemma in synset.lemmas()]
    tokens.extend(lemma_names)
    lemmas = ", ".join(lemma_names)
    # One line per synset: left-aligned part-of-speech label, then its lemmas.
    print(f"{poses[synset.pos()]:<16s}: {lemmas}")

noun            : dark, darkness
noun            : iniquity, wickedness, darkness, dark
noun            : darkness, dark, shadow
noun            : night, nighttime, dark
noun            : dark, darkness
adj             : dark
adj             : dark
adj (s)         : dark
adj (s)         : black, dark, sinister
adj (s)         : dark
adj (s)         : dark, dour, glowering, glum, moody, morose, saturnine, sour, sullen
adj (s)         : benighted, dark
adj (s)         : dark, obscure
adj (s)         : blue, dark, dingy, disconsolate, dismal, gloomy, grim, sorry, drab, drear, dreary
adj (s)         : colored, coloured, dark, dark-skinned, non-white
adj (s)         : dark


In [15]:
# Hypernyms of "panda"
from nltk.corpus import wordnet as wn


def hyper(s):
    """Return the direct hypernyms of synset ``s``."""
    return s.hypernyms()


panda = wn.synset("panda.n.01")

# closure() applies `hyper` repeatedly, yielding every synset reachable by
# following hypernym links from the starting synset.
list(panda.closure(hyper))

[Synset('procyonid.n.01'),
 Synset('carnivore.n.01'),
 Synset('placental.n.01'),
 Synset('mammal.n.01'),
 Synset('vertebrate.n.01'),
 Synset('chordate.n.01'),
 Synset('animal.n.01'),
 Synset('organism.n.01'),
 Synset('living_thing.n.01'),
 Synset('whole.n.02'),
 Synset('object.n.01'),
 Synset('physical_entity.n.01'),
 Synset('entity.n.01')]

In [16]:
# Hypernyms of "nautilus"
from nltk.corpus import wordnet as wn

# Bind the synset to a name that matches its content, so the panda synset
# from the previous cell is not silently overwritten.
nautilus = wn.synset("nautilus.n.01")
hyper = lambda s: s.hypernyms()

# NOTE(review): the recorded output for this cell starts with a warning line
# from NLTK's acyclic_breadth_first -- presumably some ancestors are reachable
# along more than one hypernym path; confirm before relying on the ordering.
list(nautilus.closure(hyper))

  for synset in acyclic_breadth_first(self, rel, depth):


[Synset('submarine.n.01'),
 Synset('submersible.n.02'),
 Synset('warship.n.01'),
 Synset('military_vehicle.n.01'),
 Synset('ship.n.01'),
 Synset('vehicle.n.01'),
 Synset('vessel.n.02'),
 Synset('conveyance.n.03'),
 Synset('craft.n.02'),
 Synset('instrumentality.n.03'),
 Synset('artifact.n.01'),
 Synset('whole.n.02'),
 Synset('object.n.01'),
 Synset('physical_entity.n.01'),
 Synset('entity.n.01')]

In [17]:
# One-hot encode the strings in `tokens`.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Step 1: map every token string to an integer class id.
label_encoder = LabelEncoder()
integer_label_encoded = label_encoder.fit_transform(tokens)

# Step 2: reshape the 1-D id array into a single-column 2-D array, the layout
# passed to the one-hot encoder below.
label_encoded = integer_label_encoded.reshape(-1, 1)

# Step 3: expand each id into a dense one-hot row.
onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_encoded = onehot_encoder.fit_transform(label_encoded)

print(onehot_encoded)


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### Problem with words as discrete symbols
Example: in web search, if a user searches for “Seattle motel”, we would like to match
documents containing “Seattle hotel”.

But as one-hot vectors the two words look like this:

motel = [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]

hotel = [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]

These two vectors are orthogonal: their dot product is 0, exactly as it would be for any other pair of distinct words.

There is no natural notion of similarity for one-hot vectors!

Solution:
- Could try to rely on WordNet’s list of synonyms to get similarity?
- But it is well-known to fail badly: incompleteness, etc.
- Instead: learn to encode similarity in the vectors themselves

In [18]:
# “You shall know a word by the company it keeps” (J. R. Firth 1957: 11)

from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [19]:
# Train a word2vec model on `common_texts` and persist it to disk.

model = Word2Vec(
    sentences=common_texts,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
model.save("word2vec.model")

In [23]:
from gensim.models import KeyedVectors

# Reload the model that was written to "word2vec.model".
model = Word2Vec.load("word2vec.model")

# Keep a handle on just the words + their trained embeddings and save them
# to their own file.
word_vectors = model.wv
word_vectors.save("word2vec.wordvectors")

# The 10 entries most similar to "human", as (word, similarity) pairs.
# NOTE: this rebinds `tokens`, which the WordNet / one-hot cells use for the
# list of lemma strings.
tokens = word_vectors.most_similar('human', topn=10)
tokens

[('trees', 0.17272792756557465),
 ('eps', 0.16694682836532593),
 ('response', 0.11118265986442566),
 ('interface', 0.10940765589475632),
 ('system', 0.079634889960289),
 ('user', 0.04130302369594574),
 ('survey', 0.037712957710027695),
 ('graph', 0.00831594504415989),
 ('minors', -0.005896794609725475),
 ('computer', -0.07424270361661911)]

In [26]:

# Cosine similarity between the last and the first word of the `tokens` ranking.
first_word = tokens[0][0]
last_word = tokens[-1][0]
print(model.wv.similarity(last_word, first_word))

-0.032843146


In [27]:
from gensim.models import Phrases

# Fit a bigram (multi-word expression) detector on the corpus.
bigram_transformer = Phrases(sentences=common_texts)

# Run the corpus through the trained detector, then train a Word2vec model
# on the transformed sentences.
bigram_corpus = bigram_transformer[common_texts]
model = Word2Vec(sentences=bigram_corpus, min_count=1)

In [31]:
# IPython help lookup: show the signature and docstring of `similarity`.
? model.wv.similarity


[0;31mSignature:[0m  [0mmodel[0m[0;34m.[0m[0mwv[0m[0;34m.[0m[0msimilarity[0m[0;34m([0m[0mw1[0m[0;34m,[0m [0mw2[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Compute cosine similarity between two keys.

Parameters
----------
w1 : str
    Input key.
w2 : str
    Input key.

Returns
-------
float
    Cosine similarity between `w1` and `w2`.
[0;31mFile:[0m      ~/mambaforge/envs/cs224n/lib/python3.10/site-packages/gensim/models/keyedvectors.py
[0;31mType:[0m      method