In [57]:
from zoneinfo import available_timezones

import gensim
from gensim import downloader as api
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE

In [58]:
vocab = ['cat', 'dog', 'bird', 'fish', 'elephant']
# One-Hot encoding
for i, word in enumerate(vocab):
    one_hot = np.zeros(len(vocab), dtype=int)
    one_hot[i] = 1
    print(f"{word:10} = {one_hot}")

cat        = [1 0 0 0 0]
dog        = [0 1 0 0 0]
bird       = [0 0 1 0 0]
fish       = [0 0 0 1 0]
elephant   = [0 0 0 0 1]


In [59]:
embeddings = {
    'cat': np.array([0.8, 0.2, -0.1, 0.5]),
    'dog':np.array([0.7, 0.3, -0.2, 0.6]),
    'bird':np.array([0.1, 0.9, 0.8, -0.1]),
    'fish':np.array([-0.5, 0.1, -0.3, 0.9]),
    'elephant':np.array([0.6, 0.4, -0.3, 0.4])
}

for word, vec in enumerate(embeddings.items()):
    print(f"{word:10} = {vec}")

         0 = ('cat', array([ 0.8,  0.2, -0.1,  0.5]))
         1 = ('dog', array([ 0.7,  0.3, -0.2,  0.6]))
         2 = ('bird', array([ 0.1,  0.9,  0.8, -0.1]))
         3 = ('fish', array([-0.5,  0.1, -0.3,  0.9]))
         4 = ('elephant', array([ 0.6,  0.4, -0.3,  0.4]))


In [60]:
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    return dot_product / (norm1 * norm2)

words = list(embeddings.keys())
for i in range(len(words)):
    for j in range(i+1, len(words)):
        word1, word2 = words[i], words[j]
        sim = cosine_similarity(embeddings[word1], embeddings[word2])
        print(f"{word1:10}: {word2:10}: {sim:.3f}")


cat       : dog       : 0.979
cat       : bird      : 0.111
cat       : fish      : 0.096
cat       : elephant  : 0.929
dog       : bird      : 0.100
dog       : fish      : 0.263
dog       : elephant  : 0.967
bird      : fish      : -0.222
bird      : elephant  : 0.132
fish      : elephant  : 0.201


In [61]:
corpus = [
    ['cat', 'sits'],
    ['cat', 'sits', 'on', 'soft', 'sofa'],
    ['small', 'dog', 'runs', 'fast'],
    ['dog', 'lies', 'under', 'the', 'table'],
    ['child', 'plays', 'with', 'a', 'red', 'ball'],
    ['man', 'reads'],
    ['man', 'reads', 'interesting', 'book', 'at', 'home'],
    ['woman', 'drinks', 'hot', 'coffee'],
    ['sun', 'shines'],
    ['bright', 'sun', 'shines', 'in', 'blue', 'sky'],
    ['rain', 'falls', 'on', 'wet', 'street'],
    ['bird', 'flies', 'high'],
    ['bird', 'flies', 'over', 'green', 'trees'],
    ['developer', 'writes', 'clean', 'code'],
    ['experienced', 'developer', 'writes', 'efficient', 'code', 'quickly'],
    ['students', 'learn', 'programming'],
    ['fat','students', 'learn', 'modern', 'programming', 'technologies'],
    ['neural', 'network', 'learns', 'from', 'data'],
    ['machine', 'learning', 'model', 'improves', 'with', 'more', 'data']
]

print(f"corpus length: {len(corpus)}")

all_words = set()
for sentence in corpus:
    all_words.update(sentence)
print(f"Unique words: {len(all_words)}")

corpus length: 19
Unique words: 68


In [62]:
model = Word2Vec(
    sentences=corpus,
    vector_size=50,
    window=3,
    min_count=1,
    workers=4,
    epochs=100,
    sg=1)

print("Model Word2Vec learned")
print(f"Word length: {model.wv.vector_size}")
print("Words in dictionary")
print(list(model.wv.key_to_index.keys()))

Model Word2Vec learned
Word length: 50
Words in dictionary
['data', 'programming', 'learn', 'students', 'code', 'writes', 'developer', 'flies', 'bird', 'shines', 'sun', 'reads', 'man', 'with', 'dog', 'on', 'sits', 'cat', 'more', 'improves', 'model', 'learning', 'machine', 'from', 'learns', 'network', 'neural', 'technologies', 'modern', 'fat', 'quickly', 'efficient', 'experienced', 'clean', 'trees', 'green', 'over', 'high', 'street', 'wet', 'falls', 'rain', 'sky', 'blue', 'in', 'bright', 'coffee', 'hot', 'drinks', 'woman', 'home', 'at', 'book', 'interesting', 'ball', 'red', 'a', 'plays', 'child', 'table', 'the', 'under', 'lies', 'fast', 'runs', 'small', 'sofa', 'soft']


In [63]:
cat_vector = model.wv['cat']
print(f"Cat vector: {cat_vector}")

Cat vector: [-1.46846636e-04  7.13470727e-05  4.47533348e-05 -8.88036843e-03
  8.44077487e-03 -4.59163636e-03  5.15610306e-03  2.68266699e-03
  1.12458318e-02 -1.43130552e-02 -1.34174451e-02 -1.13731558e-02
  2.04127971e-02 -1.50801823e-03 -1.95932556e-02  3.25216446e-04
 -8.60978011e-03  1.18914098e-02 -2.06715800e-02  4.05027252e-03
 -1.85145214e-02  3.50792520e-03  1.35161625e-02  1.46568203e-02
 -1.47985993e-02 -1.19349500e-02 -1.39158070e-02 -1.47937667e-02
 -1.93910040e-02 -3.80832003e-03 -1.88230595e-03 -1.60194617e-02
  1.32786566e-02  1.98642188e-03  1.09200906e-02  3.71146179e-03
  3.12428805e-03 -1.47172408e-02 -3.73650901e-03  9.47283302e-03
 -9.68759786e-03  1.49215397e-03  5.38492249e-03 -2.55165552e-03
  2.13243738e-02  1.57046895e-02  4.54508141e-03  1.33171640e-02
  1.25190737e-02 -1.05128419e-02]


In [64]:
dog_vector = model.wv['dog']
print(f"Dog vector: {dog_vector}")

Dog vector: [ 1.6483413e-02 -1.0087132e-02  1.8561190e-02  1.8135976e-02
 -9.5770033e-03 -2.4265570e-05  9.9850949e-03 -6.0265083e-03
 -1.2546920e-02 -1.3663703e-02 -1.4045971e-03 -4.7742962e-03
  1.2493265e-02 -2.0530559e-03 -1.8440785e-03  6.8224138e-03
  1.0769965e-02 -9.2145882e-04 -1.4435704e-02 -2.1683894e-02
  1.3141345e-03  1.5437750e-02  5.5774637e-03 -1.8423598e-02
 -1.4611667e-02  1.3526761e-02 -8.0064954e-03  1.5274231e-02
 -1.3783852e-02  1.8315295e-02 -1.2848108e-02  4.4922573e-03
 -2.8530618e-03 -1.4317548e-02 -8.2964795e-03 -1.7386135e-03
 -8.3751893e-03 -3.0055547e-03 -1.4327246e-02  7.1145212e-03
  1.9171722e-02 -5.4647708e-03 -1.8665207e-03  6.9338782e-03
  2.0593235e-02 -1.3323303e-02 -1.4170116e-02 -7.9342574e-03
  1.9580152e-02  2.6528244e-03]


In [65]:
similar_to_cat = model.wv.most_similar('cat', topn=5)
for word, similarity in similar_to_cat:
    print(f"{word:10}: {similarity:.3f}")

red       : 0.396
small     : 0.387
drinks    : 0.327
writes    : 0.291
network   : 0.290


In [66]:
similar_to_dog = model.wv.most_similar('dog', topn=5)
for word, similarity in similar_to_dog:
    print(f"{word:10}: {similarity:.3f}")

reads     : 0.562
street    : 0.460
from      : 0.440
bright    : 0.328
rain      : 0.313


In [67]:
sim_cat_dog = model.wv.similarity('cat', 'dog')
print(sim_cat_dog)

0.04154672


In [68]:
print(model.wv.similarity('code', 'developer'))

0.021909581


In [69]:
available_models = api.info()['models']
for name, info in list(available_models.items())[:10]:
    size_mb = info.get('file_size', 0)/(1024*1024)

    print(f"{name:10}: {info} ({size_mb:.2f} MB)")

fasttext-wiki-news-subwords-300: {'num_records': 999999, 'file_size': 1005007116, 'base_dataset': 'Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens)', 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/fasttext-wiki-news-subwords-300/__init__.py', 'license': 'https://creativecommons.org/licenses/by-sa/3.0/', 'parameters': {'dimension': 300}, 'description': '1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).', 'read_more': ['https://fasttext.cc/docs/en/english-vectors.html', 'https://arxiv.org/abs/1712.09405', 'https://arxiv.org/abs/1607.01759'], 'checksum': 'de2bb3a20c46ce65c9c131e1ad9a77af', 'file_name': 'fasttext-wiki-news-subwords-300.gz', 'parts': 1} (958.45 MB)
conceptnet-numberbatch-17-06-300: {'num_records': 1917247, 'file_size': 1225497562, 'base_dataset': 'ConceptNet, word2vec, GloVe, and OpenSubtitles 2016', 'reader_code': 'https://github.com/RaRe-Technologi

In [70]:
glove_model = api.load('glove-twitter-25')
print('model loaded')
print(f"dictionary size: {len(glove_model)}")
print(f"vector size: {glove_model.vector_size}")

model loaded
dictionary size: 1193514
vector size: 25


In [71]:
for word, similarity in glove_model.most_similar('king', topn=10):
    print(f"{word:10}: {similarity}")
print()
for word, similarity in glove_model.most_similar('computer', topn=10):
    print(f"{word:10}: {similarity}")

prince    : 0.9337409734725952
queen     : 0.9202421307563782
aka       : 0.9176921844482422
lady      : 0.9163240790367126
jack      : 0.9147354364395142
's        : 0.9066898226737976
stone     : 0.8982374668121338
mr.       : 0.8919409513473511
the       : 0.889343798160553
star      : 0.8892088532447815

camera    : 0.907833456993103
cell      : 0.891890287399292
server    : 0.874466598033905
device    : 0.8693525195121765
wifi      : 0.863125741481781
screen    : 0.8621907234191895
app       : 0.8615543246269226
case      : 0.8587921857833862
remote    : 0.8583616018295288
file      : 0.8575270771980286


In [78]:
# king - man + woman = ?
result = glove_model.most_similar(positive=["king", "woman"], negative=["man"], topn=5)
for word, similarity in result:
    print(f"{word:10}: {similarity}")


meets     : 0.8841923475265503
prince    : 0.832163393497467
queen     : 0.8257461190223694
â€™s        : 0.8174097537994385
crow      : 0.813499391078949


In [79]:
# paris - france + russia = ?
result2 = glove_model.most_similar(positive=["paris", "russia"], negative=["france"], topn=5)
for word, similarity in result2:
    print(f"{word:10}: {similarity}")

brazil    : 0.8781961798667908
italy     : 0.8547845482826233
australia : 0.8511278033256531
city      : 0.8145042061805725
hawaii    : 0.8138893246650696


In [80]:
result3 = glove_model.most_similar(positive=["fast", "bad"], negative=["slow"], topn=5)
for word, similarity in result3:
    print(f"{word:10}: {similarity}")

yet       : 0.8893000483512878
also      : 0.8887512683868408
made      : 0.879071056842804
'd        : 0.877172589302063
want      : 0.8767794370651245
