In [3]:
import gdown

In [4]:
from typing import *
import numpy as np
import numpy.typing as npt
from scipy import spatial

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import functools

In [5]:
# Slower but always guaranteed to work
# !wget -nc https://lazyprogrammer.me/course_files/nlp/GoogleNews-vectors-negative300.bin.gz


In [6]:
from gensim.models import KeyedVectors

## Word2Vec Pretrained

In [5]:
wordVectors = KeyedVectors.load_word2vec_format(
    "../data/GoogleNews-vectors-negative300.bin",
    binary=True
)

In [15]:
def findAnalogies(wordVectors: KeyedVectors, pair, second) -> str:
    # first = pair[0] - pair[1] + first
    # king - man = queen - woman
    # queen = king - man + woman
    r = wordVectors.most_similar(positive=[pair[0], second], negative=[pair[1]])
    # print(r) # tuples of word and score
    print("%s - %s = %s - %s" % (pair[0], pair[1], r[0][0], second))
    return r[0][0]

def nearestNeighbors(wordVectors: KeyedVectors, w: str) -> List[str]:
    r = wordVectors.most_similar(w)
    return [word for word, score in r]

In [16]:
findAnalogies(wordVectors, ('king', 'man'), 'woman')

king - man = queen - woman


'queen'

In [17]:
findAnalogies(wordVectors, ('france', 'paris'), 'london')

france - paris = england - london


'england'

In [19]:
nearestNeighbors(wordVectors, "england")

['liverpool',
 'chelsea',
 'fulham',
 'tottenham',
 'rooney',
 'man_utd',
 'torres',
 'ronaldo',
 'spain',
 'gerrard']

In [20]:
nearestNeighbors(wordVectors, "spain")

['madrid',
 'portugal',
 'barcelona',
 'italy',
 'england',
 'europe',
 'diego',
 'carlos',
 'real_madrid',
 'sweden']

In [23]:
nearestNeighbors(wordVectors, "barcelona")

['real_madrid',
 'barca',
 'madrid',
 'drogba',
 'man_utd',
 'messi',
 'ronaldinho',
 'chelsea',
 'tottenham',
 'liverpool']

# GloVe

In [7]:
# load glove 50d trained on 6b tokens
filePath = "../data/glove.6B.50d.txt"
embeddings = {}
with open(filePath, "r", encoding="utf-8") as fp:
    for line in fp:
        values = line.split()
        embeddings[values[0]] = np.asarray(values[1:], dtype=np.float32) # assuming single words


In [8]:
embeddings["the"]

array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32)

In [12]:
def nearestNeighborsGlove(embeddings: Dict[str, npt.NDArray], w:str, k = 5)-> List[str]:
    inputEmbedding = embeddings[w]
    return nearestNeighborsByEmbedding(embeddings, inputEmbedding, k)[1:]

def nearestNeighborsByEmbedding(embeddings: Dict[str, npt.NDArray], inputEmbedding:npt.NDArray, k = 5)-> List[str]:
    order = lambda candidate: spatial.distance.euclidean(embeddings[candidate], inputEmbedding)
    ranks = sorted(embeddings.keys(), key=order)
    return ranks[:k]

def riddle(embeddings: Dict[str, npt.NDArray], words: List[str]) -> str:
    inputEmbeddings = [embeddings[w] for w in words]
    combined = functools.reduce(lambda a, b: a + b, inputEmbeddings) # should we average?
    return nearestNeighborsByEmbedding(embeddings, combined)

def analogy(embeddings: Dict[str, npt.NDArray], w1: str, w2:str, w11:str) -> str:
    combined = embeddings[w1] - embeddings[w2] + embeddings[w11]
    return nearestNeighborsByEmbedding(embeddings, combined)[0]
    

In [13]:
nearestNeighborsGlove(embeddings, "the")

['which', 'part', 'of', 'in']

In [14]:
nearestNeighborsGlove(embeddings, "messi")

['ronaldinho', 'rivaldo', 'ronaldo', 'figo']

In [15]:
nearestNeighborsGlove(embeddings, "barcelona", k=10)

['madrid',
 'valencia',
 'porto',
 'monaco',
 'marseille',
 'milan',
 'sevilla',
 'juventus',
 'villarreal']

In [16]:
nearestNeighborsGlove(embeddings, "england", k=10)

['scotland',
 'wales',
 'ireland',
 'newcastle',
 'australia',
 'manchester',
 'zealand',
 'indies',
 'scottish']

In [17]:
riddle(embeddings, ["messi", "barcelona", "ronaldo"])

['ronaldo', 'ronaldinho', 'messi', 'striker', 'barcelona']

In [18]:
analogy(embeddings, "king", "man", "woman")

'king'

In [19]:
analogy(embeddings, "twig", "branch", "hand")

'fingernails'

In [20]:
tsne = TSNE(n_components=2, random_state=0)

In [25]:
words =  list(embeddings.keys())
# words = ["sister", "brother", "man", "woman", "uncle", "aunt"]
vectors = np.asarray([embeddings[word] for word in words])
Y = tsne.fit_transform(vectors[:1000])
# Y = tsne.fit_transform(vectors)




In [27]:
%matplotlib auto
for label, x, y in zip(words, Y[:100, 0], Y[:100, 1]):
    plt.scatter(x,y)
    plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords="offset points")
plt.show()

Using matplotlib backend: TkAgg
