The goal is to check that the vector result of *king - man + woman* is close to *queen* vector

## Try with a spaCy pretrained embedding

In [18]:
import spacy
import spacy.cli
from scipy import spatial
# we dowload a nlp english model (with a pre-trained 300-dimension embedding) 
spacy.cli.download("en_core_web_md")
nlp = spacy.load('en_core_web_md')

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m


[33mDEPRECATION: nb-black 1.0.7 has a non-standard dependency specifier black>='19.3'; python_version >= "3.6". pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of nb-black or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


spaCy allows to compute directly a pre-trained 300-dimension embedding for every word


In [19]:
king = nlp.vocab['king']
king.vector

array([-1.1296e-01, -4.1865e+00, -1.8453e+00,  3.0781e-01,  2.4956e+00,
        9.6267e-01, -1.8161e+00,  4.4655e+00, -2.8210e+00,  9.7090e-01,
        1.3542e+01,  4.3195e-01, -5.3098e+00,  4.7098e+00,  2.9030e+00,
        1.5588e+00,  6.0064e+00, -3.0345e+00,  1.0626e+00, -7.7197e-01,
       -5.4771e+00, -9.7380e-01, -4.4345e+00,  5.8367e+00,  2.4302e+00,
       -3.9408e+00, -9.1862e-01, -4.9124e+00,  1.4591e+00, -7.2772e-01,
        3.4957e+00, -4.0077e+00, -1.8354e+00, -4.1052e+00,  4.9211e+00,
       -9.7053e-01,  1.9223e+00,  5.2605e+00,  1.6086e+00,  7.1328e-01,
       -1.2146e+00, -1.9869e+00,  8.0265e-01,  2.9298e+00,  7.2985e-01,
       -6.2892e-01, -1.7082e+00,  1.9893e+00,  4.7529e-01,  3.2264e+00,
       -3.9215e+00,  4.6556e+00,  1.3475e+00, -1.0979e+00, -3.0365e+00,
        1.5815e+00,  2.2835e+00, -4.0616e+00,  2.5730e+00,  4.0618e+00,
        9.5438e-01, -6.2563e+00,  5.6463e+00, -3.8933e+00,  4.4076e+00,
        2.0517e+00, -6.6906e+00, -6.9448e+00,  6.0371e+00,  9.30

In [20]:
king.vector.shape

(300,)

In [36]:
# Question 1: Compute the vector "king - man + woman" and try to show that the result is close to the vector representation of the word "queen" ;
# a good way to do it is, for example, to find the 10 closest word (among the nlp.vocab words) from the results of "king - man + woman" and to show
# that "queen" is one of them (if not the best)

# The distance we need for that is the cosine similarity, it can be define from the spatial.distance.cosine function imported from the scipy library
cosine_similarity = lambda x, y: 1 - spatial.distance.cosine(x, y)

# Start the exercice here
# Hint: use a loop on nlp.vocab (all the words defined in spaCy vocabulary) ; for each "word" in the vocabulary you can check if the word has an embedding vector ("word.has_vector"), if the word is in
# lower case ("word.is_lower") and is alphanumeric ("word.is_alpha"). Try to consider only the relevant words for the exercice
king = nlp("king")
man = nlp("man")
woman = nlp("woman")
queen = king.vector - man.vector + woman.vector
closest_words = []
for word in nlp.vocab:
    if word.has_vector and word.is_lower and word.is_alpha:
        similarity = cosine_similarity(queen, word.vector)
        closest_words.append((word.text, similarity))


closest_words = sorted(closest_words, key=lambda x: -x[1])

for i, (word, similarity) in enumerate(closest_words[:10]):
    if word == "queen":
        print(f"'queen' is one of the 10 closest words at position {i+1}")
        break
print('queen is not one of the 10 closest words at position')


queen is not one of the 10 closest words at position


## Try with a pretrained Word2Vec embedding model

**Important** To prevent RAM crash in the execution environment, please restart from here the running environment (Execution -> Restart the running environment)

In [22]:
import gensim# Load pretrained vectors from Google
from gensim.models import KeyedVectors

We load the pre-trained glove vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased embedding models (100-dimension embedding)

In [23]:
import gensim.downloader as api
word_vectors = api.load("glove-wiki-gigaword-100")

In [24]:
king = word_vectors['king']

print(king)

[-0.32307  -0.87616   0.21977   0.25268   0.22976   0.7388   -0.37954
 -0.35307  -0.84369  -1.1113   -0.30266   0.33178  -0.25113   0.30448
 -0.077491 -0.89815   0.092496 -1.1407   -0.58324   0.66869  -0.23122
 -0.95855   0.28262  -0.078848  0.75315   0.26584   0.3422   -0.33949
  0.95608   0.065641  0.45747   0.39835   0.57965   0.39267  -0.21851
  0.58795  -0.55999   0.63368  -0.043983 -0.68731  -0.37841   0.38026
  0.61641  -0.88269  -0.12346  -0.37928  -0.38318   0.23868   0.6685
 -0.43321  -0.11065   0.081723  1.1569    0.78958  -0.21223  -2.3211
 -0.67806   0.44561   0.65707   0.1045    0.46217   0.19912   0.25802
  0.057194  0.53443  -0.43133  -0.34311   0.59789  -0.58417   0.068995
  0.23944  -0.85181   0.30379  -0.34177  -0.25746  -0.031101 -0.16285
  0.45169  -0.91627   0.64521   0.73281  -0.22752   0.30226   0.044801
 -0.83741   0.55006  -0.52506  -1.7357    0.4751   -0.70487   0.056939
 -0.7132    0.089623  0.41394  -1.3363   -0.61915  -0.33089  -0.52881
  0.16483  -0.98878

In [25]:
king.shape

(100,)

In [35]:
# Question 2: This time with the GoogleNews embedding model, try to show once again that "king - man + woman" is close to the vector representation of the word "queen" ;
# Hint: There is a pre-defined function in the gensim "word_vectors" object (define just above) that allows to get this result quite easily

cosine_similarity = lambda x, y: 1 - spatial.distance.cosine(x, y)

king = word_vectors['king']
man = word_vectors['man']
woman = word_vectors['woman']
queen = king - man + woman
closest_words = []
for word in word_vectors:
    similarity = cosine_similarity(queen, word)

    closest_words.append((word, similarity))



closest_words = sorted(closest_words, key=lambda x: -x[1])
print(closest_words[:10])
# for i, (word, similarity) in enumerate(closest_words[:10]):
#     if word == "queen":
#         print(f"'queen' is one of the 10 closest words at position {i+1}")
#         break
# print('queen is not one of the 10 closest words at position')

KeyboardInterrupt: 

## Try with fastText embedding

**Important** To prevent RAM crash in the execution environment, please restart from here the running environment (Execution -> Restart the running environment)

In [None]:
#Download, extract and load Fasttext word embedding model
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
!gunzip /content/cc.en.300.bin.gz
!pip install fasttext

Load the english fastText model

In [None]:
import fasttext 

model = fasttext.load_model("/content/cc.en.300.bin")

In [None]:
model.get_word_vector("king")

It is possible to get directly the nearest neighbors of a specific word (or even n-gram)

In [None]:
model.get_nearest_neighbors("king")

In [None]:
# Question 3: This time with the fastText embedding model, try to show once again that "king - man + woman" is close to the vector representation of the word "queen" ;
# Hint: There is a pre-defined function in the fastText model, 'get_analogies', that allows to get this result quite easily

In [39]:
-1.75/4+0+(-3.75/4)*2

-2.3125

In [41]:
-1/4 - 2.75/4 - 3/2

-2.4375

In [42]:
11/16

0.6875