In [1]:
import numpy as np
import gensim

In [2]:
seed = 42
np.random.seed(seed)

The word analogy task consists of questions like,  “a is to b as c is to?” The dataset contains 19,544 such questions, divided into a semantic subset and a syntactic subset. The semantic questions are typically analogies about people or places, like “Athens is to Greece as Berlin is to?”.  The syntactic questions are typically analogies about verb tenses or forms of adjectives, for example “dance is to dancing as fly is to?”.  To correctly answer the question, the model should uniquely identify the missing term, with only an exact correspondence counted as a correct match. We answer the question “a is to b as c is to?” by finding the word `d` whose representation `w_d` is closest to `w_b − w_a + w_c` according to the cosine similarity.

https://nlp.stanford.edu/pubs/glove.pdf

#1 Taking a quick glance at our test set.

In [5]:
x = []
with open("data/datasets/questions-words.txt", "r") as fp:
    x = fp.read()

In [9]:
x.split('\n')

[': capital-common-countries',
 'Athens Greece Baghdad Iraq',
 'Athens Greece Bangkok Thailand',
 'Athens Greece Beijing China',
 'Athens Greece Berlin Germany',
 'Athens Greece Bern Switzerland',
 'Athens Greece Cairo Egypt',
 'Athens Greece Canberra Australia',
 'Athens Greece Hanoi Vietnam',
 'Athens Greece Havana Cuba',
 'Athens Greece Helsinki Finland',
 'Athens Greece Islamabad Pakistan',
 'Athens Greece Kabul Afghanistan',
 'Athens Greece London England',
 'Athens Greece Madrid Spain',
 'Athens Greece Moscow Russia',
 'Athens Greece Oslo Norway',
 'Athens Greece Ottawa Canada',
 'Athens Greece Paris France',
 'Athens Greece Rome Italy',
 'Athens Greece Stockholm Sweden',
 'Athens Greece Tehran Iran',
 'Athens Greece Tokyo Japan',
 'Baghdad Iraq Bangkok Thailand',
 'Baghdad Iraq Beijing China',
 'Baghdad Iraq Berlin Germany',
 'Baghdad Iraq Bern Switzerland',
 'Baghdad Iraq Cairo Egypt',
 'Baghdad Iraq Canberra Australia',
 'Baghdad Iraq Hanoi Vietnam',
 'Baghdad Iraq Havana Cuba

#2 Load up our untouched embeddings and see the results of Word Analogy

Reproduce the results from the original paper here to make sure.
https://nlp.stanford.edu/pubs/glove.pdf

    GloVe 300D 6B: Semantic: 77.4 Syntactic: 67.0 Total: 71.7

In [3]:
# Convert GloVe Vectors into a format usable by gensim
gensim.scripts.glove2word2vec.glove2word2vec(
    "data/embeddings/glove.6B.300d.txt", "data/embeddings/glove.gensimFormat.6B.300d.txt"
)

(400000, 300)

In [4]:
glove = gensim.models.keyedvectors.Word2VecKeyedVectors.load_word2vec_format(
    "data/embeddings/glove.gensimFormat.6B.300d.txt"
)

In [11]:
results = glove.evaluate_word_analogies("data/embeddings/questions-words.txt", restrict_vocab=400000)

In [32]:
syn_correct = 0
syn_incorrect = 0
sem_correct = 0
sem_incorrect = 0
for item in results[1]:
    correct = len(item['correct'])
    incorrect = len(item['incorrect'])
    if item['section'] == "Total accuracy":
        print(f"{item['section']}: {correct * 100/(correct+incorrect)}")
    elif item['section'][:4] == 'gram':
        sem_correct += correct
        sem_incorrect += incorrect
    else:
        syn_correct += correct
        syn_incorrect += incorrect
print(f"Semantic Accuracy: {sem_correct * 100/(sem_correct+sem_incorrect)}")
print(f"Syntactic Accuracy: {syn_correct * 100/(syn_correct+syn_incorrect)}")

Total accuracy: 71.73557101923863
Semantic Accuracy: 66.9976580796253
Syntactic Accuracy: 77.43826812492954


#3 Create our clipped embeddings, save it and see the results of Word Analogy

In [68]:
import re

In [67]:
clipped = []
with open("data/embeddings/glove.6B.300d.txt", "r", encoding='utf-8') as fp:
    for line in fp:
        line = line.split()
        word = line[0]
        vector = np.asarray(line[1:], 'float32').clip(min=-1.0, max=1.0)
        vector = re.split('\s+', str(vector)[1:-1].strip().replace('\n', ' '))
        if len(vector) != 300:
            print("BREAK")
            break
        final = " ".join([word] + vector)
        clipped.append(final)

In [71]:
with open("data/embeddings/glove.Clipped.6B.300d.txt", "w", encoding='utf-8') as fp:
    fp.write("\n".join(clipped))

In [72]:
# Convert GloVe Vectors into a format usable by gensim
gensim.scripts.glove2word2vec.glove2word2vec(
    "data/embeddings/glove.Clipped.6B.300d.txt", "data/embeddings/glove.Clipped.gensimFormat.6B.300d.txt"
)

(400000, 300)

In [73]:
glove = gensim.models.keyedvectors.Word2VecKeyedVectors.load_word2vec_format(
    "data/embeddings/glove.Clipped.gensimFormat.6B.300d.txt"
)

In [76]:
results2 = glove.evaluate_word_analogies("data/datasets//questions-words.txt", restrict_vocab=400000)

In [77]:
syn_correct = 0
syn_incorrect = 0
sem_correct = 0
sem_incorrect = 0
for item in results2[1]:
    correct = len(item['correct'])
    incorrect = len(item['incorrect'])
    if item['section'] == "Total accuracy":
        print(f"{item['section']}: {correct * 100/(correct+incorrect)}")
    elif item['section'][:4] == 'gram':
        sem_correct += correct
        sem_incorrect += incorrect
    else:
        syn_correct += correct
        syn_incorrect += incorrect
print(f"Semantic Accuracy: {sem_correct * 100/(sem_correct+sem_incorrect)}")
print(f"Syntactic Accuracy: {syn_correct * 100/(syn_correct+syn_incorrect)}")

Total accuracy: 71.57183790421612
Semantic Accuracy: 66.92271662763466
Syntactic Accuracy: 77.16766264516856
