In [1]:
import numpy as np
import gensim

In [2]:
seed = 42
np.random.seed(seed)

# Parameters

In [3]:
experiment_name = "pca_v1"
test_data_path = "data/datasets/questions-words.txt"

In [4]:
input_file = f"data/embeddings/trained/{experiment_name}.glove.6B.300d.txt"
converted_file = f"data/embeddings/trained/{experiment_name}.glove.gensimFormat.6B.300d.txt"

The word analogy task consists of questions like,  “a is to b as c is to?” The dataset contains 19,544 such questions, divided into a semantic subset and a syntactic subset. The semantic questions are typically analogies about people or places, like “Athens is to Greece as Berlin is to?”.  The syntactic questions are typically analogies about verb tenses or forms of adjectives, for example “dance is to dancing as fly is to?”.  To correctly answer the question, the model should uniquely identify the missing term, with only an exact correspondence counted as a correct match. We answer the question “a is to b as c is to?” by finding the word `d` whose representation `w_d` is closest to `w_b − w_a + w_c` according to the cosine similarity.

https://nlp.stanford.edu/pubs/glove.pdf

# Word Analogy Score Calculation

#1 Taking a quick glance at our test set.

In [5]:
x = []
with open("data/datasets/questions-words.txt", "r") as fp:
    x = fp.read()

In [6]:
x.split('\n')[:10]

[': capital-common-countries',
 'Athens Greece Baghdad Iraq',
 'Athens Greece Bangkok Thailand',
 'Athens Greece Beijing China',
 'Athens Greece Berlin Germany',
 'Athens Greece Bern Switzerland',
 'Athens Greece Cairo Egypt',
 'Athens Greece Canberra Australia',
 'Athens Greece Hanoi Vietnam',
 'Athens Greece Havana Cuba']

In [7]:
del x

#2 Convert data into the format used by Gensim. [We use Gensim because it has a built-in function to calculate word analogy scores]

In [8]:
# Convert GloVe Vectors into a format usable by gensim
gensim.scripts.glove2word2vec.glove2word2vec(input_file, converted_file)

(400000, 150)

#3 Load the formatted data & calculate scores

In [9]:
glove = gensim.models.keyedvectors.Word2VecKeyedVectors.load_word2vec_format(converted_file)

In [10]:
results = glove.evaluate_word_analogies(test_data_path, restrict_vocab=400000)

In [11]:
syn_correct = 0
syn_incorrect = 0
sem_correct = 0
sem_incorrect = 0
for item in results[1]:
    correct = len(item['correct'])
    incorrect = len(item['incorrect'])
    if item['section'] == "Total accuracy":
        print(f"{item['section']}: {correct * 100/(correct+incorrect)}")
    elif item['section'][:4] == 'gram':
        sem_correct += correct
        sem_incorrect += incorrect
    else:
        syn_correct += correct
        syn_incorrect += incorrect
print(f"Semantic Accuracy: {sem_correct * 100/(sem_correct+sem_incorrect)}")
print(f"Syntactic Accuracy: {syn_correct * 100/(syn_correct+syn_incorrect)}")

Total accuracy: 66.82357756856324
Semantic Accuracy: 60.99297423887588
Syntactic Accuracy: 73.84147028977337
