# Evaluation with word analogy using tools from benchmarks repository

In [1]:
from scipy.stats import spearmanr
from scipy import spatial
import numpy as np
import logging
from six import iteritems
import pandas as pd
from itertools import chain
import argparse
import os

In [2]:
# Path of the embedding array stored in NumPy .npy format (loaded with np.load).
embed_path = 'E:/Word2vec/embeddings_wiki.npy'
# Path of the vocabulary text file; it is read line by line, one entry per line.
vocab_path = 'E:/Word2vec/vocab_300.txt'

In [3]:
# Read the vocabulary file: one entry per line, with surrounding whitespace
# (including the trailing newline) stripped from every entry.
with open(vocab_path, encoding="utf8") as f:
    vocab = [line.strip() for line in f]

In [4]:
def lookup_table(word):
    """Return the entry of `embed` stored at the position `word` has in `vocab`.

    Both `vocab` and `embed` are module-level globals. `list.index` raises
    ValueError when `word` is not part of the vocabulary.
    """
    position = vocab.index(word)
    return embed[position]

In [5]:
# Load the embedding array from disk; lookup_table indexes it by a word's position in vocab.
embed = np.load(embed_path)

In [6]:
from six import string_types, text_type

In [7]:
# Characters to delete when cleaning words: every non-alphanumeric character
# among the first 256 code points, except tab, space, '-' and '_'
# ('_' is kept because word2vec joins phrases with it).
# The same translation table works for unicode strings as well.
_delchars = ''.join(
    ch for ch in map(chr, range(256))
    if not ch.isalnum() and ch not in ('\t', ' ', '-', '_')
)
# Mapping code point -> None, the form str.translate expects for deletion.
_delchars_table = {ord(ch): None for ch in _delchars}

In [8]:
def standardize_string(s, clean_words=True, lower=True, language="english"):
    """
    Ensures common convention across code. Decodes UTF-8 bytes to text,
    optionally lowercases and optionally strips unwanted characters.

    Parameters
    ----------
    s: str or bytes. Bytes are decoded as UTF-8 before any other processing.
    clean_words: if True will delete every character listed in the module-level
      `_delchars_table` (for instance '$' or '#').
    lower: if True will lower the string.
    language: only "english" is now supported.

    Returns
    -------
    string: processed text string

    Raises
    ------
    AssertionError: if `s` is neither str nor bytes.
    NotImplementedError: if `language` is not "english".
    """

    # Accept bytes explicitly: with a text-only check the decode step below
    # could never be reached on Python 3.
    assert isinstance(s, (str, bytes))

    if isinstance(s, bytes):
        s = s.decode("utf-8")

    if language != "english":
        raise NotImplementedError("Not implemented standarization for other languages")

    if lower:
        s = s.lower()
    if clean_words:
        s = s.translate(_delchars_table)
    return s

In [9]:
class Bunch(dict):
    """Dictionary whose items are also reachable as attributes.

    Accepts the same constructor arguments as `dict`.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Use the mapping itself as the instance namespace, so `b.key` and
        # `b["key"]` read and write the same storage.
        self.__dict__ = self

In [10]:
def fetch_google_analogy(path='E:/Word2vec/questions-words.txt'):
    """
    Load the Google analogy questions from a text file.

    Lines starting with ":" name the category of the questions that follow;
    every other non-blank line is standardized with `standardize_string` and
    split into words: the first three form the question, the fourth the answer.

    Parameters
    ----------
    path: location of the questions file. Defaults to the path this file
      has always used.

    Returns
    -------
    Bunch with object-dtype arrays:
      X: question words, one row of three words per question
      y: answer word per question
      category: category name per question
      category_high_level: "syntactic" for categories starting with "gram",
        "semantic" otherwise

    Raises
    ------
    AssertionError: if the set of categories found differs from the expected one.
    """
    # Explicit encoding, consistent with how the vocabulary file is opened,
    # instead of the platform-dependent default.
    with open(path, "r", encoding="utf8") as f:
        lines = f.read().splitlines()

    # Simple 4 word analogy questions with categories
    questions = []
    answers = []
    category = []
    cat = None
    for line in lines:
        if not line.strip():
            # A blank line carries no question; indexing its words would fail.
            continue
        if line.startswith(":"):
            cat = line.lower().split()[1]
        else:
            words = standardize_string(line).split()
            questions.append(words[0:3])
            answers.append(words[3])
            category.append(cat)

    assert set(category) == {'gram3-comparative', 'gram8-plural', 'capital-common-countries',
                             'city-in-state', 'family', 'gram9-plural-verbs', 'gram2-opposite',
                             'currency', 'gram4-superlative', 'gram6-nationality-adjective',
                             'gram7-past-tense',
                             'gram5-present-participle', 'capital-world', 'gram1-adjective-to-adverb'}

    syntactic = {c for c in set(category) if c.startswith("gram")}
    category_high_level = ["syntactic" if c in syntactic else "semantic" for c in category]

    # dtype=object for memory efficiency
    return Bunch(X=np.vstack(questions).astype("object"),
                 y=np.hstack(answers).astype("object"),
                 category=np.hstack(category).astype("object"),
                 category_high_level=np.hstack(category_high_level).astype("object"))


In [11]:
# Fetch analogy dataset
data = fetch_google_analogy()

# Word -> embedding mapping, used for vocabulary membership tests and for the
# nearest-neighbour scan below.
word_embed = dict(zip(vocab, embed))

print("----------ANALOGY----------")

# Pick a sample of data and calculate answers
guessed = 0
subset = list(chain(range(50, 70), range(1000, 1020), range(4000, 4020), range(10000, 10020),
                      range(14000, 14020)))
for idx in subset:
    w1, w2, w3 = data.X[idx][0], data.X[idx][1], data.X[idx][2]
    # Skip questions with out-of-vocabulary words. The dict has exactly the
    # vocabulary as keys, so this is the same test as `in vocab` without the
    # linear scan of the list.
    if w1 not in word_embed or w2 not in word_embed or w3 not in word_embed:
        continue
    print("Question: {} is to {} as {} is to ?".format(w1, w2, w3))

    print("Answer: " + data.y[idx])
    s = lookup_table(w2) - lookup_table(w1) + lookup_table(w3)
    # Start below any possible similarity so the best candidate wins even
    # when every cosine similarity is negative.
    best_match = -np.inf
    best_word = None

    # Remember the best word itself rather than its position in the dict:
    # dict positions differ from indices into `vocab` whenever the vocabulary
    # contains duplicate entries, which would report the wrong word.
    for w, e in word_embed.items():
        if w == w1 or w == w2 or w == w3:
            continue
        cosine_sim = 1 - spatial.distance.cosine(s, e)
        if cosine_sim >= best_match:
            best_match = cosine_sim
            best_word = w

    print("Predicted: ", best_word)
    if best_word == data.y[idx]:
        guessed += 1

# The denominator is the whole sample: skipped questions count as not answered.
print("Questions correctly answered: {} / {}".format(guessed, len(subset)))

----------ANALOGY----------
Question: bangkok is to thailand as havana is to ?
Answer: cuba
Predicted:  plans
Question: bangkok is to thailand as helsinki is to ?
Answer: finland
Predicted:  throughout
Question: bangkok is to thailand as islamabad is to ?
Answer: pakistan
Predicted:  critical
Question: bangkok is to thailand as kabul is to ?
Answer: afghanistan
Predicted:  part
Question: bangkok is to thailand as london is to ?
Answer: england
Predicted:  australia
Question: bangkok is to thailand as madrid is to ?
Answer: spain
Predicted:  spain
Question: bangkok is to thailand as moscow is to ?
Answer: russia
Predicted:  post
Question: bangkok is to thailand as oslo is to ?
Answer: norway
Predicted:  critical
Question: bangkok is to thailand as ottawa is to ?
Answer: canada
Predicted:  australia
Question: bangkok is to thailand as paris is to ?
Answer: france
Predicted:  france
Question: bangkok is to thailand as rome is to ?
Answer: italy
Predicted:  italy
Question: bangkok is to th