# Evaluation with Spearman Correlation

In [1]:
from scipy.stats import spearmanr
from scipy import spatial
import numpy as np
import logging
from six import iteritems
import pandas as pd
from itertools import chain
import argparse
import os

In [2]:
# Hard-coded absolute input paths for this evaluation run.
# embed_path: .npy file that is loaded with np.load into `embed`.
# vocab_path: UTF-8 text file that is read line by line into `vocab`.
embed_path = 'E:/Word2vec/embeddings_wiki.npy'
vocab_path = 'E:/Word2vec/vocab_300.txt'

In [3]:
# Build the vocabulary list: one entry per line of the vocab file, with
# leading/trailing whitespace (including the newline) stripped.
with open(vocab_path, encoding="utf8") as vocab_file:
    vocab = [line.strip() for line in vocab_file]

In [4]:
def lookup_table(word):
    """Return the entry of ``embed`` that corresponds to ``word``.

    The index used is the position of the first occurrence of ``word`` in the
    module-level ``vocab`` list. ``vocab.index`` raises ``ValueError`` when
    the word is not present, so callers should check membership first.
    """
    row = vocab.index(word)
    return embed[row]

In [5]:
# Load the embedding array from disk; lookup_table() indexes it by a word's
# position in `vocab`.
embed = np.load(embed_path)

In [6]:
def _read_as_pd(file):
    """Parse a CSV source into a pandas DataFrame.

    ``file`` is forwarded unchanged to ``pd.read_csv`` with pandas' default
    parsing options, and the resulting DataFrame is returned.
    """
    frame = pd.read_csv(file)
    return frame

In [7]:
# Load the word-pair CSV. Later cells take columns 0-1 as the word pair and
# column 2 as the numeric score.
data = _read_as_pd('E:/Word2vec/combined.csv')

In [8]:
# First two columns of every row: the (word1, word2) pair to compare.
X = data.values[:, 0:2]

In [9]:
# Third column of every row: the reference score for the pair, as floats.
# `np.float` was only an alias for the builtin `float`; it was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, so the builtin is used directly.
y = data.values[:, 2].astype(float)

In [10]:
# Compare embedding-based similarities with the reference scores in `y`.
#
# For every word pair in `X` whose words are both in the vocabulary, two
# similarity values are computed from the pair's embedding vectors:
#   * |Spearman rank correlation| between the two vectors, and
#   * cosine similarity of the two vectors.
# Each is compared with the reference score divided by 10, and the absolute
# differences are averaged over the evaluated pairs.
#
# NOTE(review): the reported numbers are `1 - mean absolute deviation`
# (higher is better) although the printed labels say "error".
# NOTE(review): spearmanr is applied to the components of the two embedding
# vectors, not to model-vs-reference scores across pairs as in the usual
# word-similarity benchmark protocol. The computation is kept as-is so the
# results stay comparable with earlier runs; confirm whether this is intended.
spearman_errors = []
cosine_errors = []
name = 'WS353'
print("----------SIMILARITY----------")

# Set membership is O(1); testing against the `vocab` list directly would
# scan the whole vocabulary for every word.
known_words = set(vocab)

spearman_err = 0
cosine_err = 0
analogies = 0  # number of pairs actually evaluated
for (word1, word2), reference in zip(X, y):
    # Skip pairs that contain an out-of-vocabulary word.
    if word1 not in known_words or word2 not in known_words:
        continue

    # Fetch each embedding once and reuse it for both measures.
    vec1 = lookup_table(word1)
    vec2 = lookup_table(word2)
    # Reference score divided by 10 (presumably a 0-10 scale; confirm
    # against the CSV).
    target = reference / 10

    # Only the magnitude of the rank correlation is used.
    spearman_corr, _ = spearmanr(vec1, vec2)
    spearman_corr = abs(spearman_corr)
    spearman_err += abs(spearman_corr - target)

    # scipy returns the cosine *distance*; convert it to a similarity.
    cosine_sim = 1 - spatial.distance.cosine(vec1, vec2)
    cosine_err += abs(cosine_sim - target)

    analogies += 1

if analogies == 0:
    # Fail with an explicit message instead of a bare ZeroDivisionError below.
    raise ValueError(
        "No word pair from the {} dataset has both words in the vocabulary; "
        "nothing to evaluate.".format(name)
    )

spearman_err = 1 - spearman_err / analogies
cosine_err = 1 - cosine_err / analogies
spearman_errors.append(spearman_err)
cosine_errors.append(cosine_err)
print("Spearman correlation error on {} dataset: {}".format(name, spearman_err))
print("Cosine similarity error on {} dataset: {}".format(name, cosine_err))

----------SIMILARITY----------
Spearman correlation error on WS353 dataset: 0.7405678952179118
Cosine similarity error on WS353 dataset: 0.7430807452890412
