In [0]:
import numpy as np
import scipy
from scipy.stats import *

In [26]:
# Mount Google Drive at '/content/gdrive' so the data files referenced by the
# path constants in this notebook (which live under that mount point) can be read.
# NOTE(review): `google.colab` presumably only exists inside a Colab runtime —
# confirm before running this notebook elsewhere.

from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
'''
The annotated_pairs data set is from the following paper:
Lev Finkelstein, Evgeniy Gabrilovich, Yossi Matias, Ehud Rivlin, Zach Solan, Gadi Wolfman, and Eytan Ruppin. 2001. Placing search in context: the concept revisited. In Proc. of WWW.
'''

# Pre-trained embeddings file read by loadData(): one entry per line, fields
# separated by single spaces, first field is the word and the rest are floats.
embeddings_source = '/content/gdrive/My Drive/NLP Class/WGL/embeddings/word2vec_original_15k_300d_train.txt'
# Human similarity judgements read by loadTestData(): tab-separated, first line
# is skipped as a header, columns are word1, word2, score.
annotated_pairs = '/content/gdrive/My Drive/NLP Class/WGL/human_sim.txt'

In [0]:
##### 1) loading the data

def loadData(path=None):
    """Load pre-trained word embeddings into a dictionary.

    Each non-blank line of the file is expected to look like
    ``word v1 v2 ... vn`` with single spaces between fields.

    Args:
        path: Optional path to the embeddings file. When omitted, the
            module-level ``embeddings_source`` is used, so existing
            ``loadData()`` calls keep working unchanged.

    Returns:
        dict mapping each word (str) to a 1-D ``numpy`` array of floats.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component cannot be parsed as a float.
    """
    if path is None:
        path = embeddings_source

    ret = {}
    # The context manager guarantees the handle is closed; iterating the handle
    # streams the file line by line instead of holding all of it in memory.
    with open(path) as handle:
        for row in handle:
            # Split on a literal space (not arbitrary whitespace) so tokens are
            # delimited exactly as in the original parser.
            fields = row.strip().split(' ')
            word = fields[0]
            if not word:
                # Blank line: skip it rather than storing an empty-string key.
                continue
            ret[word] = np.array([float(val) for val in fields[1:]])
    return ret

def loadTestData(path=None):
  """Load the human-annotated word pairs and their similarity scores.

  The file is tab-separated; its first line is treated as a header and
  skipped. Every following non-blank line supplies ``word1``, ``word2`` and a
  numeric score in its first three columns.

  Args:
    path: Optional path to the annotation file. When omitted, the
      module-level ``annotated_pairs`` is used, so existing
      ``loadTestData()`` calls keep working unchanged.

  Returns:
    dict with two parallel lists:
      'words'      -- list of ``[word1, word2]`` pairs
      'sim_scores' -- list of float scores, one per pair

  Raises:
    OSError: if the file cannot be opened.
    IndexError / ValueError: if a data line lacks a parsable third column.
  """
  if path is None:
    path = annotated_pairs

  words = []
  sim_scores = []
  # Single pass over the file; the context manager closes the handle.
  with open(path) as handle:
    next(handle, None)  # skip the header line (no-op on an empty file)
    for row in handle:
      stripped = row.strip()
      if not stripped:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
      fields = stripped.split('\t')
      words.append(fields[0:2])
      sim_scores.append(float(fields[2]))

  data = {}
  data['words'] = words
  data['sim_scores'] = sim_scores
  return data

In [0]:
##### 2) getting similarity scores for word embeddings

def getSimilarity(e1, e2):
    """Return the cosine of the angle between two embedding vectors.

    Computed as the sum of the element-wise product of ``e1`` and ``e2``
    divided by the product of their Euclidean norms.
    """
    dot_product = np.sum(e1 * e2)
    norm_e1 = np.sqrt(np.sum(e1 * e1))
    norm_e2 = np.sqrt(np.sum(e2 * e2))
    return dot_product / (norm_e1 * norm_e2)
 
def getSimilarityScoreForWords(w1,w2):
  """Cosine similarity between the embeddings of two words.

  Looks both words up in the module-level ``embeddings`` dictionary.
  Returns the sentinel ``-1`` when either word has no embedding; otherwise
  returns ``getSimilarity`` applied to the two vectors.
  """
  both_known = (w1 in embeddings) and (w2 in embeddings)
  if not both_known:
    return -1
  return getSimilarity(embeddings[w1], embeddings[w2])

In [0]:
##### 3) computing correlation between human-annotated scores and cosine similarities for word embeddings

def evaluate():
  """Correlate embedding cosine similarities with human similarity scores.

  Loads the embeddings (stored in the module-level ``embeddings`` so that
  ``getSimilarityScoreForWords`` can read them) and the annotated word pairs,
  scores every pair whose two words both have an embedding, and prints the
  Spearman rank correlation between the cosine similarities and the human
  scores.

  Prints three lines: the number of word pairs, the number of scores, and a
  summary ``total valid coefficient p-value``. Returns None.
  """
  global embeddings
  embeddings = loadData()
  data = loadTestData()
  print("words = ", len(data['words']))
  print("scores = ", len(data['sim_scores']))

  # Build rows of (cosine similarity, human score). Pairs are filtered by
  # vocabulary membership rather than by comparing the score to the -1
  # "missing" sentinel, because -1 is also a legitimate cosine value and such
  # pairs must not be discarded.
  scored_pairs = [
    (getSimilarityScoreForWords(w1, w2), human_score)
    for (w1, w2), human_score in zip(data['words'], data['sim_scores'])
    if (w1 in embeddings) and (w2 in embeddings)
  ]
  pred_scores = np.array(scored_pairs)

  if len(scored_pairs) < 2:
    # A rank correlation is undefined for fewer than two observations, and an
    # empty array cannot be column-sliced; report NaN instead of crashing.
    spearman_rank_coeff, p_value = float('nan'), float('nan')
  else:
    # spearmanr returns (correlation coefficient, p-value).
    spearman_rank_coeff, p_value = spearmanr(pred_scores[:,0], pred_scores[:,1])

  # NOTE: the value printed under the "sp_rho" label is the p-value; the label
  # text is kept unchanged so the output format stays the same.
  print("total, valid, spearman_rank_coeff, sp_rho", len(data['words']),len(pred_scores), spearman_rank_coeff, p_value)

In [47]:
evaluate()

words =  353
scores =  353
total, valid, spearman_rank_coeff, sp_rho 353 292 0.6842646500002515 1.161001967489536e-41
