In [1]:
import pickle

import numpy as np
import matplotlib.pyplot as plt

In [9]:
# Row-wise normalization applied to the embedding matrices
def normalize(X):
	"""Length-normalize, mean-center, then length-normalize the rows of X.

	Parameters
	----------
	X : 2-D array-like
		One vector per row.

	Returns
	-------
	numpy.ndarray
		Array of the same shape as X. Each row has unit L2 norm, except rows
		whose norm is exactly zero (before or after centering); those are
		returned as all zeros instead of NaN.
	"""
	def _unit_rows(M):
		# Scale every row to unit L2 norm. A zero norm is replaced by 1 so
		# that an all-zero row stays zero rather than becoming 0/0 = NaN,
		# which would otherwise contaminate the column means for every row.
		norms = np.linalg.norm(M, axis=1, keepdims=True)
		norms[norms == 0] = 1.0
		return M / norms

	# length normalization
	x = _unit_rows(np.asarray(X))
	# mean centering each dimension
	x = x - x.mean(axis=0)
	# length normalizing the mean centered data
	return _unit_rows(x)

# Loading the Data

To run the evaluation below, we need:
1. The Hindi and English embeddings
2. The vocabularies used to generate those embeddings
3. A Hindi–English dictionary (translation word pairs)

In [22]:
def create_idx2word_dict(vocab):
	"""Invert a word -> index mapping into an index -> word mapping.

	If several words share the same index, the word encountered last while
	iterating over ``vocab`` is the one kept.
	"""
	return {vocab[word]: word for word in vocab}

# Building Word Pairs
# Each line of the dictionary file is "<english>\t<hindi>". The file is opened
# explicitly as UTF-8 because it contains Devanagari text and the platform
# default encoding is not UTF-8 everywhere.
word_pairs = []
with open('../data/en-hi.txt', encoding='utf-8') as f:
    for line_no, line in enumerate(f, start=1):
        # Strip only the line terminator (not an arbitrary last character), so
        # a final line without a trailing newline keeps its last letter.
        ls = line.rstrip('\n').split('\t')
        if len(ls) < 2:
            raise ValueError(
                "../data/en-hi.txt:{}: expected '<english>\\t<hindi>', got {!r}".format(line_no, line)
            )
        word_pairs.append(ls)
word_pairs = np.array(word_pairs)


# Building vocab-index maps
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# Context managers make sure the file handles are closed after loading.
with open("./hindi_vocab.pkl", "rb") as f:
    hindi_vocab = pickle.load(f)
with open("./english_vocab.pkl", "rb") as f:
    english_vocab = pickle.load(f)
hindi_idx2word = create_idx2word_dict(hindi_vocab)
english_idx2word = create_idx2word_dict(english_vocab)

# English embeddings. The commented-out lines select the alternative embedding
# files (with/without CSLS, with/without improvement); enable exactly one.
# with open('ywy_without_csls_without_improvement.pkl', 'rb') as f:
# with open('ywy_with_csls_without_improvement.pkl', 'rb') as f:
# with open('ywy_without_csls_with_improvement.pkl', 'rb') as f:
with open('ywy_with_csls_with_improvement.pkl', 'rb') as f:
    embed_en = pickle.load(f)

# Hindi embeddings; keep the selected variant in sync with the English one above.
# with open('xwx_without_csls_without_improvement.pkl', 'rb') as f:
# with open('xwx_with_csls_without_improvement.pkl', 'rb') as f:
# with open('xwx_without_csls_with_improvement.pkl', 'rb') as f:
with open('xwx_with_csls_with_improvement.pkl', 'rb') as f:
    embed_hi = pickle.load(f)

# Row-normalize both embedding matrices before they are compared.
embed_en = normalize(embed_en)
embed_hi = normalize(embed_hi)

In [3]:
# Preview the loaded translation pairs (column 0: English, column 1: Hindi).
word_pairs

array([['and', 'और'],
       ['was', 'था'],
       ['was', 'थी'],
       ...,
       ['pattabhi', 'पट्टाभि'],
       ['golmud', 'गोलमुद'],
       ['folliculitis', 'folliculitis']], dtype='<U27')

# Intrinsic Measures

## Translation Similarity test
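Measures the average absolute cosine similarity between the English and Hindi embeddings of each translation word pair. Pairs with a word missing from either vocabulary are skipped; the fraction of pairs actually scored is reported as coverage.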

In [23]:
# Absolute dot product of the English and Hindi embedding rows for every
# dictionary pair whose two words are both present in the vocabularies.
# NOTE(review): abs() makes a strongly negative similarity count the same as a
# strongly positive one — confirm this is intended for this score.
cos_all = []
for pair in word_pairs:
    en_word, hi_word = pair[0], pair[1]

    # Only pairs covered by both vocabularies can be scored.
    if en_word in english_vocab and hi_word in hindi_vocab:
        cos_all.append(abs(np.dot(
            embed_en[english_vocab[en_word]],
            embed_hi[hindi_vocab[hi_word]],
        )))

cos_all = np.array(cos_all)

# Guard the degenerate cases: an empty dictionary would raise
# ZeroDivisionError, and an empty score list would make np.mean warn.
coverage = len(cos_all) / len(word_pairs) if len(word_pairs) else 0.0
mean_similarity = np.mean(cos_all) if len(cos_all) else float('nan')

print("Coverage: ", coverage)
# NOTE: the value labelled "Accuracy" is the mean |similarity| over the
# covered pairs, not a classification accuracy.
print("Accuracy: ", mean_similarity)

Coverage:  0.15125192956751524
Accuracy:  0.4279327308203106


In [8]:
# Sanity check: signed dot product of the embeddings of the first dictionary pair.
# NOTE(review): once embed_en / embed_hi have been passed through normalize(),
# every row has unit length and this value must lie in [-1, 1]. The recorded
# output below is outside that range, so it presumably comes from a run before
# normalization was applied — re-run the notebook top to bottom to confirm.
np.dot(
        embed_en[english_vocab[word_pairs[0][0]]],
        embed_hi[hindi_vocab[word_pairs[0][1]]],
    )

-74.20290503293218