In [1]:
import pdb
import pickle
import string
import matplotlib.pyplot as plt
import nltk
import numpy as np
import scipy
import sklearn
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer

from utils import (cosine_similarity, get_dict,
                   process_tweet)

In [2]:
en_embeddings_subset = pickle.load(open("en_embeddings.p", "rb"))
fr_embeddings_subset = pickle.load(open("fr_embeddings.p", "rb"))

In [3]:
# loading the english to french dictionaries
en_fr_train = get_dict('en-fr.train.txt')
print('The length of the English to French training dictionary is', len(en_fr_train))
en_fr_test = get_dict('en-fr.test.txt')
print('The length of the English to French test dictionary is', len(en_fr_train))

The length of the English to French training dictionary is 5000
The length of the English to French test dictionary is 5000


In [4]:
def get_matrices(en_fr, french_vecs, english_vecs):
    """
    Input:
        en_fr: English to French dictionary
        french_vecs: French words to their corresponding word embeddings.
        english_vecs: English words to their corresponding word embeddings.
    Output: 
        X: a matrix where the columns are the English embeddings.
        Y: a matrix where the columns correspong to the French embeddings.
        R: the projection matrix that minimizes the F norm ||X R -Y||^2.
    """

    # X_l and Y_l are lists of the english and french word embeddings
    X_l = list()
    Y_l = list()

    # get the words (the keys in the dictionary) and store in a set()
    english_set = set(english_vecs.keys())
    french_set = set(french_vecs.keys())

    # store the french words that are part of the english-french dictionary (these are the values of the dictionary)
    french_words = set(en_fr.values())

    # loop through all english, french word pairs in the english french dictionary
    for en_word, fr_word in en_fr.items():
        if fr_word in french_set and en_word in english_set:
            en_vec = english_vecs[en_word]
            fr_vec = french_vecs[fr_word]
            X_l.append(en_vec)
            Y_l.append(fr_vec)

    X = np.vstack(X_l) 
    Y = np.vstack(Y_l) 

    return X, Y


In [13]:
X_train, Y_train = get_matrices(
    en_fr_train, fr_embeddings_subset, en_embeddings_subset)
X_val, Y_val = get_matrices(
    en_fr_test, fr_embeddings_subset, en_embeddings_subset)

In [14]:
print(X_train.shape)
print(Y_train.shape)
print(X_val.shape)
print(Y_val.shape)

(4932, 300)
(4932, 300)
(1438, 300)
(1438, 300)


In [7]:
def compute_loss(X, Y, R):
    '''
    Inputs: 
        X: a matrix of dimension (m,n) where the columns are the English embeddings.
        Y: a matrix of dimension (m,n) where the columns correspong to the French embeddings.
        R: a matrix of dimension (n,n) - transformation matrix from English to French vector space embeddings.
    Outputs:
        L: a matrix of dimension (m,n) - the value of the loss function for given X, Y and R.
    '''
    m = X.shape[0]
    diff = np.dot(X,R) - Y
    diff_squared = np.square(diff)
    sum_diff_squared = np.sum(diff_squared)
    loss = sum_diff_squared/m
    return loss

In [8]:
def compute_gradient(X, Y, R):
    '''
    Inputs: 
        X: a matrix of dimension (m,n) where the columns are the English embeddings.
        Y: a matrix of dimension (m,n) where the columns correspong to the French embeddings.
        R: a matrix of dimension (n,n) - transformation matrix from English to French vector space embeddings.
    Outputs:
        g: a scalar value - gradient of the loss function L for given X, Y and R.
    '''
    m = X.shape[0]
    gradient = 2/m*np.dot(X.T,(np.dot(X,R) - Y))
    return gradient


In [9]:
def align_embeddings(X, Y, train_steps=100, learning_rate=0.0003):
    '''
    Inputs:
        X: a matrix of dimension (m,n) where the columns are the English embeddings.
        Y: a matrix of dimension (m,n) where the columns correspong to the French embeddings.
        train_steps: positive int - describes how many steps will gradient descent algorithm do.
        learning_rate: positive float - describes how big steps will  gradient descent algorithm do.
    Outputs:
        R: a matrix of dimension (n,n) - the projection matrix that minimizes the F norm ||X R -Y||^2
    '''
    R = np.random.rand(X.shape[1], X.shape[1])
    for i in range(train_steps):
        if i % 25 == 0:
            print(f"loss at iteration {i} is: {compute_loss(X, Y, R):.4f}")
        gradient = compute_gradient(X, Y, R)
        R -= learning_rate*gradient
    return R


In [17]:
R_train = align_embeddings(X_train, Y_train, train_steps=500, learning_rate=0.8)

loss at iteration 0 is: 971.0723
loss at iteration 25 is: 97.7551
loss at iteration 50 is: 26.7977
loss at iteration 75 is: 9.7778
loss at iteration 100 is: 4.3753
loss at iteration 125 is: 2.3290
loss at iteration 150 is: 1.4500
loss at iteration 175 is: 1.0359
loss at iteration 200 is: 0.8270
loss at iteration 225 is: 0.7161
loss at iteration 250 is: 0.6546
loss at iteration 275 is: 0.6195
loss at iteration 300 is: 0.5988
loss at iteration 325 is: 0.5863
loss at iteration 350 is: 0.5786
loss at iteration 375 is: 0.5738
loss at iteration 400 is: 0.5707
loss at iteration 425 is: 0.5687
loss at iteration 450 is: 0.5674
loss at iteration 475 is: 0.5666


In [18]:
def nearest_neighbor(v, candidates, k=1):
    """
    Input:
      - v, the vector you are going find the nearest neighbor for
      - candidates: a set of vectors where we will find the neighbors
      - k: top k nearest neighbors to find
    Output:
      - k_idx: the indices of the top k closest vectors in sorted form
    """
    similarity_l = []
    for row in candidates:
        cos_similarity = cosine_similarity(v,row)
        similarity_l.append(cos_similarity)    
    sorted_ids = np.argsort(similarity_l) 
    k_idx = sorted_ids[-k:]
    return k_idx

In [19]:
def test_vocabulary(X, Y, R):
    '''
    Input:
        X: a matrix where the columns are the English embeddings.
        Y: a matrix where the columns correspong to the French embeddings.
        R: the transform matrix which translates word embeddings from
        English to French word vector space.
    Output:
        accuracy: for the English to French capitals
    '''
    pred = np.dot(X,R)
    num_correct = 0
    for i in range(len(pred)):
        pred_idx = nearest_neighbor(pred[i], Y, k=1)
        if pred_idx == i:
            num_correct += 1
    accuracy = num_correct/X.shape[0]
    return accuracy

In [20]:
acc = test_vocabulary(X_val, Y_val, R_train)
print(f"accuracy on test set is {acc:.3f}")

accuracy on test set is 0.555


In [None]:
acc = test_vocabulary(X_train, Y_train, R_train)
print(f"accuracy on train set is {acc:.3f}")