# Naive Machine Translation and Locality Sensitive Hashing (LSH)

In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from utils_arnab import get_en_to_fr_dict, get_word_embedding_matrices, distance_cosine_score, get_indices_to_word_dict

### Load the word embeddings data for English and French words

In [2]:
# Load the English and French word-embedding subsets (word -> embedding vector).
# NOTE: unpickling can execute arbitrary code; only load these files from a trusted source.
# The context managers close each file handle as soon as its pickle has been read.
with open('en_embeddings.p', 'rb') as en_file:
    en_embeddings = pickle.load(en_file)
with open('fr_embeddings.p', 'rb') as fr_file:
    fr_embeddings = pickle.load(fr_file)

In [3]:
# Preview the first 5 components of one English ('the') and one French ('la') embedding
print('the: ', en_embeddings['the'][:5])
print('la: ', fr_embeddings['la'][:5])

the:  [ 0.08007812  0.10498047  0.04980469  0.0534668  -0.06738281]
la:  [-0.0061825  -0.00094387 -0.00882648  0.0324623  -0.0218281 ]


In [4]:
# Report the length of one English and one French embedding vector;
# the translation matrix learned later maps between these two spaces
print('English word embedding dimension:', len(en_embeddings['the']))
print('French word embedding dimension:', len(fr_embeddings['la']))

English word embedding dimension: 300
French word embedding dimension: 300


### Build the English to French word mapping dictionary

In [5]:
# Load the English -> French word-pair dictionaries for training and for testing,
# and report how many entries each one holds
en_fr_train = get_en_to_fr_dict('en-fr.train.txt')
print('The length of the English to French training dictionary is', len(en_fr_train))
en_fr_test = get_en_to_fr_dict('en-fr.test.txt')
print('The length of the English to French test dictionary is', len(en_fr_test))

The length of the English to French training dictionary is 5000
The length of the English to French test dictionary is 1500


In [6]:
# Merge the train and test dictionaries into one master English -> French mapping.
# Train entries come first; if an English word occurs in both, the test translation wins.
keys = [*en_fr_train, *en_fr_test]
values = [*en_fr_train.values(), *en_fr_test.values()]
master_en_fr_dict = dict(zip(keys, values))

In [7]:
# Inspect both ends of the master English -> French dictionary
master_items = list(master_en_fr_dict.items())
print('First 5 items: ', master_items[:5])
print('Last 5 items: ', master_items[-5:])

First 5 items:  [('the', 'la'), ('and', 'et'), ('was', 'était'), ('for', 'pour'), ('that', 'cela')]
Last 5 items:  [('madonna', 'madonna'), ('worcester', 'worcester'), ('cooperative', 'coopératif'), ('substantially', 'sensiblement'), ('winston', 'winston')]


In [8]:
# Show the first 5 (English, French) pairs of the training dictionary
list(en_fr_train.items())[:5]

[('the', 'la'),
 ('and', 'et'),
 ('was', 'était'),
 ('for', 'pour'),
 ('that', 'cela')]

### Generate embedding matrices

<div style="font-size:100%; text-align:center;">
<img src='X_to_Y.jpg' alt="alternate text" style="width:800px;height:200px;" /><center>Figure 1</center></div>

In [9]:
# Build the paired embedding matrices for the training dictionary:
# X_train holds the English embeddings and Y_train the French embeddings
X_train, Y_train = get_word_embedding_matrices(
    en_fr_train, en_embeddings, fr_embeddings)

In [10]:
# Preview the first 5 components of the second row of X_train
X_train[1][:5]

array([ 0.02600098, -0.00189209,  0.18554688, -0.05175781,  0.00512695])

### Finding the Translation Matrix to translate English words to French words

<div style="font-size:100%; text-align:center;"><img src='e_to_f.jpg' alt="alternate text" style="width:700px;height:200px;" /> <center>Figure 2</center> </div>

Given the English and French word embedding matrices $\mathbf{X}$ and $\mathbf{Y}$,

we need to find a matrix $\mathbf{R}$ that minimizes the following expression:

$$\arg \min _{\mathbf{R}}\| \mathbf{X R} - \mathbf{Y}\|_{F}\tag{1} $$

Instead of the norm itself, we will minimize the mean squared Frobenius norm below, which has the same minimizer but is easier to differentiate:
$$ \frac{1}{m} \|  \mathbf{X R} - \mathbf{Y} \|_{F}^{2}$$

where $m$ is the number of rows (samples) of $\mathbf{X}$.

In [11]:
# objective minimized by gradient descent
def compute_loss(X, Y, R):
    """
    Evaluate the objective (1/m) * ||XR - Y||_F^2, where m is the number of rows of X.

    Params:
    ----------
    X: numpy array
        english embedding matrix (one row per sample)
    Y: numpy array
        french embedding matrix, same shape as the product XR
    R: numpy array
        transformation matrix applied to X

    Returns:
    ----------
    loss: float
        squared frobenius norm of the residual [ XR - Y ], scaled by 1/m
    """
    num_rows = X.shape[0]
    residual = np.dot(X, R) - Y

    # np.linalg.norm defaults to the frobenius norm for a 2-D input
    squared_norm = np.linalg.norm(residual) ** 2
    return 1 / num_rows * squared_norm

### Computing the gradient of loss with respect to transform matrix R

* The formula for the gradient of the loss function $L(X,Y,R)$ is:

$$\frac{d}{dR}L(X,Y,R)=\frac{d}{dR}\Big(\frac{1}{m}\| X R -Y\|_{F}^{2}\Big) = \frac{2}{m}X^{T} (X R - Y)$$

In [12]:
def compute_gradient(X, Y, R):
    """
    Gradient, with respect to R, of the loss (1/m) * ||XR - Y||_F^2,
    where m is the number of rows of X.

    Params:
    ----------
    X: numpy array
        english embedding matrix (one row per sample)
    Y: numpy array
        french embedding matrix, same shape as the product XR
    R: numpy array
        transformation matrix at which the gradient is evaluated

    Returns:
    ----------
    grad: numpy array
        (2/m) * X^T (XR - Y), which has the same shape as R
    """
    num_rows = X.shape[0]
    residual = np.dot(X, R) - Y

    return 2 / num_rows * np.dot(X.T, residual)

### Finding the optimal R with gradient descent algorithm

* Update $R$ with the formula:
$$R_{\text{new}}= R_{\text{old}}-\alpha g$$

where, $g$ is the gradient of the loss with respect to the matrix $R$ and $\alpha$ is the learning rate, which is a scalar.

In [13]:
def gradient_descent(X, Y, train_steps=400, learning_rate=0.0003, seed=200):
    """
    Finds the transform matrix R that minimizes the loss (1/m) * ||XR - Y||_F^2
    using plain (full-batch) gradient descent.

    Params:
    ----------
        X: a numpy matrix of dimension (m,n) where each row is an English embedding.
        Y: a numpy matrix of dimension (m,n) where each row is the corresponding French embedding.
        train_steps: the number of gradient descent updates to perform
        learning_rate: the step size used in the gradient descent update of R
        seed: seed for the random initialization of R (the default reproduces the original runs)

    Returns:
    ----------
        R: the transform matrix of dimension (n,n) after `train_steps` updates.
    """
    # A private generator yields the same initial R as np.random.seed(seed) followed by
    # np.random.rand, but leaves NumPy's global random state untouched for the caller.
    rng = np.random.RandomState(seed)

    dim = X.shape[1]  # embedding dimension n

    R = rng.rand(dim, dim)  # random initial transform matrix, entries in [0, 1)

    for step in range(train_steps):
        # report progress every 50 steps
        if step % 50 == 0:
            print(f"Loss at iteration {step} is: {compute_loss(X, Y, R):.4f}")
        grad = compute_gradient(X, Y, R)
        # step against the gradient
        R = R - learning_rate * grad
    return R
   

In [14]:
# Fit the transform matrix on the training pairs; both the step count (1000) and the
# learning rate (0.85) override the function defaults (400 and 0.0003)
R_train = gradient_descent(X_train, Y_train, train_steps=1000, learning_rate=0.85)

Loss at iteration 0 is: 966.4205
Loss at iteration 50 is: 23.3787
Loss at iteration 100 is: 3.7070
Loss at iteration 150 is: 1.2678
Loss at iteration 200 is: 0.7658
Loss at iteration 250 is: 0.6313
Loss at iteration 300 is: 0.5892
Loss at iteration 350 is: 0.5744
Loss at iteration 400 is: 0.5688
Loss at iteration 450 is: 0.5665
Loss at iteration 500 is: 0.5655
Loss at iteration 550 is: 0.5651
Loss at iteration 600 is: 0.5649
Loss at iteration 650 is: 0.5648
Loss at iteration 700 is: 0.5647
Loss at iteration 750 is: 0.5647
Loss at iteration 800 is: 0.5647
Loss at iteration 850 is: 0.5647
Loss at iteration 900 is: 0.5647
Loss at iteration 950 is: 0.5647


### Testing the Translation

To turn a prediction into a word, we use a k-nearest-neighbours (KNN) search: the approximated embedding vector $\hat{Y} = XR$ is mapped to the closest actual word vector in the French embedding space $Y$.

We will use the cosine distance below as the distance measure for the KNN:
$$d_{\text{cos}}(u,v)=1-\cos(u,v)$$

In [15]:
def nearest_neighbor(v, candidates, k=1, metric=distance_cosine_score):
    """
    Find the k candidates closest to v, e.g. the actual French word vectors closest
    to an approximated French vector v = x R.

    Params:
    ----------
    v: numpy array
        the query row vector (approximated french word vector).
    candidates: numpy array
        candidate vectors, one per row, among which the nearest neighbors of v are searched.
    k: int
        number of nearest neighbors to return.
    metric: function
        callable metric(v, candidate) returning a distance (smaller means closer);
        the default is distance_cosine_score, presumably the cosine distance 1 - cos(u, v)

    Returns:
    ----------
    knn_idx: numpy array
        row indices into candidates of the k nearest neighbors, closest first.
    """
    # score every candidate in order, so list positions double as candidate indices
    distance_scores = [metric(v, candidate) for candidate in candidates]

    # ascending argsort puts the smallest distances first; keep the first k
    return np.argsort(distance_scores)[:k]



In [16]:
# Sanity-check nearest_neighbor on a tiny hand-made example:
# show the 3 candidate rows closest to v under the default metric
v = np.array([1, 0, 1])
candidates = np.array([[1, 0, 5], [-2, 5, 3], [2, 0, 1], [6, -9, 5], [9, 9, 9]])
candidates[nearest_neighbor(v, candidates, 3)]

array([[2, 0, 1],
       [1, 0, 5],
       [9, 9, 9]])

### Calculate Accuracy
$$\text{accuracy}=\frac{\#(\text{correct predictions})}{\#(\text{total predictions})}$$

In [17]:
def test_vocabulary(X, Y, R):
    """
    calculates the accuracy of translations from X to Y by R

    Params:
    ----------
    X: numpy array
        english embedding matrix
    Y: numpy array
        french embedding matrix
    R: numpy array
        transformation matrix

    Returns:
    ----------
    acc: float
        accuracy is calculated by checking if the indices match b/w XR and Y. 
        The result is stored in an array and averaged to produce the accuracy score
    """
    acc = 0
    matched_idx = []
    for idx, vec in enumerate(X):
        y_pred = np.dot(vec, R)
        if idx == nearest_neighbor(y_pred, Y, k=1).item():
            matched_idx.append(idx)
            acc += 1
    acc = acc / X.shape[0]

    return acc, matched_idx

### Check Accuracy on 1000 Samples of Train Data

In [18]:
# Evaluate the learned transform on the first 1000 training pairs only
acc_train, matched_idx_train = test_vocabulary(X_train[:1000,:], Y_train[:1000,:], R_train)
print(f'Accuracy on Train Data: {acc_train:.2f}')

Accuracy on Train Data: 0.59


In [19]:
# First 5 row indices of the training subset that were translated correctly
matched_idx_train[:5]

[1, 2, 8, 10, 14]

### Check Accuracy on Test Data

In [20]:
# Build the paired embedding matrices for the test dictionary
X_val, Y_val = get_word_embedding_matrices(en_fr_test, en_embeddings, fr_embeddings)

In [21]:
# Evaluate the transform learned on the training pairs against all test pairs
acc_test, matched_idx_test = test_vocabulary(X_val, Y_val, R_train)
print(f'Accuracy on Test Data: {acc_test:.2f}')

Accuracy on Test Data: 0.56


In [22]:
# First 5 row indices of the test set that were translated correctly
matched_idx_test[:5]

[0, 2, 6, 8, 9]

### Check the translation predictions made by the model

In [23]:
# Create index -> word lookups (en_iw for English, fr_iw for French) from the master
# English -> French dictionary
en_iw, fr_iw = get_indices_to_word_dict(master_en_fr_dict, en_embeddings, fr_embeddings)
# Alternative kept for reference: build the lookups from the test dictionary only
# en_iw, fr_iw = get_indices_to_word_dict(en_fr_test, en_embeddings, fr_embeddings)

In [24]:
# First 7 entries of the English index -> word lookup
list(en_iw.items())[:7]

[(0, 'the'),
 (1, 'was'),
 (2, 'for'),
 (3, 'that'),
 (4, 'with'),
 (5, 'from'),
 (6, 'this')]

In [25]:
# First 7 entries of the French index -> word lookup
list(fr_iw.items())[:7]

[(0, 'la'),
 (1, 'était'),
 (2, 'pour'),
 (3, 'cela'),
 (4, 'avec'),
 (5, 'depuis'),
 (6, 'ce')]

In [62]:
def translate_word(en_word):
    """
    Translates the given English word to French and prints the result
    as "<english word>  -->  <french word>".

    Relies on the notebook-level objects en_embeddings, R_train, Y_train, Y_val and fr_iw.

    Params:
    ----------
    en_word: str
        English word to translate; it must be a key of en_embeddings.

    Returns:
    ----------
    None
    """
    # project the English embedding into the French embedding space
    projected = np.dot(en_embeddings[en_word], R_train)

    # candidate pool: French embeddings of the train set followed by those of the test set
    # NOTE(review): assumes row i of this stacked matrix corresponds to fr_iw[i] -- confirm
    # against get_indices_to_word_dict
    fr_candidates = np.vstack((Y_train, Y_val))

    closest = nearest_neighbor(projected, fr_candidates, k=1)
    for fr_idx in closest:
        print(en_word, " --> ", fr_iw[fr_idx])


In [69]:
# Try the translator on a handful of sample English words
for sample_word in ('circular', 'decorated', 'beautiful', 'hood', 'bicycle'):
    translate_word(sample_word)



circular  -->  circulaires
decorated  -->  décoré
beautiful  -->  beauté
hood  -->  voiture
bicycle  -->  vélo


It can be observed that the translations are sometimes erroneous (e.g. 'beautiful' → 'beauté', 'hood' → 'voiture'), as expected given the accuracy of the model. Using a larger vocabulary rather than a subset, together with more training data, would improve the accuracy of the translations.