In [34]:
from gensim.models import Word2Vec
import os
import numpy as np
# Load the previously saved, lemmatized Word2Vec models (one per language)
# from the local `data/` directory.
english_model = Word2Vec.load(os.path.join('data', 'english_model_lemmatized'))
german_model = Word2Vec.load(os.path.join('data', 'german_model_lemmatized'))

In [35]:
from copy import deepcopy
def w2v_to_numpy(model):
    """ Extract the embedding matrix and the vocabulary lookups from a model.

    Parameters:
    ===========
    model (gensim.Word2Vec): a trained gensim model; only its `wv`
                             attribute is used

    Returns:
    ========
    embeddings (numpy.ndarray): independent copy of the array returned by
                                `model.wv.get_normed_vectors()`; row i
                                belongs to the i-th entry of
                                `model.wv.index_to_key`
    idx, iidx (tuple): idx is a dictionary mapping word to row number
                        iidx is a dictionary mapping row number to word
    """
    keyed_vectors = model.wv
    # Make sure the norms are available before asking for the normed vectors.
    keyed_vectors.fill_norms()
    embeddings = deepcopy(keyed_vectors.get_normed_vectors())

    # Build the row -> word lookup first, then invert it for word -> row.
    iidx = dict(enumerate(keyed_vectors.index_to_key))
    idx = {word: row for row, word in iidx.items()}
    return embeddings, (idx, iidx)

In [36]:
# Convert both models to plain arrays. Each `*_item` is the (idx, iidx) pair
# returned by w2v_to_numpy: `*_item[0]` maps word -> row, `*_item[1]` maps
# row -> word.
en_emb, en_item = w2v_to_numpy(english_model)
de_emb, de_item = w2v_to_numpy(german_model)

In [37]:
# Peek at the English row -> word mapping. The full dictionary holds one entry
# per vocabulary word, so show only the first few rows instead of dumping it.
print({row: en_item[1][row] for row in range(min(10, len(en_item[1])))})



In [38]:
def procrustes(A, B):
    """
    Orthogonal Procrustes solver: return the orthogonal matrix R for which
    A @ R is the best approximation of B (least-squares / Frobenius sense).

    Parameters:
    A (numpy.ndarray): The source matrix (rows are paired with rows of B).
    B (numpy.ndarray): The target matrix to map A onto.

    Returns:
    numpy.ndarray: The orthogonal matrix R, to be applied as `A @ R`.
    """
    # The SVD of the cross-covariance A^T B yields the optimal rotation as
    # U @ V^T; the singular values themselves are not needed.
    left_vectors, _singular_values, right_vectors_t = np.linalg.svd(A.T @ B)
    return left_vectors @ right_vectors_t

In [39]:
# Locations of the train / test anchor-pair files.
train_path = os.path.join('train_test_data', 'train_set.txt')
test_path = os.path.join('train_test_data', 'test_set.txt')

# Read each file in one go and keep one string per line (no line endings).
with open(train_path) as train_file:
    train_set = train_file.read().splitlines()
with open(test_path) as test_file:
    test_set = test_file.read().splitlines()

In [40]:
# Number of lines (anchor pairs) in the training split.
len(train_set)

875

In [41]:
def create_words_and_indices(data):
    """
    Split anchor-pair lines into four parallel lists.

    Each line is whitespace-separated and its first four fields are read as
    `<english word> <german word> <english row> <german row>`; any further
    fields are ignored.

    Parameters:
    data (iterable of str): The lines to parse.

    Returns:
    tuple: (english_words, german_words, english_indices, german_indices),
           where the word lists hold strings and the index lists hold ints.

    Raises:
    IndexError: If a line has fewer than four fields.
    ValueError: If the third or fourth field is not an integer.
    """
    # Parse every line once into (en_word, de_word, en_row, de_row).
    records = [
        (fields[0], fields[1], int(fields[2]), int(fields[3]))
        for fields in (line.split() for line in data)
    ]

    # Project the parsed records into one list per column.
    english_words = [record[0] for record in records]
    german_words = [record[1] for record in records]
    english_indices = [record[2] for record in records]
    german_indices = [record[3] for record in records]
    return english_words, german_words, english_indices, german_indices

In [42]:
# Parse the training lines into parallel lists of words and embedding rows.
en_words, de_words, en_indices, de_indices = create_words_and_indices(train_set)

In [43]:
def create_anchor_matrix(matrix, indices):
    """
    Select the anchor-word rows of an embedding matrix.

    Parameters:
    matrix: The embedding matrix; it is indexed as `matrix[indices]`.
    indices: The row numbers of the anchor words, in the desired order.

    Returns:
    The result of `matrix[indices]`, i.e. one row per entry of `indices`.
    """
    anchor_rows = matrix[indices]
    return anchor_rows

In [44]:
# Row k of en_anchor and row k of de_anchor are the embeddings of the k-th
# training pair, so the two matrices are aligned row by row.
en_anchor = create_anchor_matrix(en_emb, en_indices)
de_anchor = create_anchor_matrix(de_emb, de_indices)

In [45]:
# Both anchor matrices should have one row per training pair and the same
# number of columns.
print(en_anchor.shape)
print(de_anchor.shape)

(875, 50)
(875, 50)


In [46]:
# procrustes(A, B) returns the orthogonal R that best maps A onto B, so passing
# the German anchors first yields the German -> English map (it is applied
# below as `de_emb @ p_matrix`).
p_matrix = procrustes(de_anchor, en_anchor)

In [47]:
# Apply the fitted map to every German embedding (not only the anchors) so the
# rows can be compared directly against the rows of en_emb.
de_english_aligned = de_emb @ p_matrix

In [48]:
# Sanity check: de_emb holds normed vectors and p_matrix is orthogonal, so
# every aligned row should still have an L2 norm of (approximately) 1.
# The norms are computed once and reused for all three checks.
row_norms = np.linalg.norm(de_english_aligned, axis=1)
print(row_norms)
print(row_norms.shape)

# Report each row whose norm falls outside the [0.999, 1.001] tolerance band.
for item in row_norms[(row_norms < 0.999) | (row_norms > 1.001)]:
    print(item)
    print('error')

[1.         1.         0.99999994 ... 0.99999994 0.99999994 0.9999999 ]
(60960,)


In [49]:
def find_closest_words(aligned_vector, target_matrix, iidx, n=5):
    """
    Return the n target words whose embeddings score highest against a vector.

    Parameters:
    aligned_vector (numpy.ndarray): The query vector, already mapped into the
                                    target embedding space.
    target_matrix (numpy.ndarray): The target embeddings, one row per word.
    iidx (dict): Mapping of target row number to word.
    n (int): How many words to return.

    Returns:
    list: The words for the n highest dot products, best match first.
    """
    # One dot product per target row; higher means more similar.
    scores = aligned_vector @ target_matrix.T
    # argsort is ascending, so reverse it before taking the first n rows.
    ranked_rows = np.argsort(scores)[::-1]
    best_rows = ranked_rows[:n]
    return [iidx[row] for row in best_rows]

# Spot check: English nearest neighbours of the aligned German vector in
# row 100 (en_item[1] is the English row -> word mapping).
find_closest_words(de_english_aligned[100], en_emb, en_item[1])
    

['security', 'safety', 'protection', 'justice', 'efficiency']

In [53]:
# Top-1 retrieval accuracy: for every anchor pair, map the German word into
# the English space and check whether its single nearest English neighbour
# is the gold English word.
# NOTE(review): the pairs scored here come from train_set, i.e. the same
# pairs used to fit p_matrix; test_set is loaded but not used in the cells
# visible here -- confirm this is intended.
top_1_count = 0
total_count = 0
for english_index, german_index in zip(en_indices, de_indices):
    aligned_vector = de_english_aligned[german_index]
    closest_word = find_closest_words(aligned_vector, en_emb, en_item[1], 1)[0]
    gold_word = en_item[1][english_index]
    if closest_word != gold_word:
        print(f'Incorrectly aligned {gold_word} to {closest_word}')
    else:
        print(f'Correctly aligned {gold_word} to {closest_word}')
        top_1_count += 1
    total_count += 1
print(f'Top 1 accuracy: {top_1_count/total_count}')
print(f'Total count: {total_count}')

Incorrectly aligned 2002 to 2003
Correctly aligned government to government
Correctly aligned exclude to exclude
Incorrectly aligned vehicle to lorry
Correctly aligned project to project
Correctly aligned difficulty to difficulty
Correctly aligned politician to politician
Correctly aligned racism to racism
Incorrectly aligned west to western
Correctly aligned unnecessary to unnecessary
Correctly aligned 80 to 80
Correctly aligned investigation to investigation
Incorrectly aligned historical to historic
Correctly aligned five to five
Correctly aligned continent to continent
Incorrectly aligned train to imminent
Incorrectly aligned 27 to 25
Correctly aligned legislation to legislation
Correctly aligned union to union
Correctly aligned require to require
Correctly aligned early to early
Correctly aligned restrict to restrict
Correctly aligned direct to direct
Correctly aligned participation to participation
Correctly aligned immigration to immigration
Correctly aligned advocate to advocat

In [55]:
# Top-5 retrieval accuracy: a pair counts as correct when the gold English
# word appears anywhere among the five nearest English neighbours of the
# aligned German vector.
# NOTE(review): the pairs scored here come from train_set, i.e. the same
# pairs used to fit p_matrix; test_set is loaded but not used in the cells
# visible here -- confirm this is intended.
top_5_count = 0
total_count = 0
for english_index, german_index in zip(en_indices, de_indices):
    aligned_vector = de_english_aligned[german_index]
    closest_words = find_closest_words(aligned_vector, en_emb, en_item[1], 5)
    gold_word = en_item[1][english_index]
    if gold_word not in closest_words:
        print(f'Incorrectly aligned {gold_word} to {closest_words}')
    else:
        print(f'Correctly aligned {gold_word} to {closest_words}')
        top_5_count += 1
    total_count += 1
print(f'Top 5 accuracy: {top_5_count/total_count}')

Correctly aligned 2002 to ['2003', '2002', '2001', '2005', '2006']
Correctly aligned government to ['government', 'authority', 'cypriots', 'counterpart', 'army']
Correctly aligned exclude to ['exclude', 'restrict', 'prohibit', 'remove', 'preclude']
Incorrectly aligned vehicle to ['lorry', 'motorcycle', 'tyre', 'appliance', 'truck']
Correctly aligned project to ['project', 'programme', 'fund', 'initiative', 'eit']
Correctly aligned difficulty to ['difficulty', 'problem', 'obstacle', 'difference', 'shortcoming']
Correctly aligned politician to ['politician', 'politic', 'elite', 'leader', 'businessman']
Correctly aligned racism to ['racism', 'xenophobia', 'homophobia', 'radicalisation', 'racist']
Incorrectly aligned west to ['western', 'balkan', 'maghreb', 'africa', 'neighbouring']
Correctly aligned unnecessary to ['unnecessary', 'unnecessarily', 'excessive', 'bureaucratic', 'pointless']
Correctly aligned 80 to ['80', '40', '70', '50', '60']
Correctly aligned investigation to ['investigat

In [52]:
# Load the translated word list, one entry per line.
path = os.path.join('words', 'german_top_1000_translated.txt')
with open(path) as f:
    german_top_1000_translated = f.read().splitlines()

# Print every repeated occurrence of an entry: an entry that appears k times
# in the list is printed k - 1 times.
words = set()
for item in german_top_1000_translated:
    if item not in words:
        words.add(item)
    else:
        print(item)


support
already
area
opinion
committee
president
necessary
however
first
therefore
future
commissioner
opinion
company
regulation
even
work
financial support
clearly
finally
represent
need
first
regarding
full
first
different
actually
currently
regulation
difficult
person
rapporteur
place
only
agreement
condition
today
position
special
matter
ask
time
help
close
notice
effort
application
take place
apply
because of
many
united
certainly
currently
express
opinion
agreement
just
principle
measure
in total
receive
knowledge
task
even
approach
agree
even
finally
step
consider
together
danger
service
recognize
demand
always
decision
use
change
wish
complete
responsibility
connection
location
strengthen
execution
third
past
meeting
need
use
absolutely
absolutely
Presidency
hope
need
agreement
decision
view
long
finally
experience
fact
participate
significant
determination
responsible
at the same time
opposite
relationship
grant
provide
result
regard
attempt
allow
high
force
