In [1]:
# testing 
import io
import numpy as np
import fasttext

In [2]:
def load_vec(emb_path, nmax=50000):
    """Load word vectors and vocabulary lookups from a fastText binary model.

    Parameters
    ----------
    emb_path : str
        Path handed to ``fasttext.load_model``.
    nmax : int or None, optional
        Maximum number of words to keep. Only the first ``nmax`` rows/words
        (in the order the model returns them) are retained; ``None`` keeps
        everything.

    Returns
    -------
    embeddings
        The (possibly truncated) matrix returned by ``get_output_matrix()``.
    id2word : dict
        Row index -> word.
    word2id : dict
        Word -> row index.
    """
    model = fasttext.load_model(emb_path)
    # NOTE(review): this reads the model's *output* matrix rather than the
    # input matrix -- confirm that this is the matrix the mappings were
    # trained on.
    embeddings = model.get_output_matrix()
    words = model.get_labels()

    # Honour the documented cap; previously `nmax` was accepted but ignored.
    if nmax is not None:
        embeddings = embeddings[:nmax]
        words = words[:nmax]

    # Build id2word straight from the enumeration so every row has a label.
    id2word = dict(enumerate(words))
    word2id = {w: i for i, w in id2word.items()}
    return embeddings, id2word, word2id

In [3]:
def get_nn(word, src_emb, src_id2word, tgt_emb, tgt_id2word, K=5, print_mode=True, pos='noun'):
    """Return the best target-space neighbour of `word` with a given POS tag.

    Cosine similarity is computed between the source vector of `word` and
    every row of `tgt_emb`; the K highest-scoring rows are the candidates.

    Parameters
    ----------
    word : str
        Query token; must be a value of `src_id2word` (KeyError otherwise).
    src_emb, tgt_emb : 2-D arrays
        Source / target embedding matrices, one row per token.
    src_id2word, tgt_id2word : dict
        Row index -> token for the respective matrix.
    K : int
        Number of nearest neighbours considered.
    print_mode : bool
        When true, print the K neighbours with their scores.
    pos : str
        POS tag wanted; a candidate matches when its token ends with
        ``'_' + pos``.

    Returns
    -------
    (token, score)
        The highest-scoring candidate whose POS suffix matches `pos`; if none
        of the K candidates matches, the overall nearest neighbour.
    """
    # The caller supplies index -> word; invert it to find the query row.
    # This is O(vocabulary) per call.
    word2id = {w: i for i, w in src_id2word.items()}
    word_emb = src_emb[word2id[word]]

    # Cosine similarity = dot product of the L2-normalised vectors.
    tgt_unit = tgt_emb / np.linalg.norm(tgt_emb, 2, 1)[:, None]
    scores = tgt_unit.dot(word_emb / np.linalg.norm(word_emb))
    k_best = scores.argsort()[-K:][::-1]  # best first

    if print_mode:
        print("Nearest neighbors of \"%s\":" % word)
        for idx in k_best:
            print('%.4f - %s' % (scores[idx], tgt_id2word[idx]))

    # Match on the POS suffix only: a plain substring test would let 'noun'
    # match tokens such as 'announce_verb' or 'x_pronoun'.
    pos_suffix = '_' + pos
    # Fall back to the top neighbour rather than to row 0 of the target
    # vocabulary, which is unrelated to the query.
    index = k_best[0]
    for idx in k_best:
        if tgt_id2word[idx].endswith(pos_suffix):
            index = idx
            break
    return tgt_id2word[index], scores[index]

In [4]:
import torch
def load_mapping(path):
    """Load a saved linear mapping matrix and return it as a NumPy array.

    The checkpoint may hold either a NumPy array or a torch tensor; a tensor
    is detached and moved to CPU before conversion.

    NOTE: ``torch.load`` unpickles the file, so only load checkpoints from a
    trusted source.
    """
    mapping = torch.load(path)
    if isinstance(mapping, torch.Tensor):
        mapping = mapping.detach().cpu().numpy()
    return mapping


# Learned linear mappings: EN -> ES and ES -> FR.
matrx_en_es = load_mapping('./dumped/debug/EN_ES/best_mapping.pth')
matrx_es_fr = load_mapping('./dumped/debug/ES_FR/best_mapping.pth')

In [5]:
# Upper bound on the number of word embeddings to load (shared by the ES/FR cells).
nmax = 2000000
en_path = '../corpus_en/model_en.bin'
en_embeddings, en_id2word, en_word2id = load_vec(emb_path=en_path, nmax=nmax)



In [24]:
# Spanish model: embeddings plus index<->word lookups.
es_path = '../corpus_es/model_es.bin'
es_embeddings, es_id2word, es_word2id = load_vec(emb_path=es_path, nmax=nmax)



In [6]:
# French model: embeddings plus index<->word lookups.
fr_path = '../corpus_fr/model_fr.bin'
fr_embeddings, fr_id2word, fr_word2id = load_vec(emb_path=fr_path, nmax=nmax)



## 1. Test on the source EN model: nearest neighbors of the word dog_noun

In [26]:
# Sanity check on the loaded English matrix: (number of rows, vector dimension).
en_embeddings.shape

(1221513, 300)

In [27]:
# Nearest neighbours inside the source space: the English embeddings are
# queried against themselves, so the query word itself should rank first.
src_word = 'dog_noun'
get_nn(word=src_word, src_emb=en_embeddings, src_id2word=en_id2word,
       tgt_emb=en_embeddings, tgt_id2word=en_id2word, K=5)

Nearest neighbors of "dog_noun":
1.0000 - dog_noun
0.6756 - cat_noun
0.6396 - puppy_noun
0.6345 - pet_noun
0.5783 - dog_properNoun


In [28]:
# Map the EN embeddings into the ES space with the learned EN->ES linear
# mapping: every row x of en_embeddings becomes matrx_en_es @ x.
#
# WARNING: this overwrites en_embeddings, so the cell is not idempotent --
# running it a second time applies the mapping twice. Reload the EN
# embeddings before re-running it.
en_embeddings = matrx_en_es.dot(en_embeddings.T).T
# The further ES->FR step (matrx_es_fr) is applied in a later cell.

In [29]:
# Same self-query as before, now on the mapped English embeddings: source
# and target are both the mapped EN matrix.
src_word = 'dog_noun'
get_nn(word=src_word, src_emb=en_embeddings, src_id2word=en_id2word,
       tgt_emb=en_embeddings, tgt_id2word=en_id2word, K=5)

Nearest neighbors of "dog_noun":
1.0000 - dog_noun
0.6756 - cat_noun
0.6396 - puppy_noun
0.6345 - pet_noun
0.5783 - dog_properNoun


In [30]:
# Cross-lingual query: mapped English vector against the Spanish embeddings
# (source = EN, target = ES).
src_word = 'dog_noun'
get_nn(word=src_word, src_emb=en_embeddings, src_id2word=en_id2word,
       tgt_emb=es_embeddings, tgt_id2word=es_id2word, K=5)

Nearest neighbors of "dog_noun":
0.6954 - perro_noun
0.6092 - cachorro_noun
0.5684 - gato_noun
0.5602 - perrito_noun
0.5453 - animal_noun


In [31]:
# EN -> FR by pivoting through ES: apply the ES->FR matrix on top of the
# current en_embeddings. If the EN->ES mapping has already been applied, each
# original row x ends up as matrx_es_fr @ (matrx_en_es @ x), i.e. the composite
# mapping is W_en_fr = W_es_fr @ W_en_es.
# NOTE: overwrites en_embeddings; running this cell twice applies the mapping twice.
en_embeddings = np.dot(matrx_es_fr, en_embeddings.T).T

In [68]:
# Cross-lingual query against the French embeddings, keeping only a verb.
src_word = 'pull_verb'
get_nn(word=src_word, src_emb=en_embeddings, src_id2word=en_id2word,
       tgt_emb=fr_embeddings, tgt_id2word=fr_id2word, K=5, pos='verb')

Nearest neighbors of "pull_verb":
0.5555 - lâcher_verb
0.4972 - attraper_verb
0.4876 - sauter_verb
0.4870 - balancer_verb
0.4810 - glisser_verb


('lâcher_verb', 0.5555055)

## 2. Building a dictionary from English to French using MUSE

In [33]:
!ls Dictionary_To_Infe/

to_infe_en_fr.csv  to_infe_fr_en.csv  to_infe_pt_en.csv
to_infe_en_pt.csv  to_infe_fr_pt.csv  to_infe_pt_fr.csv


In [14]:
import pandas as pd
# Entries to translate, one (source, POS) pair per row.
en_df = pd.read_csv('./Dictionary_To_Infe/to_infe_en_fr.csv')
# Join multi-word entries with '-' before the POS tag is appended.
en_df['source'] = en_df['source'].map(lambda entry: str(entry).replace(' ', '-'))
# Build the "<word>_<POS>" tokens.
en_word = (en_df['source'] + '_' + en_df['POS']).values
print(en_word.shape[0])

40130


In [12]:
# Split the dictionary tokens into those present in the English vocabulary
# (has_word) and those missing from it (out_word), preserving input order.
en_corpus_word = set(en_word2id)
has_word, out_word = [], []
for token in en_word:
    bucket = has_word if token in en_corpus_word else out_word
    bucket.append(token)

In [15]:
# Coverage of the dictionary by the English vocabulary.
print(len(has_word))
print(len(out_word))
# Total computed from the lists (not hard-coded) so it cannot go stale when
# the data changes; it should equal the number of dictionary entries.
len(has_word) + len(out_word)

24404
15726


40130

In [69]:
def get_pos(x):
    """Return the POS tag of a "<word>_<POS>" token.

    The tag is everything after the *last* underscore, so a word part that
    itself contains '_' does not shift the result. Raises IndexError when the
    token has no underscore.
    """
    return x.rsplit('_', 1)[1]
def get_word(x):
    """Return the word part of a "<word>_<POS>" token, with '-' shown as spaces.

    The word is everything before the *last* underscore, so underscores inside
    the word are preserved instead of truncating it. Every '-' becomes a
    space, which also affects words that were hyphenated to begin with.
    """
    return x.rsplit('_', 1)[0].replace('-', ' ')

In [72]:
# Spot check: the POS suffix is dropped and '-' in the word part becomes a space.
get_word('Juande-Ramos_properNoun')

'Juande Ramos'

In [None]:
# Translate every in-vocabulary English entry and write one TSV row per entry:
#   source word <TAB> translated word <TAB> POS <TAB> similarity score
# NOTE(review): assumes en_embeddings has already been mapped into the FR
# space by the mapping cells -- confirm before running.
# Mode 'a' appends, so re-running the cell adds rows instead of replacing the
# file; remove the file first for a clean export.
# The encoding is explicit because the French output contains accented
# characters and the platform default is not guaranteed to be UTF-8.
with open('muse_trans_en-fr.tsv', 'a', encoding='utf-8') as f:
    for src_word in has_word:
        pos = get_pos(src_word)
        result, score = get_nn(src_word, en_embeddings, en_id2word, fr_embeddings, fr_id2word, K=10, print_mode=False, pos=pos)
        fields = [get_word(src_word), get_word(result), pos, str(score)]
        f.write('\t'.join(fields) + '\n')

In [17]:
# Position of a given token in has_word (list.index raises ValueError if absent).
# NOTE(review): the output recorded below is a list of tokens, which .index()
# cannot produce, so source and output look out of sync. 'ennode_nou' also
# does not look like a complete "<word>_<POS>" token -- confirm the intended query.
has_word.index('ennode_nou')

['occupational_adjective',
 'scare_noun',
 'factory_noun',
 'trap_verb',
 'treasure_verb',
 'fellow_adjective',
 'methodical_adjective',
 'ocular_adjective',
 'biosphere_noun',
 'bay_noun']