In [192]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle

In [202]:
def make_embedding_matrices(datapath):
    """Read a header-less CSV of word vectors into lookup tables and a matrix.

    In every CSV row the first column is taken as the token, and the columns
    after it, up to but excluding the last one, are taken as the vector.
    NOTE(review): the last column is always discarded; presumably it is a
    trailing-delimiter artefact of the file format. Confirm against the data.

    Parameters
    ----------
    datapath : str or path-like
        Location of the CSV file; it is read with ``header=None``.

    Returns
    -------
    token_to_id : dict
        Maps each token to the row label pandas assigned to its CSV row.
    id_to_token : dict
        Maps the position of a vector in ``embeddings`` to its token.
    embeddings : numpy.ndarray
        Float matrix with one row per CSV row.
    """
    frame = pd.read_csv(datapath, header=None)

    token_to_id = {}
    id_to_token = {}
    vectors = []

    for row_label, record in tqdm(frame.iterrows(), position=0):
        word = record[0]

        token_to_id[word] = row_label
        # len(vectors) is the slot the vector appended below will occupy.
        id_to_token[len(vectors)] = word
        vectors.append(np.array(record[1:-1]))

    return token_to_id, id_to_token, np.array(vectors, dtype=float)

In [203]:
# Load the Russian table: token -> id map, id -> token map and the vector matrix.
# NOTE(review): absolute, machine-specific path; the file must exist at this location.
tokens_ru, id_to_token_ru, embeddings_ru = make_embedding_matrices('/data/LowResourceTmp/ru.csv')

20007it [00:03, 5544.91it/s]


In [204]:
# Load the English table: token -> id map, id -> token map and the vector matrix.
# NOTE(review): absolute, machine-specific path; the file must exist at this location.
tokens_en, id_to_token_en, embeddings_en = make_embedding_matrices('/data/LowResourceTmp/en.csv')

20009it [00:03, 5691.06it/s]


In [205]:
# Read the pickled mapping whose keys are looked up in the English vocabulary
# and whose values are looked up in the Russian vocabulary further down.
# NOTE(review): unpickling can execute arbitrary code; only load a trusted file.
with open('en-ru-dict', 'rb') as dict_file:
    en_ru_dict = pickle.load(dict_file)

In [207]:
def normaize(arr):
    """Scale every row of a 2-D array to unit Euclidean (L2) length.

    The misspelled name is kept so existing callers keep working.

    Parameters
    ----------
    arr : array-like, shape (n_rows, n_cols)
        Matrix whose rows are normalised independently.

    Returns
    -------
    numpy.ndarray
        Array of the same shape with each row divided by its L2 norm.
        All-zero rows are returned as zeros rather than NaN.
    """
    arr = np.asarray(arr)
    norms = np.linalg.norm(arr, axis=1, keepdims=True)
    # A zero row has norm 0; dividing by 1 instead keeps it at zero and avoids
    # the 0/0 -> NaN that would otherwise poison later matrix products / SVD.
    norms[norms == 0] = 1.0
    return arr / norms

In [208]:
# Collect, for every dictionary entry whose key is in the English vocabulary
# and whose value is in the Russian vocabulary, the pair of ids; entries with
# a missing token on either side are skipped.
matched_pairs = [
    (tokens_en[en_token], tokens_ru[ru_token])
    for en_token, ru_token in en_ru_dict.items()
    if en_token in tokens_en and ru_token in tokens_ru
]

# Position i of both arrays refers to the same dictionary entry.
indices_en = np.array([en_idx for en_idx, _ in matched_pairs])
indices_ru = np.array([ru_idx for _, ru_idx in matched_pairs])

In [210]:
# Rescale every embedding row to unit L2 norm.
# Note: this rebinds the names, so the un-normalised matrices are no longer
# reachable afterwards; re-run the loading cells to get them back.
embeddings_en = normaize(embeddings_en)
embeddings_ru = normaize(embeddings_ru)

In [211]:
# Gather the vectors of the dictionary pairs: row i of x is the English vector
# and row i of z the Russian vector of the same dictionary entry.
x = embeddings_en[indices_en]
z = embeddings_ru[indices_ru]

In [212]:
# Orthogonal map from the English to the Russian space.
# With z^T x = U S V^T, the matrix w = V U^T is orthogonal and is the
# orthogonal-Procrustes minimiser of ||x w - z||_F over orthogonal w.
# The singular values s are not used.
u, s, vt = np.linalg.svd(z.T.dot(x))
w = vt.T.dot(u.T)

In [213]:
# Apply the learned map w to every (normalised) English embedding row.
embeddings_en_new = embeddings_en.dot(w)

In [214]:
def get_closest_words(token, emb_en=embeddings_en_new, emb_ru=embeddings_ru, token2id=tokens_en, id2token=id_to_token_ru, k=5):
    """Return the tokens whose ``emb_ru`` rows lie nearest to the vector of ``token``.

    Note that the default arguments are evaluated once, when this ``def`` is
    executed; re-run this cell after recomputing any of the default objects.

    Parameters
    ----------
    token : hashable
        Key looked up in ``token2id`` to select a row of ``emb_en``.
    emb_en : numpy.ndarray
        Matrix the query vector is taken from.
    emb_ru : numpy.ndarray
        Matrix whose rows are ranked by Euclidean distance to the query.
    token2id : dict
        Maps ``token`` to its row in ``emb_en``.
    id2token : dict
        Maps a row position of ``emb_ru`` to the token returned for it.
    k : int, optional
        Number of neighbours to return (default 5, the previous fixed value).

    Returns
    -------
    list
        Up to ``k`` tokens, ordered by increasing distance.

    Raises
    ------
    KeyError
        If ``token`` is not present in ``token2id``.
    """
    vec = emb_en[token2id[token]]
    # One vectorised pass instead of a Python-level norm call per row.
    distances = np.linalg.norm(emb_ru - vec, axis=1)
    # A stable sort keeps the lower row index first on ties, as sorted() did.
    nearest = np.argsort(distances, kind='stable')[:k]

    return [id2token[ix] for ix in nearest]

In [215]:
# Spot check: nearest Russian tokens to the mapped vector of 'hiker'.
get_closest_words('hiker')

['турист', 'велосипедист', 'альпинист', 'путник', 'паломник']

In [216]:
# Spot check: nearest Russian tokens to the mapped vector of 'be'.
# (The function defined in this notebook is get_closest_words, with an "s".)
get_closest_words('be')

['быт', 'произойт', 'возникнут', 'выглядет', 'имет']

In [217]:
# Spot check: nearest Russian tokens to the mapped vector of 'cinema'.
# (The function defined in this notebook is get_closest_words, with an "s".)
get_closest_words('cinema')

['кинокартин', 'кинолент', 'перформанс', 'кинематограф', 'рок-музык']

In [229]:
# Start from an empty token -> vector mapping; re-running this cell discards
# anything stored in it so far.
shared_embeddings = {}

In [231]:
# English tokens go in first, with their mapped vectors.
shared_embeddings.update(
    (token, embeddings_en_new[ix]) for token, ix in tokens_en.items()
)

# Russian tokens go in second, so a token that occurs in both vocabularies
# ends up with its Russian vector.
shared_embeddings.update(
    (token, embeddings_ru[ix]) for token, ix in tokens_ru.items()
)

In [232]:
# Persist the merged token -> vector mapping as a pickle file.
with open('shared_embeddings', 'wb') as out_file:
    pickle.dump(shared_embeddings, out_file)