# Install Libraries

* TensorFlow 2.0 for GPU
* Transformers
* Annoy
* Sentence Transformers

In [0]:
!pip uninstall tensorflow



In [0]:
!pip install tensorflow-gpu==2.0.0
!pip install transformers
!pip install -U sentence-transformers

Requirement already up-to-date: sentence-transformers in /usr/local/lib/python3.6/dist-packages (0.2.6.1)


In [0]:
!pip install annoy



# Import Libraries

In [0]:
from tqdm import tqdm_notebook
import networkx as nx       
from annoy import AnnoyIndex

In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

from collections import defaultdict
import tqdm
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences
from sentence_transformers import models, SentenceTransformer

# Utility Functions

## Build Graph of Synonims

In [0]:
def build_graph(
    source_sentences : list,
    target_sentences : list
):
  G = nx.Graph()
  for current_index, one_source_sentence in tqdm_notebook(enumerate(source_sentences)):
    G.add_node(one_source_sentence)
    G.add_node(target_sentences[current_index])
    G.add_edge(one_source_sentence, target_sentences[current_index])
  print("We have {} pairs of sentences. Graph has {} components".format(
      len(source_sentences),
      nx.number_connected_components(G)
  ))
  return G

## Filter Sentence Length

In [0]:
MAX_LIMIT_OF_SEQ = 50

def filter_sent_length(source_sentences, target_sentences):
  """Remove sentences that have more than 50 tokens"""
  source_sentences_list = []
  target_sentences_list = []
  for source_sentence, target_sentence in zip(source_sentences, target_sentences):
    if len(source_sentence.split()) > MAX_LIMIT_OF_SEQ or len(target_sentence.split()) > MAX_LIMIT_OF_SEQ:
      continue
    else:
      source_sentences_list.append(source_sentence)
      target_sentences_list.append(target_sentence)
  return source_sentences_list, target_sentences_list

## Models with Different Pooling Strategies

In [0]:
# Use BERT for mapping tokens to embeddings
word_embedding_model = models.Transformer('bert-base-multilingual-cased')

# Apply mean/max/cls pooling to get one fixed sized sentence vector
mean_pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                 pooling_mode_mean_tokens=True,
                                 pooling_mode_cls_token=False,
                                 pooling_mode_max_tokens=False)
cls_pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                 pooling_mode_mean_tokens=False,
                                 pooling_mode_cls_token=True,
                                 pooling_mode_max_tokens=False)
max_pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                 pooling_mode_mean_tokens=False,
                                 pooling_mode_cls_token=False,
                                 pooling_mode_max_tokens=True)

mean_model = SentenceTransformer(modules=[word_embedding_model, mean_pooling_model])
cls_model = SentenceTransformer(modules=[word_embedding_model, cls_pooling_model])
max_model = SentenceTransformer(modules=[word_embedding_model, max_pooling_model])

## Get Representations

In [0]:
def get_representations(list_of_sentences : list,
                        representation_type = 'cls'):

  if representation_type == 'mean':
    res_embeddings = mean_model.encode(list_of_sentences, 
                      batch_size = 128, 
                      show_progress_bar = True,
                      convert_to_numpy = True)
  elif representation_type == 'max':
    res_embeddings = max_model.encode(list_of_sentences, 
                      batch_size = 128, 
                      show_progress_bar = True,
                      convert_to_numpy = True)
  else:
    res_embeddings = cls_model.encode(list_of_sentences, 
        batch_size = 128, 
        show_progress_bar = True,
        convert_to_numpy = True)

  return res_embeddings

## Get Difference Vector

In [0]:
def get_diff_vector(embeddings_source, embeddings_target):
  """
  Obtains a list of vectors that represent differences between an embedding in 
  a source-language and a target-language embedding. 
  """
  difference_vector = (embeddings_target - embeddings_source)
  print('Shape of difference (TARGET-SOURCE) vector: {}'.format(difference_vector.shape))
  source_to_target_vector = np.mean(difference_vector, axis=0)
  print('Shift source -> target vector: {}'.format(source_to_target_vector.shape))
  return source_to_target_vector

## Build Annoy Index

In [0]:
def build_index(source_embeddings, target_embeddings):
  f = 768
  sourceIndex = AnnoyIndex(f, 'angular')  # Length of item vector to be indexed
  targetIndex = AnnoyIndex(f, 'angular')  # Length of item vector to be indexed

  for index, source_vector in enumerate(source_embeddings):
    sourceIndex.add_item(index, source_vector)
  sourceIndex.build(100)

  for index, target_vector in enumerate(target_embeddings):
    targetIndex.add_item(index, target_vector)
  targetIndex.build(100)
  return sourceIndex, targetIndex

## Calculate Accuracy

In [0]:
def calculate_accuracy_source2target(
    targetIndex,
    test_embeddings_4_source, 
    indices_test_positions,
    shift_vector,
    kG,
    source_sentences,
    target_sentences,
):
  test_size = len(indices_test_positions)
  top_1, top_3, top_5 = 0, 0, 0
  list_of_stat_results = []


  for current_index, source_test_vector in zip(indices_test_positions, test_embeddings_4_source):
    res = {}
    res['source'] = source_sentences[current_index]
    res['target'] = target_sentences[current_index]

    predicted_target_vector = source_test_vector + shift_vector
  
    # В ходе сдвига - согласно гипотезе - должны получить перевод
    list_of_nearest_positions, list_of_distances = targetIndex.get_nns_by_vector(predicted_target_vector, n=5, include_distances=True)
    
    for index, current_position in enumerate(list_of_nearest_positions):
      res['{} translate'.format(index+1)] = target_sentences[current_position]

    if current_index == list_of_nearest_positions[0]:
      top_1 += 1
      res['label'] = 'top-1'

    for current_position in list_of_nearest_positions[:3]:
      if target_sentences[current_position] in nx.node_connected_component(kG, source_sentences[current_index]):
        top_3 += 1
        if 'label' not in res: 
          res['label'] = 'top-3'
        break

    for current_position in list_of_nearest_positions:
      if target_sentences[current_position] in nx.node_connected_component(kG, source_sentences[current_index]):
        top_5 += 1
        if 'label' not in res: 
          res['label'] = 'top-5'
        break
    list_of_stat_results.append(res)
    
  top_1 = top_1 / test_size
  top_3 = top_3 / test_size
  top_5 = top_5 / test_size

  accuracy_stat = {}
  accuracy_stat['top-1'] = top_1
  accuracy_stat['top-3'] = top_3
  accuracy_stat['top-5'] = top_5

  return accuracy_stat, pd.DataFrame(list_of_stat_results)

## Get Results

In [0]:
import time

In [0]:
def get_results(original_source_sentences, original_target_sentences,
                source_language, target_language, representation_type = 'cls'):
  start_time = time.time()
  source_sentences, target_sentences = filter_sent_length(original_source_sentences, 
                                                          original_target_sentences)


  kG = build_graph(source_sentences, target_sentences)
  source_embeddings = get_representations(source_sentences, representation_type)
  target_embeddings = get_representations(target_sentences, representation_type)

  indices_train, indices_test = train_test_split(
      np.arange(len(source_sentences)),
      random_state=42, 
      test_size=0.5)
    
  source_index, target_index = build_index(
        np.array(source_embeddings),
        np.array(target_embeddings)
        )

  train_source_embeddings = np.array(source_embeddings)[indices_train]
  train_target_embeddings = np.array(target_embeddings)[indices_train]
    
  test_source_embeddings = np.array(source_embeddings)[indices_test]
  test_target_embeddings = np.array(target_embeddings)[indices_test]

  shift_vector = get_diff_vector(train_source_embeddings, train_target_embeddings)
  accuracy_dict_source2target, resDF_source2target  = calculate_accuracy_source2target(
        target_index,
        test_source_embeddings,
        indices_test,
        shift_vector,
        kG,
        source_sentences, 
        target_sentences)
  print('{} = {} -> {} : {}'.format(representation_type, source_language, target_language, accuracy_dict_source2target))
  accuracy_dict_target2source, resDF_target2source  = calculate_accuracy_source2target(
        source_index,
        test_target_embeddings,
        indices_test,
        -shift_vector,
        kG,
        target_sentences,
        source_sentences 
        )
  print('{} = {} -> {} : {}'.format(representation_type, target_language,source_language, accuracy_dict_target2source))
  print("--- %s seconds ---" % (time.time() - start_time))
  return accuracy_dict_source2target, resDF_source2target, accuracy_dict_target2source, resDF_target2source

# Download Datasets

## Download & Unpack

In [0]:
# Download & unzip datasets
!wget https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/en-ru.txt.zip
!unzip -o en-ru.txt.zip -d ./tatoeba/
!wget https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/en-es.txt.zip
!unzip -o en-es.txt.zip -d ./tatoeba/
!wget https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/en-fr.txt.zip
!unzip -o en-fr.txt.zip -d ./tatoeba/
!wget https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/en-it.txt.zip
!unzip -o en-it.txt.zip -d ./tatoeba/
!wget https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/de-en.txt.zip
!unzip -o de-en.txt.zip -d ./tatoeba/
!wget https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/en-fi.txt.zip
!unzip -o en-fi.txt.zip -d ./tatoeba/
!wget https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/en-ja.txt.zip
!unzip -o en-ja.txt.zip -d ./tatoeba/
!wget https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/en-hi.txt.zip
!unzip -o en-hi.txt.zip -d ./tatoeba/
!wget https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/en-tr.txt.zip
!unzip -o en-tr.txt.zip -d ./tatoeba/
!wget https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/en-vi.txt.zip
!unzip -o en-vi.txt.zip -d ./tatoeba/
!wget https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/en-lt.txt.zip
!unzip -o en-lt.txt.zip -d ./tatoeba/
!wget https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/en-he.txt.zip
!unzip -o en-he.txt.zip -d ./tatoeba/
!wget https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/ar-en.txt.zip
!unzip -o ar-en.txt.zip -d ./tatoeba/
!wget https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/en-is.txt.zip
!unzip -o en-is.txt.zip -d ./tatoeba/


!unzip -o en-ru.txt.zip
!unzip -o en-es.txt.zip
!unzip -o en-fr.txt.zip
!unzip -o en-it.txt.zip
!unzip -o de-en.txt.zip
!unzip -o en-fi.txt.zip
!unzip -o en-ja.txt.zip
!unzip -o en-hi.txt.zip
!unzip -o en-tr.txt.zip
!unzip -o en-vi.txt.zip
!unzip -o en-lt.txt.zip
!unzip -o en-he.txt.zip
!unzip -o ar-en.txt.zip
!unzip -o en-is.txt.zip

--2020-05-17 12:23:31--  https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/en-ru.txt.zip
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14609714 (14M) [application/zip]
Saving to: ‘en-ru.txt.zip.1’


2020-05-17 12:23:36 (4.36 MB/s) - ‘en-ru.txt.zip.1’ saved [14609714/14609714]

Archive:  en-ru.txt.zip
  inflating: ./tatoeba/README        
  inflating: ./tatoeba/LICENSE       
  inflating: ./tatoeba/Tatoeba.en-ru.en  
  inflating: ./tatoeba/Tatoeba.en-ru.ru  
  inflating: ./tatoeba/Tatoeba.en-ru.xml  
--2020-05-17 12:23:39--  https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/en-es.txt.zip
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200

## Save to List

In [0]:
f = open('Tatoeba.en-ru.en', 'r')
enru_sentences_en = f.read().splitlines()
f.close()
f = open('Tatoeba.en-es.en', 'r')
enes_sentences_en = f.read().splitlines()
f.close()
f = open('Tatoeba.en-fr.en', 'r')
enfr_sentences_en = f.read().splitlines()
f.close()
f = open('Tatoeba.en-it.en', 'r')
enit_sentences_en = f.read().splitlines()
f.close()
f = open('Tatoeba.de-en.en', 'r')
deen_sentences_en = f.read().splitlines()
f.close()
f = open('Tatoeba.en-fi.en', 'r')
enfi_sentences_en = f.read().splitlines()
f.close()
f = open('Tatoeba.en-ja.en', 'r')
enja_sentences_en = f.read().splitlines()
f.close()
f = open('Tatoeba.en-hi.en', 'r')
enhi_sentences_en = f.read().splitlines()
f.close()
f = open('Tatoeba.en-tr.en', 'r')
entr_sentences_en = f.read().splitlines()
f.close()
f = open('Tatoeba.en-vi.en', 'r')
envi_sentences_en = f.read().splitlines()
f.close()
f = open('Tatoeba.en-lt.en', 'r')
enlt_sentences_en = f.read().splitlines()
f.close()
f = open('Tatoeba.en-he.en', 'r')
enhe_sentences_en = f.read().splitlines()
f.close()
f = open('Tatoeba.ar-en.en', 'r')
aren_sentences_en = f.read().splitlines()
f.close()
f = open('Tatoeba.en-is.en', 'r')
enis_sentences_en = f.read().splitlines()
f.close()

f = open('Tatoeba.en-ru.ru', 'r')
sentences_ru = f.read().splitlines()
f.close()
f = open('Tatoeba.en-es.es', 'r')
sentences_es = f.read().splitlines()
f.close()
f = open('Tatoeba.en-fr.fr', 'r')
sentences_fr = f.read().splitlines()
f.close()
f = open('Tatoeba.en-it.it', 'r')
sentences_it = f.read().splitlines()
f.close()
f = open('Tatoeba.de-en.de', 'r')
sentences_de = f.read().splitlines()
f.close()
f = open('Tatoeba.en-fi.fi', 'r')
sentences_fi = f.read().splitlines()
f.close()
f = open('Tatoeba.en-ja.ja', 'r')
sentences_ja = f.read().splitlines()
f.close()
f = open('Tatoeba.en-hi.hi', 'r')
sentences_hi = f.read().splitlines()
f.close()
f = open('Tatoeba.en-tr.tr', 'r')
sentences_tr = f.read().splitlines()
f.close()
f = open('Tatoeba.en-vi.vi', 'r')
sentences_vi = f.read().splitlines()
f.close()
f = open('Tatoeba.en-lt.lt', 'r')
sentences_lt = f.read().splitlines()
f.close()
f = open('Tatoeba.en-he.he', 'r')
sentences_he = f.read().splitlines()
f.close()
f = open('Tatoeba.ar-en.ar', 'r')
sentences_ar = f.read().splitlines()
f.close()
f = open('Tatoeba.en-is.is', 'r')
sentences_is = f.read().splitlines()
f.close()

# Results for Language Pairs

## English - Russian

In [0]:
LIMIT = 20000
sourceLanguage = 'en'
targetLanguage = 'ru'

### CLS Pooling

In [0]:
representation_type = 'CLS'
cls_accuracy_dict_en2ru, resDF_source2target, cls_accuracy_dict_ru2en, resDF_target2source = get_results(enru_sentences_en[:LIMIT], sentences_ru[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   0%|          | 0/157 [00:00<?, ?it/s]


We have 20000 pairs of sentences. Graph has 13886 components


Batches: 100%|██████████| 157/157 [00:37<00:00,  4.24it/s]
Batches: 100%|██████████| 157/157 [00:43<00:00,  3.60it/s]


Shape of difference (TARGET-SOURCE) vector: (10000, 768)
Shift source -> target vector: (768,)
CLS = en -> ru : {'top-1': 0.1282, 'top-3': 0.2326, 'top-5': 0.2554}
CLS = ru -> en : {'top-1': 0.0635, 'top-3': 0.1112, 'top-5': 0.1258}


In [0]:
print('Representation: {}'.format(representation_type))
display('{} -> {}'.format(sourceLanguage, targetLanguage))
display(accuracy_dict_source2target)
display(resDF_source2target.head(10))

display('{} -> {}'.format(targetLanguage, sourceLanguage ))
display(accuracy_dict_target2source)
display(resDF_target2source.head(10))

Representation: CLS


'en -> ru'

NameError: ignored

### Max Pooling

In [0]:
representation_type = 'max'
max_accuracy_dict_en2ru, resDF_source2target, max_accuracy_dict_ru2en, resDF_target2source = get_results(enru_sentences_en[:LIMIT], sentences_ru[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   0%|          | 0/157 [00:00<?, ?it/s]


We have 20000 pairs of sentences. Graph has 13886 components


Batches: 100%|██████████| 157/157 [00:36<00:00,  4.24it/s]
Batches: 100%|██████████| 157/157 [00:43<00:00,  3.61it/s]


Shape of difference (TARGET-SOURCE) vector: (10000, 768)
Shift source -> target vector: (768,)
max = en -> ru : {'top-1': 0.1712, 'top-3': 0.2984, 'top-5': 0.3317}
max = ru -> en : {'top-1': 0.1254, 'top-3': 0.2034, 'top-5': 0.2263}


In [0]:
print('Representation: {}'.format(representation_type))
display('{} -> {}'.format(sourceLanguage, targetLanguage))
display(accuracy_dict_source2target)
display(resDF_source2target.head(10))

display('{} -> {}'.format(targetLanguage, sourceLanguage ))
display(accuracy_dict_target2source)
display(resDF_target2source.head(10))

Representation: max


'en -> ru'

NameError: ignored

### Mean Pooling

In [0]:
representation_type = 'mean'
mean_accuracy_dict_en2ru, resDF_source2target, mean_accuracy_dict_ru2en, resDF_target2source = get_results(enru_sentences_en[:LIMIT], sentences_ru[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   1%|▏         | 2/157 [00:00<00:08, 19.09it/s]


We have 20000 pairs of sentences. Graph has 13886 components


Batches: 100%|██████████| 157/157 [00:12<00:00, 12.27it/s]
Batches: 100%|██████████| 157/157 [00:15<00:00, 10.33it/s]


Shape of difference (TARGET-SOURCE) vector: (10000, 768)
Shift source -> target vector: (768,)
mean = en -> ru : {'top-1': 0.2346, 'top-3': 0.4106, 'top-5': 0.4449}
mean = ru -> en : {'top-1': 0.15, 'top-3': 0.2554, 'top-5': 0.2818}


In [0]:
print('Representation: {}'.format(representation_type))
display('{} -> {}'.format(sourceLanguage, targetLanguage))
display(accuracy_dict_source2target)
display(resDF_source2target.head(10))

display('{} -> {}'.format(targetLanguage, sourceLanguage ))
display(accuracy_dict_target2source)
display(resDF_target2source.head(10))

Unnamed: 0,source,target,1 translate,2 translate,3 translate,4 translate,5 translate,label
0,I can't live like this.,Я так жить не могу.,"Я думаю, вам лучше отдохнуть.","Я люблю тебя больше, чем ты меня.","Я не могу подумать о том, чтобы жить без тебя.","Извините, я не могу остаться надолго.",Я ничего от тебя не жду.,
1,You should have told me a long time ago.,Тебе давно надо было мне об этом сказать.,"Я хочу, чтобы ты поехала.","Я подумал над тем, что ты мне сказал.","То, что ты сказал, удивило меня.",Твоё письмо обрадовало меня.,Вам давно надо было мне об этом сказать.,top-5
2,Are you going to sing here?,Ты собираешься здесь петь?,Вы собираетесь здесь петь?,Ты собираешься здесь петь?,Ты обращаешься ко мне?,Вы звонили мне вчера вечером?,Ты скучал по мне?,top-3
3,I can't wait to go on a vacation.,Не могу дождаться отпуска.,Можешь поехать туда.,"Ты должен много заниматься, чтобы догнать свой...",Не хочу идти в школу.,"Не знаю, что и сказать.","Не знаю, что и сказать.",
4,Creationism is a pseudo-science.,Креационизм — это псевдонаука.,Невинность - это прекрасная штука.,Дальше размышлять не имеет смысла.,"Математика — она как любовь: идея-то проста, н...",Образование не ограничивается одним лишь изуче...,"Есть вещи в этом мире, которые просто не могут...",
5,For some reason I feel more alive at night.,Почему-то по ночам я бодрее.,"Я жив, даже если не подаю никаких признаков жи...","Тебе нужно лучше следить за тем, что ты говоришь.",Меня никто не понимает.,Тебе лучше бы остаться сегодня в постели.,"Я хочу знать, будете ли вы завтра свободны.",
6,Thanks to you I've lost my appetite.,Я из-за тебя аппетит потерял.,"Я хочу знать, будешь ли ты завтра свободен.","Я хочу знать, будете ли Вы завтра свободны.",Хочу вновь с вами увидеться.,Хочу вновь увидеть тебя.,"Ты встаёшь не так рано, как твоя сестра.",
7,All you have to do is wash the dishes.,"Всё, что ты должен сделать, - помыть посуду.",Тебе нужно бросить курить.,Тебе нужно бросить курить.,"Прежде чем купить обувь, примерь её.","Как готовишь свою постель, так и ложись в неё.",Тебе стоит бросить пить.,
8,You look very tired.,Ты выглядишь очень уставшей.,Ты выглядишь очень бледным.,Ты выглядишь очень усталой.,"Я думаю, что ты неправ.",Меня никто не понимает.,"Ты достаточно взрослый, чтобы знать это.",top-3
9,I have to take medicine.,Я должен принимать лекарства.,Я из-за тебя аппетит потерял.,Тебе надо бы бросить курить.,Я из-за вас аппетит потерял.,Я не мог позвонить тебе - телефон не работал.,"Если я найду ваш паспорт, то немедленно позвоню.",


## English - Spanish

### CLS Pooling

In [0]:
LIMIT = 20000
sourceLanguage = 'en'
targetLanguage = 'es'
graph = build_graph(enes_sentences_en[:LIMIT], sentences_es[:LIMIT])

representation_type = 'CLS'
cls_accuracy_dict_en2es, resDF_source2target, cls_accuracy_dict_es2en, resDF_target2source = get_results(enes_sentences_en[:LIMIT], sentences_es[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


We have 20000 pairs of sentences. Graph has 16235 components


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   0%|          | 0/157 [00:00<?, ?it/s]


We have 20000 pairs of sentences. Graph has 16235 components


Batches: 100%|██████████| 157/157 [00:38<00:00,  4.06it/s]
Batches: 100%|██████████| 157/157 [00:40<00:00,  3.86it/s]


Shape of difference (TARGET-SOURCE) vector: (10000, 768)
Shift source -> target vector: (768,)
CLS = en -> es : {'top-1': 0.2302, 'top-3': 0.3278, 'top-5': 0.349}
CLS = es -> en : {'top-1': 0.0574, 'top-3': 0.0879, 'top-5': 0.0972}


### Max Pooling

In [0]:
representation_type = 'max'
max_accuracy_dict_en2es, resDF_source2target, max_accuracy_dict_es2en, resDF_target2source = get_results(enes_sentences_en[:LIMIT], sentences_es[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   0%|          | 0/157 [00:00<?, ?it/s]


We have 20000 pairs of sentences. Graph has 16235 components


Batches: 100%|██████████| 157/157 [00:38<00:00,  4.10it/s]
Batches: 100%|██████████| 157/157 [00:40<00:00,  3.86it/s]


Shape of difference (TARGET-SOURCE) vector: (10000, 768)
Shift source -> target vector: (768,)
max = en -> es : {'top-1': 0.3033, 'top-3': 0.4244, 'top-5': 0.4455}
max = es -> en : {'top-1': 0.2074, 'top-3': 0.2789, 'top-5': 0.2936}


### Mean Pooling

In [0]:
representation_type = 'mean'
mean_accuracy_dict_en2es, resDF_source2target, mean_accuracy_dict_es2en, resDF_target2source = get_results(enes_sentences_en[:LIMIT], sentences_es[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   0%|          | 0/157 [00:00<?, ?it/s]


We have 20000 pairs of sentences. Graph has 16235 components


Batches: 100%|██████████| 157/157 [00:38<00:00,  4.10it/s]
Batches: 100%|██████████| 157/157 [00:40<00:00,  3.88it/s]


Shape of difference (TARGET-SOURCE) vector: (10000, 768)
Shift source -> target vector: (768,)
mean = en -> ru : {'top-1': 0.364, 'top-3': 0.4991, 'top-5': 0.52}
mean = ru -> en : {'top-1': 0.2236, 'top-3': 0.312, 'top-5': 0.3315}


## English - French

### CLS Pooling

In [0]:
LIMIT = 20000
sourceLanguage = 'en'
targetLanguage = 'fr'
graph = build_graph(enfr_sentences_en[:LIMIT], sentences_fr[:LIMIT])

representation_type = 'CLS'
cls_accuracy_dict_en2fr, resDF_source2target, cls_accuracy_dict_fr2en, resDF_target2source = get_results(enfr_sentences_en[:LIMIT], sentences_fr[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


We have 20000 pairs of sentences. Graph has 15434 components


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   1%|▏         | 2/157 [00:00<00:08, 18.47it/s]


We have 19999 pairs of sentences. Graph has 15433 components


Batches: 100%|██████████| 157/157 [00:13<00:00, 11.56it/s]
Batches: 100%|██████████| 157/157 [00:16<00:00,  9.72it/s]


Shape of difference (TARGET-SOURCE) vector: (9999, 768)
Shift source -> target vector: (768,)
CLS = en -> fr : {'top-1': 0.1992, 'top-3': 0.2871, 'top-5': 0.308}
CLS = fr -> en : {'top-1': 0.0873, 'top-3': 0.1511, 'top-5': 0.1663}


### Max Pooling

In [0]:
representation_type = 'max'
max_accuracy_dict_en2fr, resDF_source2target, max_accuracy_dict_fr2en, resDF_target2source = get_results(enfr_sentences_en[:LIMIT], sentences_fr[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   1%|▏         | 2/157 [00:00<00:08, 18.74it/s]


We have 19999 pairs of sentences. Graph has 15433 components


Batches: 100%|██████████| 157/157 [00:13<00:00, 11.39it/s]
Batches: 100%|██████████| 157/157 [00:16<00:00,  9.55it/s]


Shape of difference (TARGET-SOURCE) vector: (9999, 768)
Shift source -> target vector: (768,)
max = en -> fr : {'top-1': 0.2677, 'top-3': 0.3825, 'top-5': 0.4039}
max = fr -> en : {'top-1': 0.2185, 'top-3': 0.3224, 'top-5': 0.346}


### Mean Pooling

In [0]:
representation_type = 'mean'
mean_accuracy_dict_en2fr, resDF_source2target, mean_accuracy_dict_fr2en, resDF_target2source = get_results(enfr_sentences_en[:LIMIT], sentences_fr[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   1%|▏         | 2/157 [00:00<00:08, 19.34it/s]


We have 19999 pairs of sentences. Graph has 15433 components


Batches: 100%|██████████| 157/157 [00:13<00:00, 11.46it/s]
Batches: 100%|██████████| 157/157 [00:16<00:00,  9.69it/s]


Shape of difference (TARGET-SOURCE) vector: (9999, 768)
Shift source -> target vector: (768,)
mean = en -> fr : {'top-1': 0.2985, 'top-3': 0.4375, 'top-5': 0.4663}
mean = fr -> en : {'top-1': 0.2337, 'top-3': 0.3561, 'top-5': 0.379}


## English - Italian

### CLS Pooling

In [0]:
LIMIT = 20000
sourceLanguage = 'en'
targetLanguage = 'it'
graph = build_graph(enit_sentences_en[:LIMIT], sentences_it[:LIMIT])

representation_type = 'CLS'
cls_accuracy_dict_en2it, resDF_source2target, cls_accuracy_dict_it2en, resDF_target2source = get_results(enit_sentences_en[:LIMIT], sentences_it[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


We have 20000 pairs of sentences. Graph has 8555 components


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   0%|          | 0/157 [00:00<?, ?it/s]


We have 19999 pairs of sentences. Graph has 8554 components


Batches: 100%|██████████| 157/157 [00:35<00:00,  4.49it/s]
Batches: 100%|██████████| 157/157 [00:37<00:00,  4.16it/s]


Shape of difference (TARGET-SOURCE) vector: (9999, 768)
Shift source -> target vector: (768,)
CLS = en -> it : {'top-1': 0.1079, 'top-3': 0.2953, 'top-5': 0.3204}
CLS = it -> en : {'top-1': 0.024, 'top-3': 0.0613, 'top-5': 0.0692}
--- 99.99169158935547 seconds ---


In [0]:
print('Representation: {}'.format(representation_type))
display('{} -> {}'.format(sourceLanguage, targetLanguage))
display(cls_accuracy_dict_en2it)
display(resDF_source2target.head(10))

display('{} -> {}'.format(targetLanguage, sourceLanguage ))
display(cls_accuracy_dict_it2en)
display(resDF_target2source.head(10))

Representation: CLS


'en -> it'

NameError: ignored

### Max Pooling

In [0]:
representation_type = 'max'
max_accuracy_dict_en2it, resDF_source2target, max_accuracy_dict_it2en, resDF_target2source = get_results(enit_sentences_en[:LIMIT], sentences_it[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   1%|▏         | 2/157 [00:00<00:07, 19.51it/s]


We have 19999 pairs of sentences. Graph has 8554 components


Batches: 100%|██████████| 157/157 [00:12<00:00, 12.76it/s]
Batches: 100%|██████████| 157/157 [00:13<00:00, 11.93it/s]


Shape of difference (TARGET-SOURCE) vector: (9999, 768)
Shift source -> target vector: (768,)
max = en -> de : {'top-1': 0.1446, 'top-3': 0.3998, 'top-5': 0.4272}
max = de -> en : {'top-1': 0.0777, 'top-3': 0.1843, 'top-5': 0.1975}


In [0]:
print('Representation: {}'.format(representation_type))
display('{} -> {}'.format(sourceLanguage, targetLanguage))
display(accuracy_dict_source2target)
display(resDF_source2target.head(10))

display('{} -> {}'.format(targetLanguage, sourceLanguage ))
display(accuracy_dict_target2source)
display(resDF_target2source.head(10))

### Mean Pooling

In [0]:
representation_type = 'mean'
mean_accuracy_dict_en2it, resDF_source2target, mean_accuracy_dict_it2en, resDF_target2source = get_results(enit_sentences_en[:LIMIT], sentences_it[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   0%|          | 0/157 [00:00<?, ?it/s]


We have 19999 pairs of sentences. Graph has 8554 components


Batches: 100%|██████████| 157/157 [00:12<00:00, 13.01it/s]
Batches: 100%|██████████| 157/157 [00:13<00:00, 12.07it/s]


Shape of difference (TARGET-SOURCE) vector: (9999, 768)
Shift source -> target vector: (768,)
mean = en -> de : {'top-1': 0.1789, 'top-3': 0.4731, 'top-5': 0.5073}
mean = de -> en : {'top-1': 0.0845, 'top-3': 0.2223, 'top-5': 0.2376}


## English - German

### CLS Pooling

In [0]:
LIMIT = 20000
sourceLanguage = 'en'
targetLanguage = 'de'
graph = build_graph(deen_sentences_en[:LIMIT], sentences_de[:LIMIT])

representation_type = 'CLS'
cls_accuracy_dict_en2de, resDF_source2target, cls_accuracy_dict_de2en, resDF_target2source = get_results(deen_sentences_en[:LIMIT], sentences_de[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


We have 20000 pairs of sentences. Graph has 14742 components


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   1%|▏         | 2/157 [00:00<00:08, 19.15it/s]


We have 19999 pairs of sentences. Graph has 14741 components


Batches: 100%|██████████| 157/157 [00:13<00:00, 11.65it/s]
Batches: 100%|██████████| 157/157 [00:15<00:00, 10.45it/s]


Shape of difference (TARGET-SOURCE) vector: (9999, 768)
Shift source -> target vector: (768,)
CLS = en -> de : {'top-1': 0.1604, 'top-3': 0.2518, 'top-5': 0.2716}
CLS = de -> en : {'top-1': 0.0495, 'top-3': 0.0823, 'top-5': 0.0935}


### Max Pooling

In [0]:
representation_type = 'max'
max_accuracy_dict_en2de, resDF_source2target, max_accuracy_dict_de2en, resDF_target2source = get_results(deen_sentences_en[:LIMIT], sentences_de[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   1%|▏         | 2/157 [00:00<00:08, 18.70it/s]


We have 19999 pairs of sentences. Graph has 14741 components


Batches: 100%|██████████| 157/157 [00:13<00:00, 11.57it/s]
Batches: 100%|██████████| 157/157 [00:15<00:00, 10.33it/s]


Shape of difference (TARGET-SOURCE) vector: (9999, 768)
Shift source -> target vector: (768,)
max = en -> de : {'top-1': 0.2354, 'top-3': 0.3735, 'top-5': 0.3986}
max = de -> en : {'top-1': 0.1688, 'top-3': 0.244, 'top-5': 0.2609}


### Mean Pooling

In [0]:
representation_type = 'mean'
mean_accuracy_dict_en2de, resDF_source2target, mean_accuracy_dict_de2en, resDF_target2source = get_results(deen_sentences_en[:LIMIT], sentences_de[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   1%|▏         | 2/157 [00:00<00:08, 19.21it/s]


We have 19999 pairs of sentences. Graph has 14741 components


Batches: 100%|██████████| 157/157 [00:13<00:00, 11.71it/s]
Batches: 100%|██████████| 157/157 [00:15<00:00, 10.41it/s]


Shape of difference (TARGET-SOURCE) vector: (9999, 768)
Shift source -> target vector: (768,)
mean = en -> de : {'top-1': 0.2892, 'top-3': 0.4482, 'top-5': 0.4766}
mean = de -> en : {'top-1': 0.1878, 'top-3': 0.2905, 'top-5': 0.3114}


## English - Finnish

### CLS Pooling

In [0]:
LIMIT = 20000
sourceLanguage = 'en'
targetLanguage = 'fi'
graph = build_graph(enfi_sentences_en[:LIMIT], sentences_fi[:LIMIT])

representation_type = 'CLS'
cls_accuracy_dict_en2fi, resDF_source2target, cls_accuracy_dict_fi2en, resDF_target2source = get_results(enfi_sentences_en[:LIMIT], sentences_fi[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


We have 20000 pairs of sentences. Graph has 13847 components


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Batches:   1%|▏         | 2/157 [00:00<00:07, 19.79it/s]

We have 19994 pairs of sentences. Graph has 13841 components


Batches: 100%|██████████| 157/157 [00:12<00:00, 12.14it/s]
Batches: 100%|██████████| 157/157 [00:15<00:00, 10.34it/s]


Shape of difference (TARGET-SOURCE) vector: (9997, 768)
Shift source -> target vector: (768,)
CLS = en -> fi : {'top-1': 0.06872061618485546, 'top-3': 0.15194558367510252, 'top-5': 0.1736520956286886}
CLS = fi -> en : {'top-1': 0.008002400720216065, 'top-3': 0.01610483144943483, 'top-5': 0.01980594178253476}


### Max Pooling

In [0]:
representation_type = 'max'
max_accuracy_dict_en2fi, resDF_source2target, max_accuracy_dict_fi2en, resDF_target2source = get_results(enfi_sentences_en[:LIMIT], sentences_fi[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   0%|          | 0/157 [00:00<?, ?it/s]


We have 19994 pairs of sentences. Graph has 13841 components


Batches: 100%|██████████| 157/157 [00:13<00:00, 12.03it/s]
Batches: 100%|██████████| 157/157 [00:15<00:00, 10.26it/s]


Shape of difference (TARGET-SOURCE) vector: (9997, 768)
Shift source -> target vector: (768,)
max = en -> fi : {'top-1': 0.07942382714814444, 'top-3': 0.1604481344403321, 'top-5': 0.18135440632189656}
max = fi -> en : {'top-1': 0.027908372511753526, 'top-3': 0.05511653496048815, 'top-5': 0.06481944583375013}


### Mean Pooling

In [0]:
representation_type = 'mean'
mean_accuracy_dict_en2fi, resDF_source2target, mean_accuracy_dict_fi2en, resDF_target2source = get_results(enfi_sentences_en[:LIMIT], sentences_fi[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   0%|          | 0/157 [00:00<?, ?it/s]


We have 19994 pairs of sentences. Graph has 13841 components


Batches: 100%|██████████| 157/157 [00:12<00:00, 12.17it/s]
Batches: 100%|██████████| 157/157 [00:15<00:00, 10.33it/s]


Shape of difference (TARGET-SOURCE) vector: (9997, 768)
Shift source -> target vector: (768,)
mean = en -> fi : {'top-1': 0.12243673101930579, 'top-3': 0.2546764029208763, 'top-5': 0.2770831249374812}
mean = fi -> en : {'top-1': 0.04481344403320996, 'top-3': 0.08262478743623088, 'top-5': 0.09682904871461438}


## English - Japanese

### CLS Pooling

In [0]:
LIMIT = 20000
sourceLanguage = 'en'
targetLanguage = 'ja'
graph = build_graph(enja_sentences_en[:LIMIT], sentences_ja[:LIMIT])

representation_type = 'CLS'
cls_accuracy_dict_en2ja, resDF_source2target, cls_accuracy_dict_ja2en, resDF_target2source = get_results(enja_sentences_en[:LIMIT], sentences_ja[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


We have 20000 pairs of sentences. Graph has 16178 components


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   1%|▏         | 2/157 [00:00<00:08, 18.13it/s]


We have 20000 pairs of sentences. Graph has 16178 components


Batches: 100%|██████████| 157/157 [00:14<00:00, 10.87it/s]
Batches: 100%|██████████| 157/157 [00:15<00:00,  9.92it/s]


Shape of difference (TARGET-SOURCE) vector: (10000, 768)
Shift source -> target vector: (768,)
CLS = en -> ja : {'top-1': 0.0641, 'top-3': 0.1105, 'top-5': 0.1286}
CLS = ja -> en : {'top-1': 0.0318, 'top-3': 0.0585, 'top-5': 0.0683}


### Max Pooling

In [0]:
representation_type = 'max'
max_accuracy_dict_en2ja, resDF_source2target, max_accuracy_dict_ja2en, resDF_target2source = get_results(enja_sentences_en[:LIMIT], sentences_ja[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   1%|▏         | 2/157 [00:00<00:08, 18.57it/s]


We have 20000 pairs of sentences. Graph has 16178 components


Batches: 100%|██████████| 157/157 [00:14<00:00, 10.70it/s]
Batches: 100%|██████████| 157/157 [00:15<00:00,  9.83it/s]


Shape of difference (TARGET-SOURCE) vector: (10000, 768)
Shift source -> target vector: (768,)
max = en -> ja : {'top-1': 0.092, 'top-3': 0.1609, 'top-5': 0.1823}
max = ja -> en : {'top-1': 0.0591, 'top-3': 0.1038, 'top-5': 0.1203}


### Mean Pooling

In [0]:
representation_type = 'mean'
mean_accuracy_dict_en2ja, resDF_source2target, mean_accuracy_dict_ja2en, resDF_target2source = get_results(enja_sentences_en[:LIMIT], sentences_ja[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   1%|▏         | 2/157 [00:00<00:08, 18.75it/s]


We have 20000 pairs of sentences. Graph has 16178 components


Batches: 100%|██████████| 157/157 [00:14<00:00, 10.90it/s]
Batches: 100%|██████████| 157/157 [00:15<00:00,  9.87it/s]


Shape of difference (TARGET-SOURCE) vector: (10000, 768)
Shift source -> target vector: (768,)
mean = en -> de : {'top-1': 0.1195, 'top-3': 0.2068, 'top-5': 0.2331}
mean = de -> en : {'top-1': 0.0814, 'top-3': 0.1391, 'top-5': 0.1585}


## English - Hindi

### CLS Pooling

In [0]:
LIMIT = 20000
sourceLanguage = 'en'
targetLanguage = 'hi'
graph = build_graph(enhi_sentences_en[:LIMIT], sentences_hi[:LIMIT])

representation_type = 'CLS'
cls_accuracy_dict_en2hi, resDF_source2target, cls_accuracy_dict_hi2en, resDF_target2source = get_results(enhi_sentences_en[:LIMIT], sentences_hi[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


We have 10815 pairs of sentences. Graph has 8156 components


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   0%|          | 0/85 [00:00<?, ?it/s]


We have 10814 pairs of sentences. Graph has 8155 components


Batches: 100%|██████████| 85/85 [00:06<00:00, 13.48it/s]
Batches: 100%|██████████| 85/85 [00:08<00:00,  9.62it/s]


Shape of difference (TARGET-SOURCE) vector: (5407, 768)
Shift source -> target vector: (768,)
CLS = en -> hi : {'top-1': 0.07989643055298687, 'top-3': 0.15387460699093766, 'top-5': 0.1803218050675051}
CLS = hi -> en : {'top-1': 0.01424079896430553, 'top-3': 0.03495468836693175, 'top-5': 0.04142777880525245}


### Max Pooling

In [0]:
representation_type = 'max'
max_accuracy_dict_en2hi, resDF_source2target, max_accuracy_dict_hi2en, resDF_target2source = get_results(enhi_sentences_en[:LIMIT], sentences_hi[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   2%|▏         | 2/85 [00:00<00:04, 19.86it/s]


We have 10814 pairs of sentences. Graph has 8155 components


Batches: 100%|██████████| 85/85 [00:06<00:00, 13.40it/s]
Batches: 100%|██████████| 85/85 [00:08<00:00,  9.45it/s]


Shape of difference (TARGET-SOURCE) vector: (5407, 768)
Shift source -> target vector: (768,)
max = en -> hi : {'top-1': 0.0948770112816719, 'top-3': 0.18161642315516924, 'top-5': 0.21361198446458296}
max = hi -> en : {'top-1': 0.04808581468466802, 'top-3': 0.09117810245977437, 'top-5': 0.11004253745145182}


### Mean Pooling

In [0]:
representation_type = 'mean'
mean_accuracy_dict_en2hi, resDF_source2target, mean_accuracy_dict_hi2en, resDF_target2source = get_results(enhi_sentences_en[:LIMIT], sentences_hi[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


We have 10814 pairs of sentences. Graph has 8155 components


Batches: 100%|██████████| 85/85 [00:06<00:00, 13.48it/s]
Batches: 100%|██████████| 85/85 [00:08<00:00,  9.58it/s]


Shape of difference (TARGET-SOURCE) vector: (5407, 768)
Shift source -> target vector: (768,)
mean = en -> hi : {'top-1': 0.14388755317181431, 'top-3': 0.2620676900314407, 'top-5': 0.30479008692435733}
mean = hi -> en : {'top-1': 0.07064915849824302, 'top-3': 0.12594784538561124, 'top-5': 0.15073053449232476}


## English - Turkish

### CLS Pooling

In [0]:
LIMIT = 20000
sourceLanguage = 'en'
targetLanguage = 'tr'
graph = build_graph(entr_sentences_en[:LIMIT], sentences_tr[:LIMIT])

representation_type = 'CLS'
cls_accuracy_dict_en2tr, resDF_source2target, cls_accuracy_dict_tr2en, resDF_target2source = get_results(entr_sentences_en[:LIMIT], sentences_tr[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


We have 20000 pairs of sentences. Graph has 18457 components


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   1%|▏         | 2/157 [00:00<00:08, 18.77it/s]


We have 20000 pairs of sentences. Graph has 18457 components


Batches: 100%|██████████| 157/157 [00:13<00:00, 11.43it/s]
Batches: 100%|██████████| 157/157 [00:15<00:00,  9.94it/s]


Shape of difference (TARGET-SOURCE) vector: (10000, 768)
Shift source -> target vector: (768,)
CLS = en -> tr : {'top-1': 0.0806, 'top-3': 0.1258, 'top-5': 0.1414}
CLS = tr -> en : {'top-1': 0.0099, 'top-3': 0.0178, 'top-5': 0.0238}


### Max Pooling

In [0]:
representation_type = 'max'
max_accuracy_dict_en2tr, resDF_source2target, max_accuracy_dict_tr2en, resDF_target2source = get_results(entr_sentences_en[:LIMIT], sentences_tr[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   1%|▏         | 2/157 [00:00<00:08, 17.70it/s]


We have 20000 pairs of sentences. Graph has 18457 components


Batches: 100%|██████████| 157/157 [00:13<00:00, 11.32it/s]
Batches: 100%|██████████| 157/157 [00:15<00:00,  9.90it/s]


Shape of difference (TARGET-SOURCE) vector: (10000, 768)
Shift source -> target vector: (768,)
max = en -> tr : {'top-1': 0.0766, 'top-3': 0.1166, 'top-5': 0.1342}
max = tr -> en : {'top-1': 0.0365, 'top-3': 0.0565, 'top-5': 0.0662}


### Mean Pooling

In [0]:
representation_type = 'mean'
mean_accuracy_dict_en2tr, resDF_source2target, mean_accuracy_dict_tr2en, resDF_target2source = get_results(entr_sentences_en[:LIMIT], sentences_tr[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   1%|▏         | 2/157 [00:00<00:08, 18.51it/s]


We have 20000 pairs of sentences. Graph has 18457 components


Batches: 100%|██████████| 157/157 [00:13<00:00, 11.45it/s]
Batches: 100%|██████████| 157/157 [00:15<00:00,  9.99it/s]


Shape of difference (TARGET-SOURCE) vector: (10000, 768)
Shift source -> target vector: (768,)
mean = en -> tr : {'top-1': 0.1339, 'top-3': 0.1947, 'top-5': 0.2191}
mean = tr -> en : {'top-1': 0.0524, 'top-3': 0.0815, 'top-5': 0.0944}


## English - Vietnamese

### CLS Pooling

In [0]:
LIMIT = 20000
sourceLanguage = 'en'
targetLanguage = 'vi'
graph = build_graph(envi_sentences_en[:LIMIT], sentences_vi[:LIMIT])

representation_type = 'CLS'
cls_accuracy_dict_en2vi, resDF_source2target, cls_accuracy_dict_vi2en, resDF_target2source = get_results(envi_sentences_en[:LIMIT], sentences_vi[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


We have 5318 pairs of sentences. Graph has 4677 components


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   5%|▍         | 2/42 [00:00<00:02, 17.75it/s]


We have 5315 pairs of sentences. Graph has 4675 components


Batches: 100%|██████████| 42/42 [00:03<00:00, 11.50it/s]
Batches: 100%|██████████| 42/42 [00:04<00:00, 10.27it/s]


Shape of difference (TARGET-SOURCE) vector: (2657, 768)
Shift source -> target vector: (768,)
CLS = en -> vi : {'top-1': 0.2287434161023326, 'top-3': 0.33559066967644846, 'top-5': 0.3619262603461249}
CLS = vi -> en : {'top-1': 0.12227238525206922, 'top-3': 0.17832957110609482, 'top-5': 0.19563581640331076}


### Max Pooling

In [0]:
representation_type = 'max'
max_accuracy_dict_en2vi, resDF_source2target, max_accuracy_dict_vi2en, resDF_target2source = get_results(envi_sentences_en[:LIMIT], sentences_vi[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   0%|          | 0/42 [00:00<?, ?it/s]


We have 5315 pairs of sentences. Graph has 4675 components


Batches: 100%|██████████| 42/42 [00:03<00:00, 11.55it/s]
Batches: 100%|██████████| 42/42 [00:04<00:00, 10.31it/s]


Shape of difference (TARGET-SOURCE) vector: (2657, 768)
Shift source -> target vector: (768,)
max = en -> vi : {'top-1': 0.31715575620767494, 'top-3': 0.45560571858540255, 'top-5': 0.4932279909706546}
max = vi -> en : {'top-1': 0.20842738901429647, 'top-3': 0.29533483822422874, 'top-5': 0.3258088788562829}


### Mean Pooling

In [0]:
representation_type = 'mean'
mean_accuracy_dict_en2vi, resDF_source2target, mean_accuracy_dict_vi2en, resDF_target2source = get_results(envi_sentences_en[:LIMIT], sentences_vi[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   0%|          | 0/42 [00:00<?, ?it/s]


We have 5315 pairs of sentences. Graph has 4675 components


Batches: 100%|██████████| 42/42 [00:03<00:00, 11.67it/s]
Batches: 100%|██████████| 42/42 [00:04<00:00, 10.35it/s]


Shape of difference (TARGET-SOURCE) vector: (2657, 768)
Shift source -> target vector: (768,)
mean = en -> vi : {'top-1': 0.3694507148231753, 'top-3': 0.5105342362678705, 'top-5': 0.5500376222723853}
mean = vi -> en : {'top-1': 0.2656132430398796, 'top-3': 0.3623024830699774, 'top-5': 0.4006772009029345}


## English - Lithuanian

### CLS Pooling

In [0]:
LIMIT = 20000
sourceLanguage = 'en'
targetLanguage = 'lt'
graph = build_graph(enlt_sentences_en[:LIMIT], sentences_lt[:LIMIT])

representation_type = 'CLS'
cls_accuracy_dict_en2lt, resDF_source2target, cls_accuracy_dict_lt2en, resDF_target2source = get_results(enlt_sentences_en[:LIMIT], sentences_lt[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


We have 8129 pairs of sentences. Graph has 7165 components


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   0%|          | 0/64 [00:00<?, ?it/s]


We have 8128 pairs of sentences. Graph has 7164 components


Batches: 100%|██████████| 64/64 [00:04<00:00, 12.88it/s]
Batches: 100%|██████████| 64/64 [00:05<00:00, 10.98it/s]


Shape of difference (TARGET-SOURCE) vector: (4064, 768)
Shift source -> target vector: (768,)
CLS = en -> lt : {'top-1': 0.07726377952755906, 'top-3': 0.12253937007874016, 'top-5': 0.14173228346456693}
CLS = lt -> en : {'top-1': 0.010088582677165354, 'top-3': 0.019192913385826772, 'top-5': 0.02485236220472441}


### Max Pooling

In [0]:
representation_type = 'max'
max_accuracy_dict_en2lt, resDF_source2target, max_accuracy_dict_lt2en, resDF_target2source = get_results(enlt_sentences_en[:LIMIT], sentences_lt[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   0%|          | 0/64 [00:00<?, ?it/s]


We have 8128 pairs of sentences. Graph has 7164 components


Batches: 100%|██████████| 64/64 [00:05<00:00, 12.64it/s]
Batches: 100%|██████████| 64/64 [00:05<00:00, 10.87it/s]


Shape of difference (TARGET-SOURCE) vector: (4064, 768)
Shift source -> target vector: (768,)
max = en -> lt : {'top-1': 0.08513779527559055, 'top-3': 0.13065944881889763, 'top-5': 0.14714566929133857}
max = lt -> en : {'top-1': 0.028543307086614175, 'top-3': 0.05044291338582677, 'top-5': 0.0625}


### Mean Pooling

In [0]:
representation_type = 'mean'
mean_accuracy_dict_en2lt, resDF_source2target, mean_accuracy_dict_lt2en, resDF_target2source = get_results(enlt_sentences_en[:LIMIT], sentences_lt[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Batches:   3%|▎         | 2/64 [00:00<00:03, 16.90it/s]

We have 8128 pairs of sentences. Graph has 7164 components


Batches: 100%|██████████| 64/64 [00:05<00:00, 12.78it/s]
Batches: 100%|██████████| 64/64 [00:05<00:00, 10.97it/s]


Shape of difference (TARGET-SOURCE) vector: (4064, 768)
Shift source -> target vector: (768,)
mean = en -> lt : {'top-1': 0.12696850393700787, 'top-3': 0.19783464566929135, 'top-5': 0.2283464566929134}
mean = lt -> en : {'top-1': 0.048474409448818895, 'top-3': 0.08562992125984252, 'top-5': 0.09940944881889764}


## English - Hebrew

### CLS Pooling

In [0]:
LIMIT = 20000
sourceLanguage = 'en'
targetLanguage = 'he'
graph = build_graph(enhe_sentences_en[:LIMIT], sentences_he[:LIMIT])

representation_type = 'CLS'
cls_accuracy_dict_en2he, resDF_source2target, cls_accuracy_dict_he2en, resDF_target2source = get_results(enhe_sentences_en[:LIMIT], sentences_he[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


We have 20000 pairs of sentences. Graph has 14721 components


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   0%|          | 0/157 [00:00<?, ?it/s]


We have 20000 pairs of sentences. Graph has 14721 components


Batches: 100%|██████████| 157/157 [00:13<00:00, 12.04it/s]
Batches: 100%|██████████| 157/157 [00:14<00:00, 10.68it/s]


Shape of difference (TARGET-SOURCE) vector: (10000, 768)
Shift source -> target vector: (768,)
CLS = en -> he : {'top-1': 0.0906, 'top-3': 0.1706, 'top-5': 0.1941}
CLS = he -> en : {'top-1': 0.0166, 'top-3': 0.031, 'top-5': 0.0367}


### Max Pooling

In [0]:
representation_type = 'max'
max_accuracy_dict_en2he, resDF_source2target, max_accuracy_dict_he2en, resDF_target2source = get_results(enhe_sentences_en[:LIMIT], sentences_he[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   1%|▏         | 2/157 [00:00<00:08, 19.09it/s]


We have 20000 pairs of sentences. Graph has 14721 components


Batches: 100%|██████████| 157/157 [00:13<00:00, 11.84it/s]
Batches: 100%|██████████| 157/157 [00:14<00:00, 10.62it/s]


Shape of difference (TARGET-SOURCE) vector: (10000, 768)
Shift source -> target vector: (768,)
max = en -> he : {'top-1': 0.1042, 'top-3': 0.1989, 'top-5': 0.2205}
max = he -> en : {'top-1': 0.0633, 'top-3': 0.102, 'top-5': 0.1187}


### Mean Pooling

In [0]:
representation_type = 'mean'
mean_accuracy_dict_en2he, resDF_source2target, mean_accuracy_dict_he2en, resDF_target2source = get_results(enhe_sentences_en[:LIMIT], sentences_he[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   1%|▏         | 2/157 [00:00<00:08, 19.36it/s]


We have 20000 pairs of sentences. Graph has 14721 components


Batches: 100%|██████████| 157/157 [00:13<00:00, 11.93it/s]
Batches: 100%|██████████| 157/157 [00:14<00:00, 10.63it/s]


Shape of difference (TARGET-SOURCE) vector: (10000, 768)
Shift source -> target vector: (768,)
mean = en -> he : {'top-1': 0.1579, 'top-3': 0.2798, 'top-5': 0.3129}
mean = he -> en : {'top-1': 0.0774, 'top-3': 0.1325, 'top-5': 0.1497}


## English - Icelandic

### CLS Pooling

In [0]:
LIMIT = 20000
sourceLanguage = 'en'
targetLanguage = 'is'
graph = build_graph(enis_sentences_en[:LIMIT], sentences_is[:LIMIT])

representation_type = 'CLS'
cls_accuracy_dict_en2is, resDF_source2target, cls_accuracy_dict_is2en, resDF_target2source = get_results(enis_sentences_en[:LIMIT], sentences_is[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


We have 9436 pairs of sentences. Graph has 7934 components


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   0%|          | 0/74 [00:00<?, ?it/s]


We have 9436 pairs of sentences. Graph has 7934 components


Batches: 100%|██████████| 74/74 [00:06<00:00, 12.13it/s]
Batches: 100%|██████████| 74/74 [00:07<00:00,  9.47it/s]


Shape of difference (TARGET-SOURCE) vector: (4718, 768)
Shift source -> target vector: (768,)
CLS = en -> is : {'top-1': 0.0847816871555744, 'top-3': 0.14497668503603223, 'top-5': 0.16701992369648155}
CLS = is -> en : {'top-1': 0.011233573548113607, 'top-3': 0.020347604917337857, 'top-5': 0.024162780839338704}


### Max Pooling

In [0]:
representation_type = 'max'
max_accuracy_dict_en2is, resDF_source2target, max_accuracy_dict_is2en, resDF_target2source = get_results(enis_sentences_en[:LIMIT], sentences_is[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   0%|          | 0/74 [00:00<?, ?it/s]


We have 9436 pairs of sentences. Graph has 7934 components


Batches: 100%|██████████| 74/74 [00:06<00:00, 11.83it/s]
Batches: 100%|██████████| 74/74 [00:07<00:00,  9.34it/s]


Shape of difference (TARGET-SOURCE) vector: (4718, 768)
Shift source -> target vector: (768,)
max = en -> is : {'top-1': 0.07799915218312845, 'top-3': 0.139041966935142, 'top-5': 0.1619330224671471}
max = is -> en : {'top-1': 0.03603221704111912, 'top-3': 0.06464603645612547, 'top-5': 0.07587961000423908}


### Mean Pooling

In [0]:
representation_type = 'mean'
mean_accuracy_dict_en2is, resDF_source2target, mean_accuracy_dict_is2en, resDF_target2source = get_results(enis_sentences_en[:LIMIT], sentences_is[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


We have 9436 pairs of sentences. Graph has 7934 components


Batches: 100%|██████████| 74/74 [00:06<00:00, 12.07it/s]
Batches: 100%|██████████| 74/74 [00:07<00:00,  9.46it/s]


Shape of difference (TARGET-SOURCE) vector: (4718, 768)
Shift source -> target vector: (768,)
mean = en -> is : {'top-1': 0.12568885120813905, 'top-3': 0.21343789741415853, 'top-5': 0.2414158541754981}
mean = is -> en : {'top-1': 0.06252649427723611, 'top-3': 0.10640101738024586, 'top-5': 0.12462908011869436}


## English - Arabic

### CLS Pooling

In [0]:
LIMIT = 20000
sourceLanguage = 'en'
targetLanguage = 'ar'
graph = build_graph(aren_sentences_en[:LIMIT], sentences_ar[:LIMIT])

representation_type = 'CLS'
cls_accuracy_dict_en2ar, resDF_source2target, cls_accuracy_dict_ar2en, resDF_target2source = get_results(aren_sentences_en[:LIMIT], sentences_ar[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


We have 20000 pairs of sentences. Graph has 16970 components


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   0%|          | 0/157 [00:00<?, ?it/s]


We have 19971 pairs of sentences. Graph has 16941 components


Batches: 100%|██████████| 157/157 [00:12<00:00, 12.22it/s]
Batches: 100%|██████████| 157/157 [00:16<00:00,  9.81it/s]


Shape of difference (TARGET-SOURCE) vector: (9985, 768)
Shift source -> target vector: (768,)
CLS = en -> ar : {'top-1': 0.06408972561586221, 'top-3': 0.10454636491087523, 'top-5': 0.12016823552974164}
CLS = ar -> en : {'top-1': 0.009413178449829761, 'top-3': 0.020128179451231725, 'top-5': 0.025435609853795314}


### Max Pooling

In [0]:
representation_type = 'max'
max_accuracy_dict_en2ar, resDF_source2target, max_accuracy_dict_ar2en, resDF_target2source = get_results(aren_sentences_en[:LIMIT], sentences_ar[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   0%|          | 0/157 [00:00<?, ?it/s]


We have 19971 pairs of sentences. Graph has 16941 components


Batches: 100%|██████████| 157/157 [00:13<00:00, 12.02it/s]
Batches: 100%|██████████| 157/157 [00:16<00:00,  9.69it/s]


Shape of difference (TARGET-SOURCE) vector: (9985, 768)
Shift source -> target vector: (768,)
max = en -> ar : {'top-1': 0.06809533346685359, 'top-3': 0.11686360905267375, 'top-5': 0.13248547967154015}
max = ar -> en : {'top-1': 0.035349489284999, 'top-3': 0.060684958942519525, 'top-5': 0.0713999599439215}


### Mean Pooling

In [0]:
representation_type = 'mean'
mean_accuracy_dict_en2ar, resDF_source2target, mean_accuracy_dict_ar2en, resDF_target2source = get_results(aren_sentences_en[:LIMIT], sentences_ar[:LIMIT], sourceLanguage, targetLanguage, representation_type)

resDF_source2target.to_csv('{}--{}-{}.csv'.format(representation_type, sourceLanguage, targetLanguage))
resDF_target2source.to_csv('{}--{}-{}.csv'.format(representation_type, targetLanguage, sourceLanguage))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Batches:   1%|▏         | 2/157 [00:00<00:07, 19.87it/s]


We have 19971 pairs of sentences. Graph has 16941 components


Batches: 100%|██████████| 157/157 [00:12<00:00, 12.16it/s]
Batches: 100%|██████████| 157/157 [00:15<00:00,  9.83it/s]


Shape of difference (TARGET-SOURCE) vector: (9985, 768)
Shift source -> target vector: (768,)
mean = en -> ar : {'top-1': 0.11365912277188063, 'top-3': 0.18225515722010815, 'top-5': 0.2034848788303625}
mean = ar -> en : {'top-1': 0.05127178049268977, 'top-3': 0.08662126977768876, 'top-5': 0.10114159823753255}


# Comparison

## CLS Pooling: Source(En) to Target

In [0]:
comparison_s2t_dict = dict() 
comparison_s2t_dict['EN --> RU'] = cls_accuracy_dict_en2ru['top-1'], cls_accuracy_dict_en2ru['top-3'], cls_accuracy_dict_en2ru['top-5']
comparison_s2t_dict['EN --> ES'] = cls_accuracy_dict_en2es['top-1'], cls_accuracy_dict_en2es['top-3'], cls_accuracy_dict_en2es['top-5']
comparison_s2t_dict['EN --> FR'] = cls_accuracy_dict_en2fr['top-1'], cls_accuracy_dict_en2fr['top-3'], cls_accuracy_dict_en2fr['top-5']
comparison_s2t_dict['EN --> IT'] = cls_accuracy_dict_en2it['top-1'], cls_accuracy_dict_en2it['top-3'], cls_accuracy_dict_en2it['top-5']
comparison_s2t_dict['EN --> DE'] = cls_accuracy_dict_en2de['top-1'], cls_accuracy_dict_en2de['top-3'], cls_accuracy_dict_en2de['top-5']
comparison_s2t_dict['EN --> FI'] = cls_accuracy_dict_en2fi['top-1'], cls_accuracy_dict_en2fi['top-3'], cls_accuracy_dict_en2fi['top-5']
comparison_s2t_dict['EN --> JA'] = cls_accuracy_dict_en2ja['top-1'], cls_accuracy_dict_en2ja['top-3'], cls_accuracy_dict_en2ja['top-5']
comparison_s2t_dict['EN --> HI'] = cls_accuracy_dict_en2hi['top-1'], cls_accuracy_dict_en2hi['top-3'], cls_accuracy_dict_en2hi['top-5']
comparison_s2t_dict['EN --> TR'] = cls_accuracy_dict_en2tr['top-1'], cls_accuracy_dict_en2tr['top-3'], cls_accuracy_dict_en2tr['top-5']
comparison_s2t_dict['EN --> VI'] = cls_accuracy_dict_en2vi['top-1'], cls_accuracy_dict_en2vi['top-3'], cls_accuracy_dict_en2vi['top-5']
comparison_s2t_dict['EN --> LT'] = cls_accuracy_dict_en2lt['top-1'], cls_accuracy_dict_en2lt['top-3'], cls_accuracy_dict_en2lt['top-5']
comparison_s2t_dict['EN --> HE'] = cls_accuracy_dict_en2he['top-1'], cls_accuracy_dict_en2he['top-3'], cls_accuracy_dict_en2he['top-5']
comparison_s2t_dict['EN --> AR'] = cls_accuracy_dict_en2ar['top-1'], cls_accuracy_dict_en2ar['top-3'], cls_accuracy_dict_en2ar['top-5']
comparison_s2t_dict['EN --> IS'] = cls_accuracy_dict_en2is['top-1'], cls_accuracy_dict_en2is['top-3'], cls_accuracy_dict_en2is['top-5']
comparison_s2t_df = pd.DataFrame(comparison_s2t_dict, index=['TOP1', 'TOP2', 'TOP3'])
comparison_s2t_df = comparison_s2t_df.T
print("***********CLS Pooling: Source(En) to Target*************")
comparison_s2t_df

***********CLS Pooling: Source(En) to Target*************


Unnamed: 0,TOP1,TOP2,TOP3
EN --> RU,0.1274,0.2331,0.253
EN --> ES,0.2258,0.321,0.3435
EN --> FR,0.1992,0.2871,0.308
EN --> IT,0.1065,0.2923,0.3179
EN --> DE,0.1604,0.2518,0.2716
EN --> FI,0.068721,0.151946,0.173652
EN --> JA,0.1195,0.2068,0.2331
EN --> HI,0.079896,0.153875,0.180322
EN --> TR,0.0806,0.1258,0.1414
EN --> VI,0.228743,0.335591,0.361926


## CLS Pooling: Target to Source(En)

In [0]:
comparison_t2s_dict = dict() 
comparison_t2s_dict['RU --> EN'] = cls_accuracy_dict_ru2en['top-1'], cls_accuracy_dict_ru2en['top-3'], cls_accuracy_dict_ru2en['top-5']
comparison_t2s_dict['ES --> EN'] = cls_accuracy_dict_es2en['top-1'], cls_accuracy_dict_es2en['top-3'], cls_accuracy_dict_es2en['top-5']
comparison_t2s_dict['FR --> EN'] = cls_accuracy_dict_fr2en['top-1'], cls_accuracy_dict_fr2en['top-3'], cls_accuracy_dict_fr2en['top-5']
comparison_t2s_dict['IT --> EN'] = cls_accuracy_dict_it2en['top-1'], cls_accuracy_dict_it2en['top-3'], cls_accuracy_dict_it2en['top-5']
comparison_t2s_dict['DE --> EN'] = cls_accuracy_dict_de2en['top-1'], cls_accuracy_dict_de2en['top-3'], cls_accuracy_dict_de2en['top-5']
comparison_t2s_dict['FI --> EN'] = cls_accuracy_dict_fi2en['top-1'], cls_accuracy_dict_fi2en['top-3'], cls_accuracy_dict_fi2en['top-5']
comparison_t2s_dict['JA --> EN'] = cls_accuracy_dict_ja2en['top-1'], cls_accuracy_dict_ja2en['top-3'], cls_accuracy_dict_ja2en['top-5']
comparison_t2s_dict['HI --> EN'] = cls_accuracy_dict_hi2en['top-1'], cls_accuracy_dict_hi2en['top-3'], cls_accuracy_dict_hi2en['top-5']
comparison_t2s_dict['TR --> EN'] = cls_accuracy_dict_tr2en['top-1'], cls_accuracy_dict_tr2en['top-3'], cls_accuracy_dict_tr2en['top-5']
comparison_t2s_dict['VI --> EN'] = cls_accuracy_dict_vi2en['top-1'], cls_accuracy_dict_vi2en['top-3'], cls_accuracy_dict_vi2en['top-5']
comparison_t2s_dict['LT --> EN'] = cls_accuracy_dict_lt2en['top-1'], cls_accuracy_dict_lt2en['top-3'], cls_accuracy_dict_lt2en['top-5']
comparison_t2s_dict['HE --> EN'] = cls_accuracy_dict_he2en['top-1'], cls_accuracy_dict_he2en['top-3'], cls_accuracy_dict_he2en['top-5']
comparison_t2s_dict['AR --> EN'] = cls_accuracy_dict_ar2en['top-1'], cls_accuracy_dict_ar2en['top-3'], cls_accuracy_dict_ar2en['top-5']
comparison_t2s_dict['IS --> EN'] = cls_accuracy_dict_is2en['top-1'], cls_accuracy_dict_is2en['top-3'], cls_accuracy_dict_is2en['top-5']
comparison_t2s_df = pd.DataFrame(comparison_t2s_dict, index=['TOP1', 'TOP2', 'TOP3'])
comparison_t2s_df = comparison_t2s_df.T
print("***********CLS Pooling: Target to Source*************")
comparison_t2s_df

***********CLS Pooling: Target to Source*************


Unnamed: 0,TOP1,TOP2,TOP3
RU --> EN,0.0636,0.1113,0.1269
ES --> EN,0.057,0.0876,0.0972
FR --> EN,0.0873,0.1511,0.1663
IT --> EN,0.0226,0.0597,0.0668
DE --> EN,0.0495,0.0823,0.0935
FI --> EN,0.008002,0.016105,0.019806
JA --> EN,0.0814,0.1391,0.1585
HI --> EN,0.014241,0.034955,0.041428
TR --> EN,0.0099,0.0178,0.0238
VI --> EN,0.122272,0.17833,0.195636


## Max Pooling: Source(En) to Target

In [0]:
comparison_s2t_dict_max = dict() 
comparison_s2t_dict_max['EN --> RU'] = max_accuracy_dict_en2ru['top-1'], max_accuracy_dict_en2ru['top-3'], max_accuracy_dict_en2ru['top-5']
comparison_s2t_dict_max['EN --> ES'] = max_accuracy_dict_en2es['top-1'], max_accuracy_dict_en2es['top-3'], max_accuracy_dict_en2es['top-5']
comparison_s2t_dict_max['EN --> FR'] = max_accuracy_dict_en2fr['top-1'], max_accuracy_dict_en2fr['top-3'], max_accuracy_dict_en2fr['top-5']
comparison_s2t_dict_max['EN --> IT'] = max_accuracy_dict_en2it['top-1'], max_accuracy_dict_en2it['top-3'], max_accuracy_dict_en2it['top-5']
comparison_s2t_dict_max['EN --> DE'] = max_accuracy_dict_en2de['top-1'], max_accuracy_dict_en2de['top-3'], max_accuracy_dict_en2de['top-5']
comparison_s2t_dict_max['EN --> FI'] = max_accuracy_dict_en2fi['top-1'], max_accuracy_dict_en2fi['top-3'], max_accuracy_dict_en2fi['top-5']
comparison_s2t_dict_max['EN --> JA'] = max_accuracy_dict_en2ja['top-1'], max_accuracy_dict_en2ja['top-3'], max_accuracy_dict_en2ja['top-5']
comparison_s2t_dict_max['EN --> HI'] = max_accuracy_dict_en2hi['top-1'], max_accuracy_dict_en2hi['top-3'], max_accuracy_dict_en2hi['top-5']
comparison_s2t_dict_max['EN --> TR'] = max_accuracy_dict_en2tr['top-1'], max_accuracy_dict_en2tr['top-3'], max_accuracy_dict_en2tr['top-5']
comparison_s2t_dict_max['EN --> VI'] = max_accuracy_dict_en2vi['top-1'], max_accuracy_dict_en2vi['top-3'], max_accuracy_dict_en2vi['top-5']
comparison_s2t_dict_max['EN --> LT'] = max_accuracy_dict_en2lt['top-1'], max_accuracy_dict_en2lt['top-3'], max_accuracy_dict_en2lt['top-5']
comparison_s2t_dict_max['EN --> HE'] = max_accuracy_dict_en2he['top-1'], max_accuracy_dict_en2he['top-3'], max_accuracy_dict_en2he['top-5']
comparison_s2t_dict_max['EN --> AR'] = max_accuracy_dict_en2ar['top-1'], max_accuracy_dict_en2ar['top-3'], max_accuracy_dict_en2ar['top-5']
comparison_s2t_dict_max['EN --> IS'] = max_accuracy_dict_en2is['top-1'], max_accuracy_dict_en2is['top-3'], max_accuracy_dict_en2is['top-5']
comparison_s2t_df_max = pd.DataFrame(comparison_s2t_dict_max, index=['TOP1', 'TOP2', 'TOP3'])
comparison_s2t_df_max = comparison_s2t_df_max.T
print("***********Max Pooling: Source(En) to Target*************")
comparison_s2t_df_max

***********Max Pooling: Source(En) to Target*************


Unnamed: 0,TOP1,TOP2,TOP3
EN --> RU,0.1705,0.2977,0.3326
EN --> ES,0.3097,0.4342,0.4584
EN --> FR,0.2677,0.3825,0.4039
EN --> IT,0.1446,0.3998,0.4272
EN --> DE,0.2354,0.3735,0.3986
EN --> FI,0.079424,0.160448,0.181354
EN --> JA,0.092,0.1609,0.1823
EN --> HI,0.094877,0.181616,0.213612
EN --> TR,0.0766,0.1166,0.1342
EN --> VI,0.317156,0.455606,0.493228


## Max Pooling: Target to Source(En)

In [0]:
comparison_t2s_dict_max = dict() 
comparison_t2s_dict_max['RU --> EN'] = max_accuracy_dict_ru2en['top-1'], max_accuracy_dict_ru2en['top-3'], max_accuracy_dict_ru2en['top-5']
comparison_t2s_dict_max['ES --> EN'] = max_accuracy_dict_es2en['top-1'], max_accuracy_dict_es2en['top-3'], max_accuracy_dict_es2en['top-5']
comparison_t2s_dict_max['FR --> EN'] = max_accuracy_dict_fr2en['top-1'], max_accuracy_dict_fr2en['top-3'], max_accuracy_dict_fr2en['top-5']
comparison_t2s_dict_max['IT --> EN'] = max_accuracy_dict_it2en['top-1'], max_accuracy_dict_it2en['top-3'], max_accuracy_dict_it2en['top-5']
comparison_t2s_dict_max['DE --> EN'] = max_accuracy_dict_de2en['top-1'], max_accuracy_dict_de2en['top-3'], max_accuracy_dict_de2en['top-5']
comparison_t2s_dict_max['FI --> EN'] = max_accuracy_dict_fi2en['top-1'], max_accuracy_dict_fi2en['top-3'], max_accuracy_dict_fi2en['top-5']
comparison_t2s_dict_max['JA --> EN'] = max_accuracy_dict_ja2en['top-1'], max_accuracy_dict_ja2en['top-3'], max_accuracy_dict_ja2en['top-5']
comparison_t2s_dict_max['HI --> EN'] = max_accuracy_dict_hi2en['top-1'], max_accuracy_dict_hi2en['top-3'], max_accuracy_dict_hi2en['top-5']
comparison_t2s_dict_max['TR --> EN'] = max_accuracy_dict_tr2en['top-1'], max_accuracy_dict_tr2en['top-3'], max_accuracy_dict_tr2en['top-5']
comparison_t2s_dict_max['VI --> EN'] = max_accuracy_dict_vi2en['top-1'], max_accuracy_dict_vi2en['top-3'], max_accuracy_dict_vi2en['top-5']
comparison_t2s_dict_max['LT --> EN'] = max_accuracy_dict_lt2en['top-1'], max_accuracy_dict_lt2en['top-3'], max_accuracy_dict_lt2en['top-5']
comparison_t2s_dict_max['HE --> EN'] = max_accuracy_dict_he2en['top-1'], max_accuracy_dict_he2en['top-3'], max_accuracy_dict_he2en['top-5']
comparison_t2s_dict_max['AR --> EN'] = max_accuracy_dict_ar2en['top-1'], max_accuracy_dict_ar2en['top-3'], max_accuracy_dict_ar2en['top-5']
comparison_t2s_dict_max['IS --> EN'] = max_accuracy_dict_is2en['top-1'], max_accuracy_dict_is2en['top-3'], max_accuracy_dict_is2en['top-5']
comparison_t2s_df_max = pd.DataFrame(comparison_t2s_dict_max, index=['TOP1', 'TOP2', 'TOP3'])
comparison_t2s_df_max = comparison_t2s_df_max.T
print("***********Max Pooling: Target to Source*************")
comparison_t2s_df_max

***********Max Pooling: Target to Source*************


Unnamed: 0,TOP1,TOP2,TOP3
RU --> EN,0.1288,0.2099,0.2332
ES --> EN,0.2132,0.28,0.297
FR --> EN,0.2185,0.3224,0.346
IT --> EN,0.0777,0.1843,0.1975
DE --> EN,0.1688,0.244,0.2609
FI --> EN,0.027908,0.055117,0.064819
JA --> EN,0.0591,0.1038,0.1203
HI --> EN,0.048086,0.091178,0.110043
TR --> EN,0.0365,0.0565,0.0662
VI --> EN,0.208427,0.295335,0.325809


## Mean Pooling: Source(En) to Target

In [0]:
comparison_s2t_dict_mean = dict() 
comparison_s2t_dict_mean['EN --> RU'] = mean_accuracy_dict_en2ru['top-1'], mean_accuracy_dict_en2ru['top-3'], mean_accuracy_dict_en2ru['top-5']
comparison_s2t_dict_mean['EN --> ES'] = mean_accuracy_dict_en2es['top-1'], mean_accuracy_dict_en2es['top-3'], mean_accuracy_dict_en2es['top-5']
comparison_s2t_dict_mean['EN --> FR'] = mean_accuracy_dict_en2fr['top-1'], mean_accuracy_dict_en2fr['top-3'], mean_accuracy_dict_en2fr['top-5']
comparison_s2t_dict_mean['EN --> IT'] = mean_accuracy_dict_en2it['top-1'], mean_accuracy_dict_en2it['top-3'], mean_accuracy_dict_en2it['top-5']
comparison_s2t_dict_mean['EN --> DE'] = mean_accuracy_dict_en2de['top-1'], mean_accuracy_dict_en2de['top-3'], mean_accuracy_dict_en2de['top-5']
comparison_s2t_dict_mean['EN --> FI'] = mean_accuracy_dict_en2fi['top-1'], mean_accuracy_dict_en2fi['top-3'], mean_accuracy_dict_en2fi['top-5']
comparison_s2t_dict_mean['EN --> JA'] = mean_accuracy_dict_en2ja['top-1'], mean_accuracy_dict_en2ja['top-3'], mean_accuracy_dict_en2ja['top-5']
comparison_s2t_dict_mean['EN --> HI'] = mean_accuracy_dict_en2hi['top-1'], mean_accuracy_dict_en2hi['top-3'], mean_accuracy_dict_en2hi['top-5']
comparison_s2t_dict_mean['EN --> TR'] = mean_accuracy_dict_en2tr['top-1'], mean_accuracy_dict_en2tr['top-3'], mean_accuracy_dict_en2tr['top-5']
comparison_s2t_dict_mean['EN --> VI'] = mean_accuracy_dict_en2vi['top-1'], mean_accuracy_dict_en2vi['top-3'], mean_accuracy_dict_en2vi['top-5']
comparison_s2t_dict_mean['EN --> LT'] = mean_accuracy_dict_en2lt['top-1'], mean_accuracy_dict_en2lt['top-3'], mean_accuracy_dict_en2lt['top-5']
comparison_s2t_dict_mean['EN --> HE'] = mean_accuracy_dict_en2he['top-1'], mean_accuracy_dict_en2he['top-3'], mean_accuracy_dict_en2he['top-5']
comparison_s2t_dict_mean['EN --> AR'] = mean_accuracy_dict_en2ar['top-1'], mean_accuracy_dict_en2ar['top-3'], mean_accuracy_dict_en2ar['top-5']
comparison_s2t_dict_mean['EN --> IS'] = mean_accuracy_dict_en2is['top-1'], mean_accuracy_dict_en2is['top-3'], mean_accuracy_dict_en2is['top-5']
comparison_s2t_df_mean = pd.DataFrame(comparison_s2t_dict_mean, index=['TOP1', 'TOP2', 'TOP3'])
comparison_s2t_df_mean = comparison_s2t_df_mean.T
print("***********Mean Pooling: Source(En) to Target*************")
comparison_s2t_df_mean

***********Mean Pooling: Source(En) to Target*************


Unnamed: 0,TOP1,TOP2,TOP3
EN --> RU,0.2346,0.4106,0.4449
EN --> ES,0.3637,0.4992,0.5217
EN --> FR,0.2985,0.4375,0.4663
EN --> IT,0.1789,0.4731,0.5073
EN --> DE,0.2892,0.4482,0.4766
EN --> FI,0.122437,0.254676,0.277083
EN --> JA,0.1195,0.2068,0.2331
EN --> HI,0.143888,0.262068,0.30479
EN --> TR,0.1339,0.1947,0.2191
EN --> VI,0.369451,0.510534,0.550038


## Mean Pooling: Target to Source(En) 

In [0]:
comparison_t2s_dict_mean = dict() 
comparison_t2s_dict_mean['RU --> EN'] = mean_accuracy_dict_ru2en['top-1'], mean_accuracy_dict_ru2en['top-3'], mean_accuracy_dict_ru2en['top-5']
comparison_t2s_dict_mean['ES --> EN'] = mean_accuracy_dict_es2en['top-1'], mean_accuracy_dict_es2en['top-3'], mean_accuracy_dict_es2en['top-5']
comparison_t2s_dict_mean['FR --> EN'] = mean_accuracy_dict_fr2en['top-1'], mean_accuracy_dict_fr2en['top-3'], mean_accuracy_dict_fr2en['top-5']
comparison_t2s_dict_mean['IT --> EN'] = mean_accuracy_dict_it2en['top-1'], mean_accuracy_dict_it2en['top-3'], mean_accuracy_dict_it2en['top-5']
comparison_t2s_dict_mean['DE --> EN'] = mean_accuracy_dict_de2en['top-1'], mean_accuracy_dict_de2en['top-3'], mean_accuracy_dict_de2en['top-5']
comparison_t2s_dict_mean['FI --> EN'] = mean_accuracy_dict_fi2en['top-1'], mean_accuracy_dict_fi2en['top-3'], mean_accuracy_dict_fi2en['top-5']
comparison_t2s_dict_mean['JA --> EN'] = mean_accuracy_dict_ja2en['top-1'], mean_accuracy_dict_ja2en['top-3'], mean_accuracy_dict_ja2en['top-5']
comparison_t2s_dict_mean['HI --> EN'] = mean_accuracy_dict_hi2en['top-1'], mean_accuracy_dict_hi2en['top-3'], mean_accuracy_dict_hi2en['top-5']
comparison_t2s_dict_mean['TR --> EN'] = mean_accuracy_dict_tr2en['top-1'], mean_accuracy_dict_tr2en['top-3'], mean_accuracy_dict_tr2en['top-5']
comparison_t2s_dict_mean['VI --> EN'] = mean_accuracy_dict_vi2en['top-1'], mean_accuracy_dict_vi2en['top-3'], mean_accuracy_dict_vi2en['top-5']
comparison_t2s_dict_mean['LT --> EN'] = mean_accuracy_dict_lt2en['top-1'], mean_accuracy_dict_lt2en['top-3'], mean_accuracy_dict_lt2en['top-5']
comparison_t2s_dict_mean['HE --> EN'] = mean_accuracy_dict_he2en['top-1'], mean_accuracy_dict_he2en['top-3'], mean_accuracy_dict_he2en['top-5']
comparison_t2s_dict_mean['AR --> EN'] = mean_accuracy_dict_ar2en['top-1'], mean_accuracy_dict_ar2en['top-3'], mean_accuracy_dict_ar2en['top-5']
comparison_t2s_dict_mean['IS --> EN'] = mean_accuracy_dict_is2en['top-1'], mean_accuracy_dict_is2en['top-3'], mean_accuracy_dict_is2en['top-5']
comparison_t2s_df_mean = pd.DataFrame(comparison_t2s_dict_mean, index=['TOP1', 'TOP2', 'TOP3'])
comparison_t2s_df_mean = comparison_t2s_df_mean.T
print("***********Mean Pooling: Target to Source*************")
comparison_t2s_df_mean

***********Mean Pooling: Target to Source*************


Unnamed: 0,TOP1,TOP2,TOP3
RU --> EN,0.15,0.2554,0.2818
ES --> EN,0.229,0.3171,0.3375
FR --> EN,0.2337,0.3561,0.379
IT --> EN,0.0845,0.2223,0.2376
DE --> EN,0.1878,0.2905,0.3114
FI --> EN,0.044813,0.082625,0.096829
JA --> EN,0.0814,0.1391,0.1585
HI --> EN,0.070649,0.125948,0.150731
TR --> EN,0.0524,0.0815,0.0944
VI --> EN,0.265613,0.362302,0.400677
