### This playground is for Multi-Lingual Sentence Embeddings


### LaBSE Model
Resources: <br>
[1] https://huggingface.co/sentence-transformers/LaBSE <br>
[2] https://arxiv.org/abs/2007.01852

In [1]:
import torch
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModel

In [2]:
# Load the pretrained LaBSE tokenizer and encoder; both are resolved from the
# same checkpoint name, so it is spelled out only once.
LABSE_CHECKPOINT = "sentence-transformers/LaBSE"

labse_tokenizer = AutoTokenizer.from_pretrained(LABSE_CHECKPOINT)
labse_model = AutoModel.from_pretrained(LABSE_CHECKPOINT)

In [3]:
# Put the model into evaluation mode so the Dropout layers are inactive during
# inference. As the cell's last expression, the returned module is echoed below.
labse_model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(501153, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

In [4]:
# dim=0: this instance compares two 1-D vectors (one embedding each).
# Note: the same name is rebound further down with the default `dim`,
# which is meant for batched 2-D inputs.
cs_sim = torch.nn.CosineSimilarity(dim=0)

In [5]:
# Sanity check: an English/German translation pair should end up close together
# in the LaBSE embedding space.
sentences = ["Hello World", "Hallo Welt"]

encoded_input = labse_tokenizer(sentences, padding=True, truncation=True, max_length=64, return_tensors='pt')
with torch.no_grad():  # inference only, no autograd bookkeeping needed
    model_output = labse_model(**encoded_input)

# One pooled vector per input sentence, L2-normalised along the feature axis.
embeddings = model_output.pooler_output
embeddings = torch.nn.functional.normalize(embeddings, dim=1)

# Compare the two 1-D sentence vectors with an explicit dim=0 instead of going
# through the notebook-level `cs_sim`: that name is rebound later with a
# different `dim`, which would make this cell fail when re-run afterwards.
cosine_similarity = torch.nn.functional.cosine_similarity(embeddings[0], embeddings[1], dim=0)
print("Cosine Similarity: ", cosine_similarity.item())

Cosine Similarity:  0.965174674987793


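### Evaluation on XNLI Dataset

For the first 1000 parallel sentence sets of the XNLI 15-way corpus, we embed the Bulgarian, English, German, Swahili, Arabic, Vietnamese and Mandarin versions with LaBSE and report the mean cosine similarity for every language pair.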

In [6]:
# Read the tab-separated XNLI 15-way file; later cells select columns from it
# by language code (e.g. "en", "de"). The path is relative to the working directory.
# NOTE(review): pandas' default quote handling is in effect here. If sentences can
# contain unbalanced double quotes, rows could be merged while parsing. Confirm
# the resulting row count, or pass an explicit `quoting` option.
xnli = pd.read_csv("data/xnli.15way.orig.tsv", delimiter="\t")

In [7]:
# Pull out one sentence column per evaluated language, keyed by language code.
language_codes = ("bg", "en", "de", "sw", "ar", "vi", "zh")
bg, en, de, sw, ar, vi, zh = (xnli[code] for code in language_codes)

In [8]:
# Default dim (1): compares two 2-D tensors row by row.
# This overwrites the dim=0 instance bound to the same name earlier in the notebook.
cs_sim = torch.nn.CosineSimilarity()

In [9]:
# For each of the first N_SAMPLES parallel sentence sets, embed the seven
# translations and record the cosine similarity of every unordered language pair.
N_SAMPLES = 1000
# The order of this tuple fixes the order of the language pairs produced below.
language_columns = (bg, en, de, sw, ar, vi, zh)

similarities = []

for i in range(N_SAMPLES):
    sentences = [column[i] for column in language_columns]
    # Truncation guards against inputs longer than the model's 512 positions.
    tokenized_sentences = labse_tokenizer(sentences,
                                          padding=True,
                                          truncation=True,
                                          max_length=512,
                                          return_tensors="pt")
    with torch.no_grad():  # inference only
        embeddings = labse_model(**tokenized_sentences).pooler_output

    # Index pairs (n, m) with n < m, ordered by n and then m:
    # (0, 1), (0, 2), ..., (0, 6), (1, 2), ..., (5, 6).
    num_languages = embeddings.shape[0]
    first, second = torch.triu_indices(num_languages, num_languages, offset=1)

    # Row-wise cosine similarity for all pairs at once. The explicit dim=1 keeps
    # this cell independent of whichever `cs_sim` instance is currently bound.
    pair_similarities = torch.nn.functional.cosine_similarity(
        embeddings[first], embeddings[second], dim=1)

    # Store plain Python floats so the nested list converts cleanly to a tensor.
    similarities.append(pair_similarities.tolist())

In [10]:
# Average each language pair's similarity over all evaluated sentence sets.
similarities = torch.Tensor(similarities)  # rows: sentence sets, columns: language pairs
mean_sim = torch.mean(similarities, dim=0)

# Build the pair labels from a single ordered list of names, so they cannot
# drift from the pair order used when the similarities were collected
# (every language paired with each language that comes after it).
# The first name is left-padded to 9 characters to keep the "--" aligned.
language_names = ["Bulgarian", "English", "German", "Swahili",
                  "Arabic", "Vietnamese", "Mandarin"]
languages = ["{:<9} -- {}".format(first, second)
             for idx, first in enumerate(language_names)
             for second in language_names[idx + 1:]]

# Fail loudly if the labels and the similarity columns ever disagree, rather
# than printing misattributed or truncated results.
assert len(languages) == mean_sim.shape[0], "pair labels do not match the similarity columns"

for label, value in zip(languages, mean_sim):
    print("{:<30} mean similarity   {:.4f}".format(label, value))

Bulgarian -- English           mean similarity   0.8657
Bulgarian -- German            mean similarity   0.8599
Bulgarian -- Swahili           mean similarity   0.7982
Bulgarian -- Arabic            mean similarity   0.8112
Bulgarian -- Vietnamese        mean similarity   0.8282
Bulgarian -- Mandarin          mean similarity   0.8029
English   -- German            mean similarity   0.8582
English   -- Swahili           mean similarity   0.8135
English   -- Arabic            mean similarity   0.8413
English   -- Vietnamese        mean similarity   0.8268
English   -- Mandarin          mean similarity   0.8224
German    -- Swahili           mean similarity   0.8055
German    -- Arabic            mean similarity   0.8007
German    -- Vietnamese        mean similarity   0.8448
German    -- Mandarin          mean similarity   0.7858
Swahili   -- Arabic            mean similarity   0.7629
Swahili   -- Vietnamese        mean similarity   0.7971
Swahili   -- Mandarin          mean similarity  