In [1]:
!pip install transformers



In [2]:
import torch
from transformers import BertTokenizer, BertModel

import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
% matplotlib inline

# Load pre-trained model tokenizers (vocabulary)
mbert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
msbert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [3]:
# Load the pre-trained encoder weights. Both models are asked to expose the
# hidden states of every layer on their outputs.
mbert = BertModel.from_pretrained(
    'bert-base-multilingual-uncased', output_hidden_states=True
)
msbert = BertModel.from_pretrained(
    'DeepPavlov/bert-base-multilingual-cased-sentence', output_hidden_states=True
)

In [4]:
# Put both encoders into evaluation mode. The second call is the last
# expression of the cell, so the msbert module summary is what gets displayed.
mbert.eval()
msbert.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

## Toy example

In [5]:
# The same sentence ("This is a car.") in Bulgarian and in German.
text1 = "Това е кола."
text2 = "Dies ist ein Auto."

# Encode each sentence on its own with the mBERT tokenizer, asking for
# PyTorch tensors.
mbert_tokenized_text1, mbert_tokenized_text2 = (
    mbert_tokenizer(sentence, padding=True, return_tensors="pt")
    for sentence in (text1, text2)
)

# Show the resulting encodings (input ids, token type ids, attention mask).
print(mbert_tokenized_text1)
print(mbert_tokenized_text2)

{'input_ids': tensor([[  101, 15036,   312, 63896,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}
{'input_ids': tensor([[  101, 13015, 10339, 10299, 14929,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}


In [6]:
# Run both encodings through mBERT without tracking gradients.
with torch.no_grad():
    outputs1 = mbert(**mbert_tokenized_text1)
    outputs2 = mbert(**mbert_tokenized_text2)

# The pooled output of each forward pass serves as the sentence embedding.
embedding_1 = outputs1.pooler_output
embedding_2 = outputs2.pooler_output

In [7]:
# Cosine similarity taken along dim 1 (the module's default), i.e. one score
# per row when both inputs are (batch, features) matrices.
cosine_sim = torch.nn.CosineSimilarity(dim=1)
# Similarity between the two toy-sentence embeddings computed with mBERT.
cosine_sim(x1=embedding_1, x2=embedding_2)

tensor([0.9897])

In [8]:
# The same sentence ("This is a car.") in Bulgarian and in German.
text1 = "Това е кола."
text2 = "Dies ist ein Auto."

# Encode each sentence on its own with the tokenizer paired with the
# sentence-level model, asking for PyTorch tensors.
msbert_tokenized_text1, msbert_tokenized_text2 = (
    msbert_tokenizer(sentence, padding=True, return_tensors="pt")
    for sentence in (text1, text2)
)

# Show the resulting encodings (input ids, token type ids, attention mask).
print(msbert_tokenized_text1)
print(msbert_tokenized_text2)

{'input_ids': tensor([[  101, 36231,   546, 79123,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}
{'input_ids': tensor([[  101, 18231, 10298, 10290, 23265,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}


In [9]:
# Run both encodings through the sentence-level model without tracking
# gradients. This rebinds outputs1/outputs2 from the earlier mBERT run.
with torch.no_grad():
    outputs1 = msbert(**msbert_tokenized_text1)
    outputs2 = msbert(**msbert_tokenized_text2)

# The pooled output of each forward pass serves as the sentence embedding.
embedding_1 = outputs1.pooler_output
embedding_2 = outputs2.pooler_output

In [10]:
# Similarity between the two toy-sentence embeddings computed with the
# sentence-level model.
cosine_sim(x1=embedding_1, x2=embedding_2)

tensor([0.5499])

## Testing on dataset


In [11]:
from google.colab import drive
import pandas as pd
# Mount Google Drive at /content/drive so files stored there can be read.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
cd './drive/My Drive/Colab Notebooks/CPSC 532S Project/'

/content/drive/My Drive/Colab Notebooks/CPSC 532S Project


In [13]:
# Read the tab-separated XNLI file into a DataFrame.
data = pd.read_csv("data/xnli.15way.orig.tsv", sep="\t")

In [14]:
# Pull out one column per language code used in the comparison below.
bg, en, de, sw, ar, vi, zh = (
    data[code] for code in ("bg", "en", "de", "sw", "ar", "vi", "zh")
)

In [15]:
# Sanity check: (number of rows, number of columns) of the loaded table.
data.shape

(10000, 15)

### Multilingual BERT Base (uncased)




In [17]:
# Number of parallel sentence rows to evaluate (a subset of the dataset).
n_samples = 1000
columns = (bg, en, de, sw, ar, vi, zh)

# Index pairs (n, m) with n < m in row-major order -- (0,1), (0,2), ..., (5,6)
# -- so every unordered language pair is scored exactly once, in the same
# order a nested "for n / for m in range(n+1, ...)" loop would visit them.
first, second = torch.triu_indices(len(columns), len(columns), offset=1)

similarities = []
for i in range(n_samples):
    if i % 100 == 0:
        print("{}/{}".format(i, n_samples))

    # The i-th sentence of every language column, in column order.
    sentences = [column[i] for column in columns]
    tokenized_sentences = mbert_tokenizer(sentences,
                                          padding=True,
                                          return_tensors="pt")
    with torch.no_grad():
        embeddings = mbert(**tokenized_sentences).pooler_output

    # Score all pairs in one call and keep plain floats, so the collected
    # rows convert cleanly into a 2-D tensor afterwards.
    similarities.append(cosine_sim(embeddings[first], embeddings[second]).tolist())

0/1000
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000


In [18]:
# Turn the collected per-sentence lists into a tensor: one row per evaluated
# sentence, one column per language pair. This rebinds `similarities` from a
# list to a tensor.
similarities = torch.Tensor(similarities)

In [19]:
# Average over dim 0 (the sentences), leaving one mean per language pair.
mean_sim = similarities.mean(dim=0)

In [20]:
# Display names of the compared languages, in the order their sentences are
# stacked when the embeddings are computed.
language_names = ["Bulgarian", "English", "German", "Swahili",
                  "Arabic", "Vietnamese", "Mandarin"]

# One label per unordered pair; the first name is left-justified to a width
# of 9 characters so the "--" separators line up.
languages = [
    "{:<9} -- {}".format(first_name, second_name)
    for position, first_name in enumerate(language_names)
    for second_name in language_names[position + 1:]
]

# Report the mean similarity of every language pair, one line per pair.
for idx, label in enumerate(languages):
  print("{:<30} mean similarity   {:.4f}".format(label, mean_sim[idx]))

Bulgarian -- English           mean similarity   0.9464
Bulgarian -- German            mean similarity   0.9647
Bulgarian -- Swahili           mean similarity   0.9413
Bulgarian -- Arabic            mean similarity   0.9661
Bulgarian -- Vietnamese        mean similarity   0.9686
Bulgarian -- Mandarin          mean similarity   0.8772
English   -- German            mean similarity   0.9633
English   -- Swahili           mean similarity   0.9046
English   -- Arabic            mean similarity   0.9418
English   -- Vietnamese        mean similarity   0.9395
English   -- Mandarin          mean similarity   0.8149
German    -- Swahili           mean similarity   0.9237
German    -- Arabic            mean similarity   0.9576
German    -- Vietnamese        mean similarity   0.9569
German    -- Mandarin          mean similarity   0.8344
Swahili   -- Arabic            mean similarity   0.9533
Swahili   -- Vietnamese        mean similarity   0.9539
Swahili   -- Mandarin          mean similarity  

### Multilingual BERT Sentence (cased)

In [21]:
# Number of parallel sentence rows to evaluate (a subset of the dataset).
n_samples = 1000
columns = (bg, en, de, sw, ar, vi, zh)

# Index pairs (n, m) with n < m in row-major order -- (0,1), (0,2), ..., (5,6)
# -- so every unordered language pair is scored exactly once, in the same
# order a nested "for n / for m in range(n+1, ...)" loop would visit them.
first, second = torch.triu_indices(len(columns), len(columns), offset=1)

similarities = []
for i in range(n_samples):
    if i % 100 == 0:
        print("{}/{}".format(i, n_samples))

    # The i-th sentence of every language column, in column order.
    sentences = [column[i] for column in columns]
    tokenized_sentences = msbert_tokenizer(sentences,
                                           padding=True,
                                           return_tensors="pt")
    with torch.no_grad():
        embeddings = msbert(**tokenized_sentences).pooler_output

    # Score all pairs in one call and keep plain floats, so the collected
    # rows convert cleanly into a 2-D tensor afterwards.
    similarities.append(cosine_sim(embeddings[first], embeddings[second]).tolist())

0/1000
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000


In [22]:
# Stack the collected per-sentence similarities into a tensor, then average
# over dim 0 (the sentences) to get one mean per language pair.
similarities = torch.Tensor(similarities)
mean_sim = similarities.mean(dim=0)

In [23]:
# Report the mean similarity of every language pair, one line per pair.
for idx, label in enumerate(languages):
    print("{:<30} mean similarity   {:.4f}".format(label, mean_sim[idx]))

Bulgarian -- English           mean similarity   0.9033
Bulgarian -- German            mean similarity   0.9018
Bulgarian -- Swahili           mean similarity   0.8193
Bulgarian -- Arabic            mean similarity   0.8869
Bulgarian -- Vietnamese        mean similarity   0.8924
Bulgarian -- Mandarin          mean similarity   0.8837
English   -- German            mean similarity   0.9166
English   -- Swahili           mean similarity   0.7939
English   -- Arabic            mean similarity   0.8779
English   -- Vietnamese        mean similarity   0.9015
English   -- Mandarin          mean similarity   0.9005
German    -- Swahili           mean similarity   0.8083
German    -- Arabic            mean similarity   0.8799
German    -- Vietnamese        mean similarity   0.8925
German    -- Mandarin          mean similarity   0.8865
Swahili   -- Arabic            mean similarity   0.8164
Swahili   -- Vietnamese        mean similarity   0.8045
Swahili   -- Mandarin          mean similarity  