In [4]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Example inputs to embed
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]

# Encode the whole list with a single model.encode() call
embeddings = model.encode(sentences)

# Show each sentence followed by its embedding and a blank separator line
for position, sentence in enumerate(sentences):
    embedding = embeddings[position]
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

# Encode the same greeting in three languages with a second model and print the raw result
embedder = SentenceTransformer('multi-qa-mpnet-base-dot-v1')
greetings = ['Hello World', 'Hallo Welt', 'Hola mundo']
embeddings = embedder.encode(greetings)
print(embeddings)

# Two equally long lists; the sentences at the same position are compared with each other
sentences1 = [
    'The cat sits outside',
    'A man is playing guitar',
    'The new movie is awesome',
]

sentences2 = [
    'The dog plays in the garden',
    'A woman watches TV',
    'The new movie is so great',
]

# Embed both lists as tensors
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# Cosine similarities between the two sets of embeddings
cosine_scores = util.cos_sim(embeddings1, embeddings2)

# Print each aligned pair with the score found on the diagonal of the score matrix
for i, first_sentence in enumerate(sentences1):
    second_sentence = sentences2[i]
    pair_score = cosine_scores[i][i]
    print("{} \t\t {} \t\t Score: {:.4f}".format(first_sentence, second_sentence, pair_score))

Sentence: This framework generates embeddings for each input sentence
Embedding: [-1.76214471e-01  1.20601423e-01 -2.93623894e-01 -2.29858086e-01
 -8.22924674e-02  2.37709492e-01  3.39985222e-01 -7.80964196e-01
  1.18127503e-01  1.63373962e-01 -1.37715206e-01  2.40282550e-01
  4.25125659e-01  1.72418088e-01  1.05279632e-01  5.18164039e-01
  6.22218512e-02  3.99285495e-01 -1.81652248e-01 -5.85578799e-01
  4.49720435e-02 -1.72750235e-01 -2.68443555e-01 -1.47386104e-01
 -1.89218029e-01  1.92150757e-01 -3.83842319e-01 -3.96007061e-01
  4.30648863e-01 -3.15319747e-01  3.65949780e-01  6.05159029e-02
  3.57325673e-01  1.59736469e-01 -3.00983876e-01  2.63250172e-01
 -3.94311070e-01  1.84855551e-01 -3.99549127e-01 -2.67889470e-01
 -5.45117080e-01 -3.13406326e-02 -4.30644304e-01  1.33278280e-01
 -1.74793869e-01 -4.35465455e-01 -4.77379113e-01  7.12556392e-02
 -7.37002343e-02  5.69136918e-01 -2.82579750e-01  5.24974838e-02
 -8.20007920e-01  1.98296934e-01  1.69511974e-01  2.71780193e-01
  2.64611

[[ 0.17623387 -0.23755082 -0.25186118 ...  0.02418864 -0.05202759
  -0.13542381]
 [-0.06796029 -0.45643526 -0.2081763  ... -0.3206765   0.10556407
   0.02621567]
 [-0.14879279 -0.171958   -0.4128398  ... -0.02332733  0.01705487
  -0.07746394]]
The cat sits outside 		 The dog plays in the garden 		 Score: 0.2853
A man is playing guitar 		 A woman watches TV 		 Score: -0.0810
The new movie is awesome 		 The new movie is so great 		 Score: 0.9298


In [5]:
"""
This script contains an example of how to extend an existing sentence embedding model to new languages.
Given a (monolingual) teacher model that you would like to extend to new languages (specified in the teacher_model_name
variable), we train a multilingual student model (variable student_model_name) to imitate the teacher model
on multiple languages.
For training, you need parallel sentence data (machine translation training data). You need tab-separated files (.tsv)
where the first column contains a sentence in a language understood by the teacher model, e.g. English,
and the further columns contain the corresponding translations for the languages you want to extend to.
This script automatically downloads the TED2020 corpus: https://github.com/UKPLab/sentence-transformers/blob/master/docs/datasets/TED2020.md
This corpus contains transcripts from
TED and TEDx talks, translated to 100+ languages. For other parallel data, see the get_parallel_data_[].py scripts.
Further information can be found in our paper:
Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation
https://arxiv.org/abs/2004.09813
"""

from sentence_transformers import SentenceTransformer, LoggingHandler, models, evaluation, losses
from torch.utils.data import DataLoader
from sentence_transformers.datasets import ParallelSentencesDataset
from datetime import datetime

import os
import logging
import sentence_transformers.util
import csv
import gzip
from tqdm.autonotebook import tqdm
import numpy as np
import zipfile
import io

# Send INFO-level (and above) records through LoggingHandler, timestamped to the second
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)
logger = logging.getLogger(__name__)


# Teacher: the monolingual model we want to extend to further languages
teacher_model_name = 'paraphrase-distilroberta-base-v2'
# Student: the multilingual base model that is trained to imitate the teacher
student_model_name = 'xlm-roberta-base'

max_seq_length = 128                  # Max. input length of the student model (number of word pieces)
train_batch_size = 64                 # Batch size for training
inference_batch_size = 64             # Batch size at inference
max_sentences_per_language = 500000   # Maximum number of parallel sentences for training
train_max_sentence_length = 250       # Maximum length (characters) of parallel training sentences

num_epochs = 5                        # Train for this many epochs
num_warmup_steps = 10000              # Warmup steps

num_evaluation_steps = 1000           # Evaluate performance after this many steps
dev_sentences = 1000                  # Number of parallel sentences to be used for development


# Language codes (see the header of the train file for the available codes)
source_languages = {'en'}             # Our teacher model accepts English (en) sentences
target_languages = {'sv'}             # The new languages the model is extended to


# Output folder name: sorted source codes, then sorted target codes, then the start timestamp
language_tag = "-".join(sorted(source_languages) + sorted(target_languages))
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_path = "output/make-multilingual-{}-{}".format(language_tag, run_timestamp)


def download_corpora(filepaths):
    """Download every corpus file that does not exist locally yet.

    A missing file is fetched from ``https://sbert.net/datasets/<basename>`` and
    stored at the given local path; files that already exist are left untouched.

    Args:
        filepaths: A single path, or any iterable of paths (list, tuple, set,
            generator, ...). Previously only a ``list`` was treated as a
            collection, so e.g. a tuple of paths failed inside ``os.path.exists``.

    Returns:
        None.
    """
    # A lone path (or any non-iterable value) is wrapped so the loop below can treat
    # every input uniformly. str/bytes are iterable but must be handled as one path.
    is_single_path = isinstance(filepaths, (str, bytes, os.PathLike)) or not hasattr(filepaths, "__iter__")
    if is_single_path:
        filepaths = [filepaths]

    for filepath in filepaths:
        if not os.path.exists(filepath):
            print(filepath, "does not exists. Try to download from server")
            # The server is addressed by file name only; local directories are not part of the URL
            filename = os.path.basename(filepath)
            url = "https://sbert.net/datasets/" + filename
            sentence_transformers.util.http_get(url, filepath)


# Locations of the training and evaluation corpora
train_corpus = "datasets/ted2020.tsv.gz"         # Transcripts of TED talks, crawled 2020
sts_corpus = "datasets/STS2017-extended.zip"     # Extended STS2017 dataset for more languages
parallel_sentences_folder = "parallel-sentences/"

# Make sure both corpora are available locally; missing ones are downloaded
required_corpora = [train_corpus, sts_corpus]
download_corpora(required_corpora)


# Create parallel files for the selected language combinations.
# For every (source, target) pair the TED2020 corpus is split into a dev file (the first
# `dev_sentences` usable sentence pairs) and a train file (all remaining pairs).
os.makedirs(parallel_sentences_folder, exist_ok=True)
train_files = []
dev_files = []
files_to_create = []
for source_lang in source_languages:
    for target_lang in target_languages:
        output_filename_train = os.path.join(parallel_sentences_folder, "TED2020-{}-{}-train.tsv.gz".format(source_lang, target_lang))
        output_filename_dev = os.path.join(parallel_sentences_folder, "TED2020-{}-{}-dev.tsv.gz".format(source_lang, target_lang))
        train_files.append(output_filename_train)
        dev_files.append(output_filename_dev)
        # If either file is missing, both are (re)created; opening with 'wt' truncates an existing one
        if not os.path.exists(output_filename_train) or not os.path.exists(output_filename_dev):
            files_to_create.append({'src_lang': source_lang, 'trg_lang': target_lang,
                                    'train_path': output_filename_train,
                                    'dev_path': output_filename_dev,
                                    'fTrain': gzip.open(output_filename_train, 'wt', encoding='utf8'),
                                    'fDev': gzip.open(output_filename_dev, 'wt', encoding='utf8'),
                                    'devCount': 0
                                    })

if len(files_to_create) > 0:
    print("Parallel sentences files {} do not exist. Create these files now".format(", ".join(map(lambda x: x['src_lang']+"-"+x['trg_lang'], files_to_create))))
    split_completed = False
    try:
        with gzip.open(train_corpus, 'rt', encoding='utf8') as fIn:
            reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
            for line in tqdm(reader, desc="Sentences"):
                for outfile in files_to_create:
                    # DictReader fills columns missing from a short row with None;
                    # treat those like empty cells instead of crashing on None.strip()
                    src_text = (line[outfile['src_lang']] or "").strip()
                    trg_text = (line[outfile['trg_lang']] or "").strip()

                    if src_text != "" and trg_text != "":
                        # The first `dev_sentences` non-empty pairs go to the dev file, the rest to train
                        if outfile['devCount'] < dev_sentences:
                            outfile['devCount'] += 1
                            fOut = outfile['fDev']
                        else:
                            fOut = outfile['fTrain']

                        fOut.write("{}\t{}\n".format(src_text, trg_text))
        split_completed = True
    finally:
        # Always release the output handles, even if the split failed or was interrupted
        for outfile in files_to_create:
            outfile['fTrain'].close()
            outfile['fDev'].close()

        # A partially written file would pass the os.path.exists() check above on the
        # next run and be used silently, so remove incomplete outputs.
        if not split_completed:
            for outfile in files_to_create:
                for partial_path in (outfile['train_path'], outfile['dev_path']):
                    if os.path.exists(partial_path):
                        os.remove(partial_path)



######## Start the extension of the teacher model to multiple languages ########
logger.info("Load teacher model")
teacher_model = SentenceTransformer(teacher_model_name)


logger.info("Create student model from scratch")
# Transformer backbone of the student, limited to max_seq_length word pieces
word_embedding_model = models.Transformer(student_model_name, max_seq_length=max_seq_length)
# Apply mean pooling over the word embeddings to get one fixed-size sentence vector
word_embedding_dimension = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(word_embedding_dimension)
student_modules = [word_embedding_model, pooling_model]
student_model = SentenceTransformer(modules=student_modules)


###### Read Parallel Sentences Dataset ######
train_data = ParallelSentencesDataset(
    student_model=student_model,
    teacher_model=teacher_model,
    batch_size=inference_batch_size,
    use_embedding_cache=True,
)
# Load every parallel training file, capped in sentence count and sentence length
for train_file in train_files:
    train_data.load_data(
        train_file,
        max_sentences=max_sentences_per_language,
        max_sentence_length=train_max_sentence_length,
    )

# Shuffled training batches and an MSE loss attached to the student model
train_dataloader = DataLoader(train_data, batch_size=train_batch_size, shuffle=True)
train_loss = losses.MSELoss(model=student_model)



#### Evaluate cross-lingual performance on different tasks #####
evaluators = []         # Evaluator instances that are called periodically during training

for dev_file in dev_files:
    logger.info("Create evaluator for " + dev_file)
    dev_name = os.path.basename(dev_file)

    # Read the tab-separated (source, target) pairs; a pair with an empty side is skipped
    src_sentences = []
    trg_sentences = []
    with gzip.open(dev_file, 'rt', encoding='utf8') as fIn:
        for line in fIn:
            splits = line.strip().split('\t')
            if splits[0] == "" or splits[1] == "":
                continue
            src_sentences.append(splits[0])
            trg_sentences.append(splits[1])

    # Mean Squared Error (MSE) measures the (euclidean) distance between teacher and student embeddings
    dev_mse = evaluation.MSEEvaluator(
        src_sentences,
        trg_sentences,
        name=dev_name,
        teacher_model=teacher_model,
        batch_size=inference_batch_size,
    )
    evaluators.append(dev_mse)

    # TranslationEvaluator computes the embeddings for all parallel sentences. It then checks
    # whether the embedding of source[i] is the closest to target[i] out of all target sentences
    dev_trans_acc = evaluation.TranslationEvaluator(
        src_sentences,
        trg_sentences,
        name=dev_name,
        batch_size=inference_batch_size,
    )
    evaluators.append(dev_trans_acc)


##### Read cross-lingual Semantic Textual Similarity (STS) data ####
all_languages = list(set(list(source_languages)+list(target_languages)))
sts_data = {}   # file name -> {'sentences1': [...], 'sentences2': [...], 'scores': [...]}

# Open the ZIP file of STS2017-extended.zip and check for which language combinations we have STS data.
# The archive handle is named `sts_zip` (not `zip`) so the builtin zip() is not shadowed
# in the notebook namespace, where other cells rely on it.
with zipfile.ZipFile(sts_corpus) as sts_zip:
    filelist = sts_zip.namelist()
    sts_files = []   # NOTE(review): not used anywhere in the visible code; kept in case another cell reads it

    # Visit every unordered language pair once, including a language paired with itself
    for i in range(len(all_languages)):
        for j in range(i, len(all_languages)):
            lang1 = all_languages[i]
            lang2 = all_languages[j]
            filepath = 'STS2017-extended/STS.{}-{}.txt'.format(lang1, lang2)
            if filepath not in filelist:
                # The archive may store the pair in the opposite order, so try that name as well
                lang1, lang2 = lang2, lang1
                filepath = 'STS2017-extended/STS.{}-{}.txt'.format(lang1, lang2)

            if filepath in filelist:
                filename = os.path.basename(filepath)
                sts_data[filename] = {'sentences1': [], 'sentences2': [], 'scores': []}

                # Context managers close the archive member once it has been read
                with sts_zip.open(filepath) as fIn, io.TextIOWrapper(fIn, encoding='utf8') as text_in:
                    for line in text_in:
                        # Each line holds: sentence1 <TAB> sentence2 <TAB> score
                        sent1, sent2, score = line.strip().split("\t")
                        score = float(score)
                        sts_data[filename]['sentences1'].append(sent1)
                        sts_data[filename]['sentences2'].append(sent2)
                        sts_data[filename]['scores'].append(score)

# One similarity evaluator per STS file that was found
for filename, data in sts_data.items():
    test_evaluator = evaluation.EmbeddingSimilarityEvaluator(data['sentences1'], data['sentences2'], data['scores'], batch_size=inference_batch_size, name=filename, show_progress_bar=False)
    evaluators.append(test_evaluator)


# Train the model
# All evaluators are wrapped in one SequentialEvaluator whose main score is the mean of the individual scores
combined_evaluator = evaluation.SequentialEvaluator(
    evaluators,
    main_score_function=lambda scores: np.mean(scores),
)

# NOTE(review): 'correct_bias' is an option of transformers' AdamW implementation - confirm that
# the optimizer class used by the installed sentence-transformers version accepts it.
optimizer_settings = {'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False}

student_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=combined_evaluator,
    epochs=num_epochs,
    warmup_steps=num_warmup_steps,
    evaluation_steps=num_evaluation_steps,
    output_path=output_path,
    save_best_model=True,
    optimizer_params=optimizer_settings,
)

datasets/ted2020.tsv.gz does not exists. Try to download from server


100%|████████████████████████████████████████| 581M/581M [00:16<00:00, 34.7MB/s]


datasets/STS2017-extended.zip does not exists. Try to download from server


100%|██████████████████████████████████████| 96.3k/96.3k [00:00<00:00, 1.11MB/s]


Parallel sentences files en-sv do not exist. Create these files now


Sentences: 427436it [00:26, 16377.91it/s]


2022-02-08 19:05:03 - Load teacher model
2022-02-08 19:05:03 - Load pretrained SentenceTransformer: paraphrase-distilroberta-base-v2


Downloading: 100%|██████████████████████████████| 736/736 [00:00<00:00, 178kB/s]
Downloading: 100%|█████████████████████████| 3.74k/3.74k [00:00<00:00, 1.02MB/s]
Downloading: 100%|██████████████████████████████| 686/686 [00:00<00:00, 173kB/s]
Downloading: 100%|█████████████████████████████| 122/122 [00:00<00:00, 31.6kB/s]
Downloading: 100%|████████████████████████████| 456k/456k [00:00<00:00, 938kB/s]
Downloading: 100%|█████████████████████████████| 229/229 [00:00<00:00, 58.8kB/s]
Downloading: 100%|███████████████████████████| 329M/329M [00:06<00:00, 51.2MB/s]
Downloading: 100%|███████████████████████████| 53.0/53.0 [00:00<00:00, 14.0kB/s]
Downloading: 100%|█████████████████████████████| 239/239 [00:00<00:00, 62.9kB/s]
Downloading: 100%|█████████████████████████| 1.36M/1.36M [00:00<00:00, 2.46MB/s]
Downloading: 100%|██████████████████████████| 1.12k/1.12k [00:00<00:00, 285kB/s]
Downloading: 100%|███████████████████████████| 798k/798k [00:00<00:00, 1.56MB/s]
Downloading: 100%|██████████

2022-02-08 19:05:24 - Use pytorch device: cpu
2022-02-08 19:05:24 - Create student model from scratch


Downloading: 100%|██████████████████████████████| 512/512 [00:00<00:00, 130kB/s]
Downloading: 100%|█████████████████████████| 1.04G/1.04G [01:10<00:00, 15.8MB/s]
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading: 100%|█████████████████████████| 4.83M/4.83M [00:00<00:00, 6.03MB/s]
Downloading: 100%|███████████████████████

2022-02-08 19:06:47 - Use pytorch device: cpu
2022-02-08 19:06:47 - Load parallel-sentences/TED2020-en-sv-train.tsv.gz
2022-02-08 19:06:48 - Create evaluator for parallel-sentences/TED2020-en-sv-dev.tsv.gz


Epoch:   0%|                                              | 0/5 [00:00<?, ?it/s]
  labels = torch.tensor(labels).to(self._target_device)

Iteration:   0%|                            | 1/3524 [01:05<64:06:26, 65.51s/it][A
Iteration:   0%|                            | 2/3524 [01:58<57:50:15, 59.12s/it][A
Iteration:   0%|                            | 3/3524 [02:48<54:47:38, 56.02s/it][A
Iteration:   0%|                            | 4/3524 [03:46<55:11:26, 56.44s/it][A
Iteration:   0%|                            | 5/3524 [04:31<52:41:24, 53.90s/it][A
Iteration:   0%|                            | 6/3524 [05:20<51:51:36, 53.07s/it][A
Iteration:   0%|                            | 7/3524 [06:06<50:40:50, 51.88s/it][A
Iteration:   0%|                            | 8/3524 [06:53<49:58:54, 51.18s/it][A
Iteration:   0%|                            | 9/3524 [07:33<48:29:36, 49.67s/it][A
Iteration:   0%|                           | 10/3524 [08:17<47:47:44, 48.97s/it][A
Iteration:   0%|      

KeyboardInterrupt: 