In [1]:
import datasets

# Load the training split of the 'ted_multi' dataset.
dataset = datasets.load_dataset(
    'ted_multi',
    split='train',
)
# Bare last expression so the notebook renders the dataset summary.
dataset

Dataset({
    features: ['translations', 'talk_name'],
    num_rows: 258098
})

In [2]:
# Show the first record to inspect the 'translations' / 'talk_name' structure.
dataset[0]

{'translations': {'language': ['ar',
   'bg',
   'de',
   'el',
   'en',
   'es',
   'eu',
   'fa',
   'fr',
   'fr-ca',
   'he',
   'hr',
   'hu',
   'it',
   'ja',
   'ko',
   'nb',
   'nl',
   'pl',
   'pt',
   'pt-br',
   'ro',
   'ru',
   'sq',
   'tr',
   'vi',
   'zh-cn',
   'zh-tw'],
  'translation': ['من ضمن جميع المثبطات المقلقة التي نعاني منها اليوم نفكر في المقام الاول في الامور المالية والاقتصادية واكثر ما يهمني بشكل اكثر هو عجز الحوار السياسي — قدرتنا على فهم الصراعات الحديثة على ماهي عليه , بالذهاب الى اصلها الفعلي وعلى فهم اللاعبين الرئيسيين وعلى التعامل معهم',
   'Наред с всички обезпокоителни дефицити , с които се сблъскваме днес - ние основно мислим за финансовите и икономическите - този , който ме безпокои най-вече е липсата на политически диалог - нашата способност да подходим към съвременните конфликти както те присъстват , да стигнем до източника на това , от което те произтичат и да разберем ключовите участници и да се разберем с тях .',
   'Unter den schwierige

In [3]:
from sentence_transformers import InputExample
from tqdm import tqdm

# Target languages; each is paired with English to form one "en-<lang>" training set.
langs = ['it', 'es', 'ar', 'fr', 'de']
# Maps "en-<lang>" -> list of "english<TAB>translation" lines.
train_examples = {f'en-{lang}': [] for lang in langs}
wanted_langs = set(langs)

for row in tqdm(dataset):
    languages = row['translations']['language']
    translations = row['translations']['translation']
    # A row without an English entry has no source sentence to pair with;
    # skip it rather than letting .index('en') raise ValueError.
    if 'en' not in languages:
        continue
    source = translations[languages.index('en')].strip()
    # Walk codes and sentences together: the two lists are parallel, so no
    # per-language .index() lookup is needed.
    for lang, target in zip(languages, translations):
        if lang in wanted_langs:
            train_examples[f'en-{lang}'].append(source + '\t' + target.strip())

100%|██████████| 258098/258098 [00:16<00:00, 16094.87it/s]


In [4]:
# Report how many sentence pairs were collected for each language pair.
# The loop variable is deliberately still called `lang_pair`: it stays bound
# at notebook scope once the loop finishes.
for lang_pair, pairs in train_examples.items():
    print(lang_pair, '->', len(pairs))

en-it -> 204503
en-es -> 196026
en-ar -> 214111
en-fr -> 192304
en-de -> 167888


In [5]:
# Inspect one English/German line. The key is written out explicitly instead of
# reusing the `lang_pair` loop variable left over from an earlier cell, which
# only pointed at 'en-de' because that was the last key iterated.
train_examples['en-de'][1]

'We who are diplomats , we are trained to deal with conflicts between states and issues between states .\tWir Diplomaten sind dazu ausgebildet worden , mit Streitigkeiten zwischen Staaten umzugehen .'

In [24]:
import gzip
import os

# Upper bound on how many sentence pairs are written to each file.
MAX_PAIRS_PER_FILE = 2000

# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create gap of exists() followed by mkdir().
os.makedirs('./data', exist_ok=True)

# One gzip-compressed TSV per language pair, one "source<TAB>target" line per pair.
for lang_pair, pairs in train_examples.items():
    with gzip.open(f'./data/ted-train-{lang_pair}.tsv.gz', 'wt', encoding='utf-8') as dest:
        # Note: this is the total number of pairs collected, not the number written.
        print(len(pairs))
        dest.write('\n'.join(pairs[:MAX_PAIRS_PER_FILE]))

204503
196026
214111
192304
167888


In [25]:
from transformers import AutoTokenizer

# Tokenizer for the 'bert-base-uncased' checkpoint, used below as a comparison
# point on non-English text.
bert_checkpoint = 'bert-base-uncased'
bert_tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)

In [26]:
# Probe sentences in several different scripts, reused by the tokenizer comparisons.
sentences = [
    'we will include several languages',
    '一些中文单词',
    'το ελληνικό αλφάβητο είναι πολύ ωραίο',
    'ჩვენ გვაქვს ქართული',
]

# Print one token list per probe sentence.
for sentence in sentences:
    bert_tokens = bert_tokenizer.tokenize(sentence)
    print(bert_tokens)

['we', 'will', 'include', 'several', 'languages']
['一', '[UNK]', '中', '文', '[UNK]', '[UNK]']
['τ', '##ο', 'ε', '##λ', '##λ', '##η', '##ν', '##ι', '##κ', '##ο', 'α', '##λ', '##φ', '##α', '##β', '##η', '##τ', '##ο', 'ε', '##ι', '##ν', '##α', '##ι', 'π', '##ο', '##λ', '##υ', 'ω', '##ρ', '##α', '##ι', '##ο']
['[UNK]', '[UNK]', '[UNK]']


In [27]:
# Tokenizer for the 'xlm-roberta-base' checkpoint.
xlmr_checkpoint = 'xlm-roberta-base'
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_checkpoint)

In [28]:
# Same probe sentences, this time through the XLM-R tokenizer.
for sentence in sentences:
    xlmr_tokens = xlmr_tokenizer.tokenize(sentence)
    print(xlmr_tokens)

['▁we', '▁will', '▁include', '▁several', '▁language', 's']
['▁', '一些', '中文', '单', '词']
['▁το', '▁ελληνικό', '▁αλ', 'φά', 'βη', 'το', '▁είναι', '▁πολύ', '▁ωραίο']
['▁ჩვენ', '▁გვაქვს', '▁ქართული']


In [29]:
from sentence_transformers import models, SentenceTransformer

# Student model: the 'xlm-roberta-base' transformer followed by mean pooling
# over its token embeddings.
xlmr = models.Transformer('xlm-roberta-base')
embedding_dim = xlmr.get_word_embedding_dimension()
pooling = models.Pooling(embedding_dim, pooling_mode_mean_tokens=True)

student = SentenceTransformer(modules=[xlmr, pooling])
# Bare last expression so the notebook renders the module stack.
student

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [30]:
# Candidate teacher model.
# NOTE(review): the following cell rebinds `teacher` to a different checkpoint,
# so this model is not the one handed to ParallelSentencesDataset later on.
teacher_checkpoint = 'all-mpnet-base-v2'
teacher = SentenceTransformer(teacher_checkpoint)
teacher

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [31]:
# Teacher model whose embeddings the student is trained to reproduce.
teacher_checkpoint = 'paraphrase-distilroberta-base-v2'
teacher = SentenceTransformer(teacher_checkpoint)
teacher

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [32]:
from sentence_transformers import ParallelSentencesDataset

# Dataset wrapper that is given both models; the sentence files are loaded
# into it in a later cell.
data = ParallelSentencesDataset(
    teacher_model=teacher,
    student_model=student,
    batch_size=32,
    use_embedding_cache=True,
)

In [33]:
# Limits passed to load_data for every training file.
max_sentences_per_language = 500000
max_sentence_length = 250

# os.listdir returns entries in arbitrary order, so sort the names to make the
# load order (and therefore the dataset contents order) reproducible.
train_files = sorted(f for f in os.listdir('./data') if 'train' in f)
for train_file in train_files:
    print(train_file)
    # NOTE(review): re-running this cell presumably loads the same files into
    # `data` a second time - confirm before re-executing without recreating `data`.
    data.load_data(
        os.path.join('./data', train_file),
        max_sentences=max_sentences_per_language,
        max_sentence_length=max_sentence_length,
    )

# Total number of items now held by the dataset.
len(data)

ted-train-en-ar.tsv.gz
ted-train-en-de.tsv.gz
ted-train-en-es.tsv.gz
ted-train-en-fr.tsv.gz
ted-train-en-it.tsv.gz


18608

In [34]:
from torch.utils.data import DataLoader
# Shuffled mini-batches of 32 over the parallel-sentence dataset.
loader = DataLoader(
    data,
    shuffle=True,
    batch_size=32,
)
# Number of batches per epoch.
len(loader)

582

In [35]:
from sentence_transformers import losses

# Mean-squared-error training objective attached to the student model.
loss = losses.MSELoss(
    model=student,
)

In [37]:
from sentence_transformers import evaluation
import numpy as np

epochs = 1
# Warm up the learning rate over the first 10% of all training steps.
total_steps = len(loader) * epochs
warmup_steps = int(total_steps * 0.1)

# NOTE(review): 'xmlr' in the output path looks like a typo for 'xlmr', but the
# same path is loaded again further down, so the spelling is kept as-is.
student.fit(
    train_objectives=[(loader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    optimizer_params={'lr': 2e-5, 'eps': 1e-6},
    output_path='./xmlr-ted',
    save_best_model=True,
    show_progress_bar=True,
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/582 [00:00<?, ?it/s]

In [39]:
import datasets

# English test split of 'stsb_multi_mt', used for evaluation.
en = datasets.load_dataset(
    'stsb_multi_mt',
    'en',
    split='test',
)
en

Dataset({
    features: ['sentence1', 'sentence2', 'similarity_score'],
    num_rows: 1379
})

In [41]:
# Italian test split of 'stsb_multi_mt', used for evaluation.
it = datasets.load_dataset(
    'stsb_multi_mt',
    'it',
    split='test',
)
it

Dataset({
    features: ['sentence1', 'sentence2', 'similarity_score'],
    num_rows: 1379
})

In [43]:
def _scale_similarity(example):
    """Return the row's 'similarity_score' divided by 5.0.

    Presumably this maps a 0-5 similarity scale onto 0-1 - confirm against
    the dataset card.
    """
    return {'similarity_score': example['similarity_score'] / 5.0}


# Apply the same rescaling to both evaluation splits.
en = en.map(_scale_similarity)
it = it.map(_scale_similarity)

Map:   0%|          | 0/1379 [00:00<?, ? examples/s]

Map:   0%|          | 0/1379 [00:00<?, ? examples/s]

In [44]:
from sentence_transformers import InputExample

# The cross-lingual pairs below combine row i of `en` with row i of `it`, so the
# two splits must at least have the same length.
# Assumes matching rows are translations of each other - TODO confirm.
assert len(en) == len(it), 'en and it test splits must have the same number of rows'

en_samples = []      # English sentence1 vs. English sentence2
it_samples = []      # Italian sentence1 vs. Italian sentence2
en_it_samples = []   # English sentence1 vs. Italian sentence2

# Iterate both datasets in lockstep: each row is fetched once per iteration
# instead of being re-indexed for every field access.
for en_row, it_row in zip(en, it):
    en_samples.append(InputExample(
        texts=[en_row['sentence1'], en_row['sentence2']],
        label=en_row['similarity_score']
    ))
    it_samples.append(InputExample(
        texts=[it_row['sentence1'], it_row['sentence2']],
        label=it_row['similarity_score']
    ))
    # The cross-lingual pair takes its label from the English row.
    en_it_samples.append(InputExample(
        texts=[en_row['sentence1'], it_row['sentence2']],
        label=en_row['similarity_score']
    ))

In [45]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# One similarity evaluator per sample set (English, Italian, English-Italian),
# each created with CSV output disabled.
en_eval, it_eval, en_it_eval = (
    EmbeddingSimilarityEvaluator.from_input_examples(samples, write_csv=False)
    for samples in (en_samples, it_samples, en_it_samples)
)

In [47]:
from sentence_transformers import SentenceTransformer

# Reload the model that training saved to disk, then evaluate it on the
# English sample set.
model_dir = './xmlr-ted'
model = SentenceTransformer(model_dir)

en_eval(model)

0.20577197595832455

In [48]:
# Evaluate the model loaded from './xmlr-ted' on the Italian sample set.
it_eval(model)

0.23425532464366874

In [49]:
# Evaluate the model loaded from './xmlr-ted' on the mixed sample set
# (English sentence1 paired with Italian sentence2).
en_it_eval(model)

0.1036000454584871

In [50]:
from sentence_transformers import models

# Build a fresh 'xlm-roberta-base' + mean-pooling model to serve as the
# comparison point. This rebinds `xlmr` and `student`; the evaluation cells
# that follow read `student`.
xlmr = models.Transformer('xlm-roberta-base')
embedding_dim = xlmr.get_word_embedding_dimension()
pooler = models.Pooling(embedding_dim, pooling_mode_mean_tokens=True)

student = SentenceTransformer(modules=[xlmr, pooler])

In [51]:
# Evaluate the freshly constructed `student` on the English sample set.
en_eval(student)

0.47525931826733264

In [52]:
# Evaluate the freshly constructed `student` on the Italian sample set.
it_eval(student)

0.4963748045018903

In [53]:
# Evaluate the freshly constructed `student` on the mixed sample set
# (English sentence1 paired with Italian sentence2).
en_it_eval(student)

0.2297664675626828