In [1]:
!pip install -q sentence-transformers datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import datasets
# Load only the 'train' split of the 'ted_multi' dataset.
ted = datasets.load_dataset('ted_multi', split='train')
# Bare name as the last expression so the notebook displays the dataset summary.
ted

Dataset({
    features: ['translations', 'talk_name'],
    num_rows: 258098
})

In [8]:
# Inspect which fields the 'translations' entry of a single row (index 10) carries.
ted[10]['translations'].keys()

dict_keys(['language', 'translation'])

In [9]:
from tqdm import tqdm
from collections import defaultdict

# Target languages to pair with English; translations in any other language are ignored.
lang_list = ['it', 'es', 'ar', 'fr', 'de']
# Maps "en-<lang>" to a list of "<english>\t<translation>" lines.
train_examples = defaultdict(list)
# Tabs and line breaks inside a sentence would be indistinguishable from the column /
# row separators once the lines are joined into a TSV, so they are mapped to spaces.
_tsv_unsafe = str.maketrans({'\t': ' ', '\n': ' ', '\r': ' '})
for row in tqdm(ted):
  languages = row['translations']['language']
  translations = row['translations']['translation']
  # Without an English side there is no source sentence; skip the row instead of
  # letting list.index() raise ValueError and abort the whole pass.
  if 'en' not in languages:
    continue
  en_idx = languages.index('en')
  src = translations[en_idx].translate(_tsv_unsafe)
  for i, (lang, text) in enumerate(zip(languages, translations)):
    # Never pair the English sentence with itself, and keep only requested languages.
    if i == en_idx or lang not in lang_list:
      continue
    target = text.translate(_tsv_unsafe)
    train_examples[f"en-{lang}"].append(src + '\t' + target)

100%|██████████| 258098/258098 [00:39<00:00, 6521.13it/s]


In [10]:
# Report how many sentence pairs were collected for each language pair.
for lpair, pairs in train_examples.items():
  print(f"lang:{lpair}, num of samples: {len(pairs)}")

lang:en-ar, num of samples: 214111
lang:en-de, num of samples: 167888
lang:en-es, num of samples: 196026
lang:en-fr, num of samples: 192304
lang:en-it, num of samples: 204503


In [11]:
# Show one formatted line; `src` and `target` are whatever the collection loop
# above left bound after its final iteration.
'\t'.join((src, target))

'( Applause )\t( Applausi )'

In [13]:
import gzip
import os

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race
# of testing os.path.exists() before os.mkdir().
os.makedirs('./data', exist_ok=True)

# One gzip-compressed, UTF-8 TSV per language pair; lines are newline-joined with
# no trailing newline.
for lpair, lines in train_examples.items():
  with gzip.open(f"./data/ted-train-{lpair}.tsv.gz", 'wt', encoding='utf-8') as f:
    f.write('\n'.join(lines))


In [None]:
from transformers import AutoTokenizer
# Tokenizer for the 'bert-base-uncased' checkpoint; used below to round-trip the
# sample sentences.
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [25]:
# Sample inputs in several scripts (Latin, Chinese, Greek, Georgian, Tamil) that the
# tokenizer round-trip cells below encode and decode.
sentences = [
    'we will include several languages',
    '一些中文单词',
    'το ελληνικό αλφάβητο είναι πολύ ωραίο',
    'ჩვენ გვაქვს ქართული',
    'தமிழ்நாடு செய்திகள்'
]


In [26]:
# Encode each sample with the BERT tokenizer and decode it again (special tokens
# dropped) so the original and the round-tripped text can be compared by eye.
for text in sentences:
  token_ids = bert_tokenizer.encode(text)
  roundtrip = bert_tokenizer.decode(token_ids, skip_special_tokens=True)
  print('original:', text)
  print('decoded:', roundtrip)
  print('--')

original: we will include several languages
decoded: we will include several languages
--
original: 一些中文单词
decoded: 一 中 文
--
original: το ελληνικό αλφάβητο είναι πολύ ωραίο
decoded: το ελληνικο αλφαβητο ειναι πολυ ωραιο
--
original: ჩვენ გვაქვს ქართული
decoded: 
--
original: தமிழ்நாடு செய்திகள்
decoded: 
--


In [27]:
# Tokenizer for the 'xlm-roberta-base' checkpoint (AutoTokenizer comes from the
# earlier transformers import cell).
xlm_roberta_tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')



In [28]:
# Same encode/decode round trip as for BERT, this time with the XLM-RoBERTa tokenizer.
for text in sentences:
  token_ids = xlm_roberta_tokenizer.encode(text)
  roundtrip = xlm_roberta_tokenizer.decode(token_ids, skip_special_tokens=True)
  print('original:', text)
  print('decoded:', roundtrip)
  print('--')

original: we will include several languages
decoded: we will include several languages
--
original: 一些中文单词
decoded: 一些中文单词
--
original: το ελληνικό αλφάβητο είναι πολύ ωραίο
decoded: το ελληνικό αλφάβητο είναι πολύ ωραίο
--
original: ჩვენ გვაქვს ქართული
decoded: ჩვენ გვაქვს ქართული
--
original: தமிழ்நாடு செய்திகள்
decoded: தமிழ்நாடு செய்திகள்
--


In [35]:
from sentence_transformers import SentenceTransformer, models

# Student model: an XLM-RoBERTa encoder followed by mean pooling over token embeddings.
xlmr = models.Transformer('xlm-roberta-base')
embedding_dim = xlmr.get_word_embedding_dimension()
pooling = models.Pooling(embedding_dim, pooling_mode_mean_tokens=True)
student = SentenceTransformer(modules=[xlmr, pooling])
# Display the assembled module stack.
student



SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [37]:
# Load the pretrained 'all-mpnet-base-v2' checkpoint; it is handed to
# ParallelSentencesDataset as `teacher_model` further down.
teacher = SentenceTransformer('all-mpnet-base-v2')
# Display the module stack for comparison with the student.
teacher



SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [42]:
from sentence_transformers import ParallelSentencesDataset

# Dataset that ties the student and teacher models together; the training files
# are added to it in the next cell via load_data().
data = ParallelSentencesDataset(
    student_model=student,
    teacher_model=teacher,
    batch_size=2,
    use_embedding_cache=True,
)

In [44]:
# Per-file caps passed straight through to load_data().
# NOTE(review): max_sentence_length is presumably measured in characters — confirm
# against the ParallelSentencesDataset.load_data documentation.
max_sentences_per_language = 500000
train_max_sentence_length = 250

# Every file in ./data whose name contains 'train' is treated as a training file.
train_files = [name for name in os.listdir('./data') if 'train' in name]
for filename in train_files:
  print(filename)
  data.load_data(
      './data/' + filename,
      max_sentences=max_sentences_per_language,
      max_sentence_length=train_max_sentence_length,
  )

ted-train-en-es.tsv.gz
ted-train-en-ar.tsv.gz
ted-train-en-fr.tsv.gz
ted-train-en-it.tsv.gz
ted-train-en-de.tsv.gz


In [45]:
# Total number of items in the dataset after all training files were loaded.
len(data)

1798640

In [46]:
from torch.utils.data import DataLoader

# Shuffled batches of two items each drawn from the parallel-sentence dataset.
loader = DataLoader(dataset=data, batch_size=2, shuffle=True)
# Number of batches per epoch.
len(loader)

899320

In [47]:
from sentence_transformers import losses
# Mean-squared-error loss bound to the student model; it is paired with `loader`
# in the fit() call below.
loss_func = losses.MSELoss(model=student)

In [None]:
import torch

epochs = 1
# Warm up the learning rate over the first 10% of all training steps.
total_steps = len(loader) * epochs
warmup_steps = int(total_steps * 0.1)

# NOTE(review): no evaluator is passed, so save_best_model presumably has nothing to
# compare against — confirm against the SentenceTransformer.fit documentation.
student.fit(
    train_objectives=[(loader, loss_func)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    optimizer_params={'lr': 2e-5, 'eps': 1e-6},
    output_path='./xlmr-ted',
    save_best_model=True,
    show_progress_bar=True,
)

In [60]:
# Persist the student to the same directory that was given to fit() as output_path.
student.save('./xlmr-ted')

In [None]:
# Load the 'test' split of 'stsb_multi_mt' for English and Italian.
en, it = (
    datasets.load_dataset('stsb_multi_mt', lang_code, split='test')
    for lang_code in ('en', 'it')
)
# Display both dataset summaries.
en, it

In [62]:
# Peek at the first English test example.
en[0]

{'sentence1': 'A girl is styling her hair.',
 'sentence2': 'A girl is brushing her hair.',
 'similarity_score': 2.5}

In [63]:
# Peek at the first Italian test example.
it[0]

{'sentence1': 'Una ragazza si acconcia i capelli.',
 'sentence2': 'Una ragazza si sta spazzolando i capelli.',
 'similarity_score': 2.5}

In [64]:
def _scale_similarity(example):
  """Return the example's similarity_score divided by 5.0."""
  return {'similarity_score': example['similarity_score'] / 5.0}

# NOTE: `en` and `it` are rebound to their rescaled versions, so running this cell a
# second time divides the scores by 5.0 again.
en = en.map(_scale_similarity)
it = it.map(_scale_similarity)

Map:   0%|          | 0/1379 [00:00<?, ? examples/s]

Map:   0%|          | 0/1379 [00:00<?, ? examples/s]

In [65]:
# Display the English dataset summary after rescaling.
en

Dataset({
    features: ['sentence1', 'sentence2', 'similarity_score'],
    num_rows: 1379
})

In [66]:
# Display the Italian dataset summary after rescaling.
it

Dataset({
    features: ['sentence1', 'sentence2', 'similarity_score'],
    num_rows: 1379
})

In [68]:
from sentence_transformers import InputExample

# Three evaluation sets: English-only pairs, Italian-only pairs, and cross-lingual
# pairs (English sentence1 with Italian sentence2, labelled with the English score).
en_samples = []
it_samples = []
en_it_samples = []

# Rows are paired purely by position, so the two splits must be the same length;
# otherwise pairs would be silently dropped or an IndexError raised mid-loop.
if len(en) != len(it):
  raise ValueError(f"en and it splits differ in length: {len(en)} vs {len(it)}")

# Walk both datasets once instead of indexing en[i] / it[i] for every field.
for en_row, it_row in zip(en, it):
  en_samples.append(InputExample(
      texts=[en_row['sentence1'], en_row['sentence2']],
      label=en_row['similarity_score']))
  it_samples.append(InputExample(
      texts=[it_row['sentence1'], it_row['sentence2']],
      label=it_row['similarity_score']))
  en_it_samples.append(InputExample(
      texts=[en_row['sentence1'], it_row['sentence2']],
      label=en_row['similarity_score']))

In [71]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# One similarity evaluator per sample set: English, Italian, and English-Italian.
en_eval, it_eval, en_it_eval = (
    EmbeddingSimilarityEvaluator.from_input_examples(sample_set)
    for sample_set in (en_samples, it_samples, en_it_samples)
)

In [72]:
# Reload the model from the directory it was saved to above.
model = SentenceTransformer('./xlmr-ted')

In [None]:
%%time
# Score the model reloaded from './xlmr-ted' on the English sample pairs.
en_eval(model)

In [None]:
%%time
# Score the model reloaded from './xlmr-ted' on the Italian sample pairs.
it_eval(model)

In [None]:
%%time
# Score the model reloaded from './xlmr-ted' on the English-Italian sample pairs.
en_it_eval(model)

In [None]:
from sentence_transformers import models

# Rebuild the XLM-RoBERTa + mean-pooling model straight from the base checkpoint.
# NOTE: this rebinds `student`, replacing the object that fit() and save() were
# called on earlier in the notebook.
xlmr = models.Transformer('xlm-roberta-base')
pooler = models.Pooling(xlmr.get_word_embedding_dimension(), pooling_mode_mean_tokens=True)
student = SentenceTransformer(modules=[xlmr, pooler])

In [None]:
# Score `student` on the English pairs; assuming cells run top to bottom, this is the
# model rebuilt from 'xlm-roberta-base' in the preceding cell.
en_eval(student)

In [None]:
# Score `student` on the Italian pairs; assuming cells run top to bottom, this is the
# model rebuilt from 'xlm-roberta-base' two cells above.
it_eval(student)

In [None]:
# Score `student` on the English-Italian pairs; assuming cells run top to bottom, this
# is the model rebuilt from 'xlm-roberta-base' a few cells above.
en_it_eval(student)