In [28]:
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import pickle
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, NllbTokenizer
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from tqdm.auto import trange
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Tartu NLP translation API (Livvi-Karelian `olo` → Russian `rus`)

In [4]:
def get_tartu_trans(text, src='olo', tgt='rus', domain='general', timeout=30):
    """Translate ``text`` with the TartuNLP translation API (v2 endpoint).

    Args:
        text: Source text to translate.
        src: Source language code sent to the API (default ``'olo'``).
        tgt: Target language code sent to the API (default ``'rus'``).
        domain: Translation domain sent to the API (default ``'general'``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The value stored under the ``'result'`` key of the JSON response.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status.
        KeyError: If the JSON response has no ``'result'`` field.
    """
    headers = {
        'accept': 'application/json',
        'Content-Type': 'application/json',
    }
    json_data = {
        'text': text,
        'src': src,
        'tgt': tgt,
        'domain': domain,
        'application': 'Documentation UI',
    }
    response = requests.post(
        'https://api.tartunlp.ai/translation/v2',
        headers=headers,
        json=json_data,
        timeout=timeout,
    )
    # Surface HTTP errors explicitly instead of failing later with an opaque
    # KeyError / JSON decoding error on an error payload.
    response.raise_for_status()
    return response.json()['result']

In [5]:
# Smoke test: send one sentence through the Tartu API and display the result.
get_tartu_trans('Antarktidan pinnan keskikorgevus on kaikkien manderien suurin.')

'Средняя высота поверхности Антарктиды - самая высокая среди всех континентов.'

In [11]:
# Load the corpus to be labelled; every list item is one raw line of the file
# (trailing newline characters are kept and stripped later by the consumers).
file_path = os.path.join('data', 'corpus_to_label.txt')
with open(file_path, encoding='utf-8') as file:  # text read mode is the default
    lines = list(file)

In [15]:
# N bounds how many corpus lines the Tartu loop scans;
# tartu_translates accumulates the translations it receives.
N = 12_000
tartu_translates = []

In [16]:
# Translate up to N corpus lines through the Tartu API, skipping lines that
# consist of exactly one word. The range is clamped to the corpus size so a
# corpus shorter than N does not end in an IndexError.
for line_id in tqdm(range(min(N, len(lines)))):
    text = lines[line_id].strip()
    if len(text.split()) != 1:
        try:
            trans = get_tartu_trans(text)
        except Exception as exc:
            # Catch Exception rather than using a bare ``except`` so that
            # KeyboardInterrupt still stops the cell, and report the cause
            # instead of hiding it.
            print(f'Error on {line_id}: {exc!r}')
            break
        tartu_translates.append(trans)
    # Stop once 10000 translations have been collected.
    if len(tartu_translates) >= 10000:
        break

  0%|          | 0/12000 [00:00<?, ?it/s]

  0%|          | 0/12000 [00:42<?, ?it/s]

Error on 0





# Local models (intended for GPU): NLLB, mBART and dictionary lookup

In [30]:
# Dictionary resources used by the word-level lookup.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
dict_pickle_path = os.path.join('data', 'saved_dictionary.pkl')
with open(dict_pickle_path, 'rb') as f:
    kar_to_rus = pickle.load(f)

# Keys in a fixed list order so that an integer position can be mapped back to
# a dictionary entry.
dict_keys = list(kar_to_rus.keys())

# NOTE(review): presumably one embedding row per entry of dict_keys, in the
# same order; verify against the code that produced the .npy file.
dict_embeds = np.load('data/dict_embeds.npy')

In [20]:
# Local checkpoint directories for the two seq2seq models.
NLLB_NEW_PATH = 'weights/nllb/'
MBART_PATH = 'weights/mbart/'

# NLLB tokenizer + model, both read from the same checkpoint directory.
nllb_tokenizer = NllbTokenizer.from_pretrained(NLLB_NEW_PATH)
nllb_model = AutoModelForSeq2SeqLM.from_pretrained(NLLB_NEW_PATH)

# mBART-50 tokenizer + model, both read from the same checkpoint directory.
mbart_tokenizer = MBart50TokenizerFast.from_pretrained(MBART_PATH)
mbart_model = MBartForConditionalGeneration.from_pretrained(MBART_PATH)

In [45]:

def get_embed(text, model, tokenizer, src_lang, tgt_lang):
    """Return L2-normalised, mean-pooled encoder embeddings for ``text``.

    Args:
        text: A string or a list of strings to embed.
        model: Encoder-decoder model exposing ``get_encoder()`` and ``device``.
        tokenizer: Tokenizer matching ``model``; its ``src_lang`` attribute is
            set to ``src_lang`` before tokenisation.
        src_lang: Source language code applied to the tokenizer.
        tgt_lang: Unused; kept so existing callers keep working. The encoder
            output does not depend on the target language.

    Returns:
        ``numpy.ndarray`` of shape ``(batch, hidden)`` with unit-norm rows.
    """
    # Previously ``src_lang`` was accepted but never applied, so the language
    # tag depended on whatever an earlier ``translate`` call had left on the
    # tokenizer. Set it explicitly to make the result independent of call order.
    tokenizer.src_lang = src_lang
    t = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    # Keep inputs on the same device as the model (as ``translate`` does).
    t = t.to(model.device)
    with torch.inference_mode():
        # Only the encoder states are needed, so call the encoder directly
        # instead of running a full ``generate`` decode pass.
        encoder_out = model.get_encoder()(
            input_ids=t.input_ids,
            attention_mask=t.attention_mask,
            return_dict=True,
        )
        per_token_embeddings = encoder_out.last_hidden_state
        # Mean over real tokens only: padded positions are zeroed by the mask.
        mask = t.attention_mask.unsqueeze(-1).to(per_token_embeddings.dtype)
        embeddings = (per_token_embeddings * mask).sum(1) / mask.sum(1)
        # normalize
        embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings.cpu().numpy()

def batched_embed(texts, batch_size=16, **kwargs):
    """Embed texts in batches of similar length, preserving the input order.

    Texts are sorted by length (longest first) so that each batch needs little
    padding; the resulting vectors are put back into the original order.

    Args:
        texts: Iterable of strings to embed.
        batch_size: Number of texts passed to ``get_embed`` per call.
        **kwargs: Forwarded to ``get_embed`` (model, tokenizer, languages).

    Returns:
        ``numpy.ndarray`` with one row per input text; an empty array with
        zero rows when ``texts`` is empty.
    """
    texts = list(texts)
    if not texts:
        # ``zip(*sorted(...))`` cannot be unpacked for an empty input.
        return np.empty((0, 0))
    # Stable sort, longest first; ``order[k]`` is the original position of the
    # k-th text in the sorted sequence.
    order = sorted(range(len(texts)), key=lambda i: len(texts[i]), reverse=True)
    sorted_texts = [texts[i] for i in order]
    results = []
    for start in trange(0, len(sorted_texts), batch_size):
        results.extend(get_embed(sorted_texts[start: start + batch_size], **kwargs))
    # Undo the length sort.
    restored = [None] * len(texts)
    for original_idx, vector in zip(order, results):
        restored[original_idx] = vector
    return np.array(restored)

def translate(
    text, 
    model, tokenizer,
    src_lang='rus_Cyrl', tgt_lang='eng_Latn', 
    a=32, b=3, max_input_length=1024, num_beams=4, 
    **kwargs
):
    """Translate one text or a list of texts; always returns a list of strings.

    The tokenizer's source and target languages are set from ``src_lang`` and
    ``tgt_lang``. Inputs are truncated to ``max_input_length`` tokens and the
    number of newly generated tokens is capped at ``a + b * input_length``.
    Extra keyword arguments are forwarded to ``model.generate``.
    """
    tokenizer.src_lang, tokenizer.tgt_lang = src_lang, tgt_lang
    inputs = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_input_length,
    )
    # Make sure the model is in inference (non-training) mode.
    model.eval()
    target_bos_id = tokenizer.convert_tokens_to_ids(tgt_lang)
    # Output budget grows linearly with the padded input length.
    output_budget = int(a + b * inputs.input_ids.shape[1])
    generated = model.generate(
        **inputs.to(model.device),
        forced_bos_token_id=target_bos_id,
        max_new_tokens=output_budget,
        num_beams=num_beams,
        **kwargs
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

def batched_translate(texts, batch_size=16, **kwargs):
    """Translate texts in batches of similar length, preserving input order.

    Texts are sorted by length (longest first) so that each batch needs little
    padding; the translations are returned in the original order.

    Args:
        texts: Iterable of strings to translate.
        batch_size: Number of texts passed to ``translate`` per call.
        **kwargs: Forwarded to ``translate`` (model, tokenizer, languages, ...).

    Returns:
        List of translated strings, one per input text; ``[]`` for empty input.
    """
    texts = list(texts)
    if not texts:
        # ``zip(*sorted(...))`` cannot be unpacked for an empty input.
        return []
    # Stable sort, longest first; ``order[k]`` is the original position of the
    # k-th text in the sorted sequence.
    order = sorted(range(len(texts)), key=lambda i: len(texts[i]), reverse=True)
    sorted_texts = [texts[i] for i in order]
    results = []
    for start in trange(0, len(sorted_texts), batch_size):
        results.extend(translate(sorted_texts[start: start + batch_size], **kwargs))
    # Undo the length sort.
    restored = [None] * len(texts)
    for original_idx, translation in zip(order, results):
        restored[original_idx] = translation
    return restored

def get_dict_translate(text):
    """Look up every whitespace-separated word of ``text`` in the dictionary.

    Each word is embedded with the NLLB model and matched to the dictionary
    entry whose embedding has the highest cosine similarity.

    Args:
        text: Source sentence.

    Returns:
        List of ``(source_word, dictionary_value)`` pairs in word order;
        ``[]`` when ``text`` contains no words.
    """
    text_split = text.split()
    if not text_split:
        # Nothing to embed: an empty batch would fail inside the tokenizer.
        return []
    embed_kwargs = dict(
        model=nllb_model,
        tokenizer=nllb_tokenizer,
        src_lang='olo_Latn',
        tgt_lang='rus_Cyrl',
    )
    if len(text_split) > 1:
        embeds = batched_embed(text_split, **embed_kwargs)
    else:
        embeds = get_embed(text_split, **embed_kwargs)
    if embeds.ndim == 1:  # a single vector: promote it to a one-row matrix
        embeds = embeds.reshape(1, -1)
    # Position of the most similar dictionary embedding for each word.
    ids_closest = cosine_similarity(embeds, dict_embeds).argmax(axis=1)
    rus_words = [kar_to_rus[dict_keys[id_closest]] for id_closest in ids_closest]
    pairs = list(zip(text_split, rus_words))
    return pairs

def get_translates(texts):
    """Translate with NLLB and mBART and add word-level dictionary pairs.

    Args:
        texts: A single string, or an iterable of strings.

    Returns:
        Tuple ``(nllb_trans, mbart_trans, pairs)``.
        For a single string, ``nllb_trans`` and ``mbart_trans`` are the lists
        returned by ``translate`` and ``pairs`` is the word-pair list for that
        string. For an iterable, each element is a list with one entry per
        input text; all three lists are empty when no texts are given.
    """
    if isinstance(texts, str):
        nllb_trans = translate(texts, nllb_model, nllb_tokenizer, src_lang='olo_Latn', tgt_lang='rus_Cyrl')
        mbart_trans = translate(texts, mbart_model, mbart_tokenizer, src_lang='fi_FI', tgt_lang='ru_RU')
        pairs = get_dict_translate(texts)
        return nllb_trans, mbart_trans, pairs

    # Materialise once: the texts are traversed three times below, which would
    # exhaust a one-shot iterator after the first pass.
    texts = list(texts)
    if not texts:
        # An empty batch cannot be length-sorted and batched downstream.
        return [], [], []
    nllb_trans = batched_translate(texts, 16, model=nllb_model, tokenizer=nllb_tokenizer,
                                   src_lang='olo_Latn', tgt_lang='rus_Cyrl')
    mbart_trans = batched_translate(texts, 16, model=mbart_model, tokenizer=mbart_tokenizer,
                                    src_lang='fi_FI', tgt_lang='ru_RU')
    pairs = [get_dict_translate(text) for text in texts]
    return nllb_trans, mbart_trans, pairs

In [47]:
# Sample inputs: the first corpus line with surrounding whitespace removed,
# and the first three raw lines (trailing newlines still attached).
orig_sentence = lines[0].strip()
sentences = lines[:3]
sentences

['Nevvostoliitos kehitettih omaluaduine, pehmei taba kuvata liikehty, kudamas tuli muan animacien tunnusmerki.\n',
 'Animaciitehniekkoi.\n',
 'Animaciitehniekkoi on monenluadustu da niilöi voi sežo yhtistellä.\n']

In [48]:
# Run all three translators on the sample sentences selected above.
nllb, mbart, pairs = get_translates(sentences)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [68]:
# Drop lines consisting of exactly one word and strip surrounding whitespace
# from the rest. Lines with zero words (blank lines) are kept as ''.
new_lines = []
for raw_line in lines:
    if len(raw_line.split()) == 1:
        continue
    new_lines.append(raw_line.strip())

In [None]:
# Translate the first N filtered corpus sentences in batches and accumulate
# the outputs of all three translators.
N = 110  # NOTE: reuses the name N from the Tartu section with a new value
nllb_trans, mbart_trans, pairs_trans = list(), list(), list()
batch_size = 16
# Never go past N (the last batch used to overshoot to the next multiple of
# batch_size) nor past the end of the corpus (which produced an empty batch).
n_total = min(N, len(new_lines))
for line_id in tqdm(range(0, n_total, batch_size)):
    sentences = new_lines[line_id:min(line_id + batch_size, n_total)]
    nllb, mbart, pairs = get_translates(sentences)
    nllb_trans.extend(nllb)
    mbart_trans.extend(mbart)
    pairs_trans.extend(pairs)
    # Stop once 10000 sentences have been translated.
    if len(nllb_trans) >= 10000:
        break

In [61]:
# One row per sentence, one column per translator: build the frame with the
# translators as labelled rows, then transpose so the labels become columns.
res = pd.DataFrame([nllb, mbart, pairs], index=['NLLB', 'MBART', 'Pairs']).T
res

Unnamed: 0,NLLB,MBART,Pairs
0,"В бракосочетании разработали своеобразный, мяг...","В совете вырабатывали своеобразный, мягкий обр...","[(Nevvostoliitos, кон — кон (учреждение)), (ke..."
1,Красные приспособления.,runrunrunсерсерваторы.,"[(Animaciitehniekkoi., теплотехник)]"
2,"Животные инженеры - многообразные, и их также ...","runника есть многоразовая, и её можно тоже сое...","[(Animaciitehniekkoi, теплотехник), (on, есть ..."
