In [1]:
from datasets import load_dataset, load_metric


In [2]:
dataset = load_dataset("europa_eac_tm", language_pair=("pl", "en"))

Using custom data configuration pl2en-0da2ec5e9ea613fc
Reusing dataset europa_eac_tm (/home/bartek/.cache/huggingface/datasets/europa_eac_tm/pl2en-0da2ec5e9ea613fc/0.0.0/955b2501a836c2ea49cfe3e719aec65dcbbc3356bbbe53cf46f08406eb77386a)


In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['translation', 'sentence_type'],
        num_rows: 4027
    })
})

In [2]:
metric = load_metric("sacrebleu")

In [5]:
metric

Metric(name: "sacrebleu", features: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}, usage: """
Produces BLEU scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions: The system stream (a sequence of segments).
    references: A list of one or more reference streams (each a sequence of segments).
    smooth_method: The smoothing method to use. (Default: 'exp').
    smooth_value: The smoothing value. Only valid for 'floor' and 'add-k'. (Defaults: floor: 0.1, add-k: 1).
    tokenize: Tokenization method to use for BLEU. If not provided, defaults to 'zh' for Chinese, 'ja-mecab' for
        Japanese and '13a' (mteval) otherwise.
    lowercase: Lowercase the data. If True, enables case-insensitivity. (Default: False).
    force: Insist that your tokenized input is actually detokenized.

Returns:
    'score': BLEU score,
    'counts'

In [5]:
dataset['train'][2]

{'translation': {'en': "The grant application will be processed by computer. All personal data (such as names, addresses, CVs, etc.) will be processed in accordance with Regulation (EC) No 45/2001 of the European Parliament and of the Council of 18 December 2000 on the protection of individuals with regard to the processing of personal data by the Community institutions and bodies and on the free movement of such data. Information provided by the applicants necessary in order to assess their grant application will be processed solely for that purpose by the department responsible for the programme concerned. On the applicant's request, personal data may be sent to the applicant to be corrected or completed. Any question relating to these data, should be addressed to the appropriate Agency to which the form must be submitted. Beneficiaries may lodge a complaint against the processing of their personal data with the European Data Protection Supervisor at anytime.",
  'pl': 'Wniosek o dof

In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [7]:
model_name = "Helsinki-NLP/opus-mt-pl-en"

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [9]:
tokenizer.max_len_single_sentence

511

In [10]:
import os

In [74]:
if not os.path.exists('model_checkpoints/base_model/'):
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
else:
    model = AutoModelForSeq2SeqLM.from_pretrained('model_checkpoints/base_model/')

In [75]:
model.save_pretrained('model_checkpoints/base_model')

In [95]:
# sample_input = dataset['train'][2]['translation']['pl']
# sample_output = dataset['train'][2]['translation']['en']
sample_input = "Jak masz na imię?"
sample_output = "What's your name?"
print(sample_input)
print(sample_output)

Jak masz na imię?
What's your name?


In [96]:
tokenizer.encode_plus(sample_input)

{'input_ids': [295, 669, 22, 2552, 7, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [97]:
print(tokenizer.encode_plus(sample_output))
with tokenizer.as_target_tokenizer():
    print(tokenizer.encode_plus(sample_output))

{'input_ids': [39, 6139, 6, 9, 15, 23, 1943, 22, 1326, 7, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [112, 6, 9, 79, 653, 7, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [98]:
tokenizer.vocab_size

63430

In [99]:
import torch

In [100]:
sample_input = tokenizer(sample_input, return_tensors='pt')
sample_output = tokenizer(sample_output, return_tensors='pt')

In [101]:
sample_input

{'input_ids': tensor([[ 295,  669,   22, 2552,    7,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [108]:
model.eval()
output = model(**sample_input, decoder_input_ids=sample_output.input_ids, labels=sample_output.input_ids)
# output = model.generate(**sample_input)

In [109]:
output = np.argmax(output.logits.detach().cpu().numpy()[0], axis=-1)

In [103]:
output

tensor([[63429,   112,     6,     9,    79,   653,     7,     0]])

In [84]:
output.logits

tensor([[[ 1.8370, -5.2278,  1.4809,  ..., -5.3075, -5.2583,  0.0000],
         [ 1.4559, -6.5947, -0.2945,  ..., -6.3565, -6.2591,  0.0000],
         [ 1.9986, -3.5359,  0.5771,  ..., -3.5303, -3.4807,  0.0000],
         ...,
         [ 0.3046, -8.3837, -0.0799,  ..., -8.3960, -8.4359,  0.0000],
         [10.1125, -3.1985,  3.5722,  ..., -3.1705, -3.2222,  0.0000],
         [ 4.3860, -5.4627,  3.3804,  ..., -5.1392, -4.9693,  0.0000]]],
       grad_fn=<AddBackward0>)

In [85]:
output = output.logits.detach().cpu().numpy()[0]

In [86]:
import numpy as np
output = np.argmax(output, axis=-1)

In [87]:
output = tokenizer.decode(output, skip_special_tokens=True)

In [88]:
sample_output

{'input_ids': tensor([[  39, 6139,    6,    9,   15,   23, 1943,   22, 1326,    7,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [89]:
output == sample_output

False

In [90]:
output

'W?S your?? name??'

In [None]:
model.forward