# Модель Transformer-1

#### Задание
1. Взять предобученную трансформерную архитектуру и решить задачу перевода


In [1]:
# Установка
#!conda install -c conda-forge transformers
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Импорт библиотек

import tensorflow as tf
import transformers
from transformers import pipeline

#### Перевод с английского на немецкий

In [3]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
# Загрузка датасета

from datasets import load_dataset

dataset = load_dataset("opus_books", lang1="de", lang2="en")

Using custom data configuration de-en-lang1=de,lang2=en
Reusing dataset opus_books (/root/.cache/huggingface/datasets/opus_books/de-en-lang1=de,lang2=en/0.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf)


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 51467
    })
})

In [6]:
# Разбивка данных на тренировочную и тестовую выборки

split_datasets = dataset["train"].train_test_split(train_size=0.9, seed=17)
split_datasets

Loading cached split indices for dataset at /root/.cache/huggingface/datasets/opus_books/de-en-lang1=de,lang2=en/0.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf/cache-1f89219ca31eceda.arrow and /root/.cache/huggingface/datasets/opus_books/de-en-lang1=de,lang2=en/0.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf/cache-e423a698f3d1a099.arrow


DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 46320
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 5147
    })
})

In [7]:
# Пример данных

split_datasets["train"][1]["translation"]

{'de': '»Sie werden die ägyptischen Pyramiden hinaufklettern!« murmelte er. »Aber annoncieren Sie nur immer auf Ihre eigene Gefahr hin!',
 'en': '"You shall walk up the pyramids of Egypt!" he growled. "At your peril you advertise!'}

In [8]:
# Загрузка пайплайна

from transformers import pipeline

translator = pipeline("translation_en_to_de")

No model was supplied, defaulted to t5-base (https://huggingface.co/t5-base)
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [9]:
# Проверка работы пайплайна

translator("Where are you now?")

[{'translation_text': 'Wo sind Sie jetzt?'}]

In [10]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
# Загрузка предобученного токенайзера

from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-en-de"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="tf")

Downloading:   0%|          | 0.00/750k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/778k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.21M [00:00<?, ?B/s]



In [13]:
tokenizer.tokenize('Where are you now?')

['▁Where', '▁are', '▁you', '▁now', '?']

In [14]:
tokenizer.tokenize('Wo bist du jetzt?')

['▁Wo', '▁bis', 't', '▁du', '▁jet', 'z', 't', '?']

In [15]:
print(tokenizer('Where are you now?')['input_ids'])

[1653, 48, 41, 280, 31, 0]


In [16]:
print(tokenizer('Wo bist du jetzt?')['input_ids'])

[1502, 159, 46, 143, 21054, 239, 46, 31, 0]


In [19]:
tokenizer.decode([1653, 48, 41, 280, 31, 0])

'▁Where are you▁now?'

In [21]:
tokenizer.decode([1502, 159, 46, 143, 21054, 239, 46, 31, 0])

'Wo bist du▁jetzt?'

In [22]:
split_datasets["train"][3]

{'id': '33766',
 'translation': {'de': 'Sind Sie nicht wohl?« fragte er auf französisch, indem er zu ihr hintrat.',
  'en': "Aren't you well?' he said in French as he came up to her."}}

In [23]:
# Разбивка данных по языкам и токенизация

en_sentence = split_datasets["train"][3]["translation"]["en"]
de_sentence = split_datasets["train"][3]["translation"]["de"]

inputs = tokenizer(en_sentence)
with tokenizer.as_target_tokenizer():
    targets = tokenizer(de_sentence)
targets

{'input_ids': [5044, 42, 51, 1841, 31, 2112, 12515, 110, 37, 29724, 2, 1937, 110, 24, 284, 922, 14190, 3, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [24]:
# Препроцессинг

max_input_length = 128
max_target_length = 128


def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["de"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)#, padding='max_length')

    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)#, padding='max_length')

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [25]:
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)



  0%|          | 0/47 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [26]:
tokenized_datasets["train"][3]

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [29285,
  22,
  46,
  41,
  251,
  31,
  22,
  137,
  466,
  5,
  1572,
  52,
  137,
  1295,
  150,
  12,
  249,
  3,
  0],
 'labels': [5044,
  42,
  51,
  1841,
  31,
  2112,
  12515,
  110,
  37,
  29724,
  2,
  1937,
  110,
  24,
  284,
  922,
  14190,
  3,
  0]}

In [27]:
# Загрузка предобученной модели

from transformers import TFAutoModelForSeq2SeqLM

model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_pt=True)
# from transformers import AutoConfig,TFAutoModelForSeq2SeqLM

# # Download configuration from huggingface.co and cache.
# config = AutoConfig.from_pretrained("t5-base")
# model = TFAutoModelForSeq2SeqLM.from_config(config)

Downloading:   0%|          | 0.00/284M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFMarianMTModel.

All the weights of TFMarianMTModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [31]:
# Перевод

text = 'The greatest glory in living lies not in never falling, but in rising every time we fall'

inputs = tokenizer.encode(text, return_tensors="tf")
outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)

In [34]:
print(tokenizer.decode(outputs[0]))

<pad> Der größte Ruhm im Leben liegt nicht darin, nie zu fallen, sondern darin, jedes Mal aufzusteigen, wenn wir fallen<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
