# Модель Transformer-1

#### Задание
1. Взять предобученную трансформерную архитектуру и решить задачу перевода
2. (дополнительная, необязательная задача) Взять датасет из datasets для задачи классификации на русском языке, затем взять модель, предобученную на такой задаче классификации, и замерить качество до и после обучения на этом датасете.

In [8]:
# Импорт библиотек

import tensorflow as tf
import transformers
from transformers import pipeline

#### Перевод с английского на немецкий

In [9]:
# Load the "opus_books" dataset for the German/English language pair

from datasets import load_dataset

dataset = load_dataset("opus_books", lang1="de", lang2="en")

Found cached dataset opus_books (/home/zia/.cache/huggingface/datasets/opus_books/de-en-lang1=de,lang2=en/0.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf)


  0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
# Display the available splits, their columns and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 51467
    })
})

In [11]:
# Split the data into training (90%) and test (10%) sets; the seed is fixed
# so that the split is reproducible

split_datasets = dataset["train"].train_test_split(train_size=0.9, seed=17)
split_datasets

Loading cached split indices for dataset at /home/zia/.cache/huggingface/datasets/opus_books/de-en-lang1=de,lang2=en/0.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf/cache-ddfbfb99a6bf2447.arrow and /home/zia/.cache/huggingface/datasets/opus_books/de-en-lang1=de,lang2=en/0.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf/cache-c2c940abbaa9f592.arrow


DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 46320
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 5147
    })
})

In [12]:
# Sample record: the "translation" field of one training example
# (a dict with a 'de' and an 'en' sentence)

split_datasets["train"][1]["translation"]

{'de': '»Sie werden die ägyptischen Pyramiden hinaufklettern!« murmelte er. »Aber annoncieren Sie nur immer auf Ihre eigene Gefahr hin!',
 'en': '"You shall walk up the pyramids of Egypt!" he growled. "At your peril you advertise!'}

In [15]:
# Load the English -> German translation pipeline.
# The checkpoint and revision are pinned explicitly (they are the values the
# task default resolved to when this notebook was run) so that the result does
# not silently change if the library's default model for this task changes.

from transformers import pipeline

trans_lator = pipeline("translation_en_to_de", model="t5-base", revision="686f1db")

No model was supplied, defaulted to t5-base and revision 686f1db (https://huggingface.co/t5-base).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [16]:
# Smoke-test the translation pipeline on a short sentence

trans_lator("How are you?")

[{'translation_text': 'Wie sind Sie?'}]

In [17]:
# Load the pretrained tokenizer that belongs to the translation checkpoint.
# `return_tensors` is an argument of the tokenizer *call*, not of
# `from_pretrained`, so it is not passed here; the cells that need tensors
# request them explicitly when they encode text.

from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-en-de"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

In [18]:
# Take one sentence pair, separate it by language and tokenize each side.
# `text_target=` sends the German sentence through the target-language side
# of the tokenizer; it replaces the deprecated `as_target_tokenizer()`
# context manager.

en_sentence = split_datasets["train"][1]["translation"]["en"]
de_sentence = split_datasets["train"][1]["translation"]["de"]

inputs = tokenizer(en_sentence)
targets = tokenizer(text_target=de_sentence)



In [19]:
# Preprocessing: maximum number of tokens kept for the source (input) and
# target sequences when tokenizing with truncation

max_input_length = 128
max_target_length = 128


def preprocess_function(examples):
    """Tokenize a batch of English/German pairs for seq2seq training.

    Args:
        examples: A batch whose "translation" entry is a list of dicts,
            each holding an "en" (source) and a "de" (target) sentence.

    Returns:
        The tokenizer output for the English sentences, extended with a
        "labels" entry that holds the token ids of the German sentences.
    """
    pairs = examples["translation"]
    inputs = [ex["en"] for ex in pairs]
    targets = [ex["de"] for ex in pairs]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target=` tokenizes with the target-language side of the tokenizer.
    # It replaces the deprecated `as_target_tokenizer()` context manager and
    # is called separately so the target keeps its own length limit.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [20]:
# Apply the preprocessing to both splits in batches; the original columns are
# removed so that only the fields returned by preprocess_function remain
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

Map:   0%|          | 0/46320 [00:00<?, ? examples/s]

Map:   0%|          | 0/5147 [00:00<?, ? examples/s]

In [21]:
# Load the pretrained seq2seq model as a TensorFlow model;
# from_pt=True loads it from the checkpoint's PyTorch weights

from transformers import TFAutoModelForSeq2SeqLM

model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_pt=True)

Downloading pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

2023-05-28 17:44:48.554901: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-05-28 17:44:48.555209: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2023-05-28 17:44:48.555236: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ZIA): /proc/driver/nvidia/version does not exist
2023-05-28 17:44:48.556083: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-28 17:44:49.001337: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 118990848 exceeds 10% o

In [22]:
# Translation of a sample sentence

text = 'As God is my witness, I’ll never be hungry again!'

# Encode the sentence as a TensorFlow tensor and generate the translation
# with beam search (4 beams, at most 40 tokens)
inputs = tokenizer.encode(text, return_tensors="tf")
outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)

In [23]:
# skip_special_tokens=True removes markers such as <pad> and </s>, which
# would otherwise be printed as part of the translation
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

<pad> Da Gott mein Zeuge ist, werde ich nie wieder hungrig sein!</s>


#### Классификация текстов

In [24]:
# Import the classes and load the tokenizer and model of a pretrained
# sentiment classifier.
# NOTE: from this cell on, `tokenizer` and `model` refer to the classifier and
# replace the translation tokenizer/model defined earlier in the notebook.
# `return_tensors` is an argument of the tokenizer *call*, not of
# `from_pretrained`, so it is not passed here.

from transformers import TFAutoModelForSequenceClassification
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('blanchefort/rubert-base-cased-sentiment-rurewiews')
model = TFAutoModelForSequenceClassification.from_pretrained('blanchefort/rubert-base-cased-sentiment-rurewiews', return_dict=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/495 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/712M [00:00<?, ?B/s]

Some layers from the model checkpoint at blanchefort/rubert-base-cased-sentiment-rurewiews were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at blanchefort/rubert-base-cased-sentiment-rurewiews.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [26]:
# Build a text-classification pipeline from the tokenizer and model loaded above

from transformers import pipeline

classifier = pipeline("text-classification", tokenizer = tokenizer, model = model)

In [27]:
# Load the healthcare facilities reviews dataset
# NOTE: this rebinds `dataset`, which previously held the translation corpus

from datasets import load_dataset

dataset = load_dataset("blinoff/healthcare_facilities_reviews")

Downloading builder script:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/784 [00:00<?, ?B/s]

Downloading and preparing dataset healthcare_facilities_reviews/simple to /home/zia/.cache/huggingface/datasets/blinoff___healthcare_facilities_reviews/simple/1.0.0/d61498aa2f506f5e71bb46794c1b010c56c842dd03b36556cb67744a57dc916e...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/95.3M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset healthcare_facilities_reviews downloaded and prepared to /home/zia/.cache/huggingface/datasets/blinoff___healthcare_facilities_reviews/simple/1.0.0/d61498aa2f506f5e71bb46794c1b010c56c842dd03b36556cb67744a57dc916e. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [28]:
# Display the available splits, their columns and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['content', 'title', 'sentiment', 'category', 'review_id', 'source_url', 'Idx'],
        num_rows: 70597
    })
    validation: Dataset({
        features: ['content', 'title', 'sentiment', 'category', 'review_id', 'source_url', 'Idx'],
        num_rows: 70597
    })
})

In [29]:
# Drop the columns that are not needed; 'content' and 'sentiment' are kept

dataset = dataset.remove_columns(['title', 'category', 'review_id', 'source_url', 'Idx'])

In [30]:
# Encode the target: rename 'sentiment' to 'label' and cast it to class labels

dataset = dataset.rename_column('sentiment', 'label')
dataset = dataset.class_encode_column('label')

Casting to class labels:   0%|          | 0/70597 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/70597 [00:00<?, ? examples/s]

In [31]:
# Split the data into training (90%) and test (10%) sets with a fixed seed
# NOTE: this rebinds `split_datasets`, which previously held the translation split

split_datasets = dataset["train"].train_test_split(train_size=0.9, seed=17)
split_datasets

DatasetDict({
    train: Dataset({
        features: ['content', 'label'],
        num_rows: 63537
    })
    test: Dataset({
        features: ['content', 'label'],
        num_rows: 7060
    })
})

In [37]:
# Sample data: print the first three training records on one line

print(*(split_datasets["train"][idx] for idx in range(3)))

{'content': 'При госпитализации в больницу могут предложить услуги посредника! Особенно наглый Корпорация семейной медицины! Сумма госпитализации завышена в разы ( говорят, что берут за курацию), договор дают не на госпитализацию, а общий! Потом еще требуют доплаты хотя в платном отделе больницы счет меньше! Ужас!', 'label': 0} {'content': 'Ужасное отношение, диагноз выдуман, снимок не смог нормально прочитать стоматолог. Не советую! Прием 15.08.15.', 'label': 0} {'content': 'В нашей семье случилась беда. Сын стал наркоманом. Мы узнали о клинике доктора Исаева случайно, т. к. не обладали никакой информацией по этому вопросу. Сына после клиники отправили в центр "Не зависимость", где с ним занималась Марьяна-мониторный психолог. Благодаря её профессионализму, самоотдаче, чётко проведённой методике, душевности. чуткости и заботе наш сын стал совершенно другим человеком. Нет слов, чтобы выразить от всего сердца благодарность Марьяне. Побольше бы таких высококлассных специалистов!Михаил, п

In [33]:
# Check the pretrained classifier on the first three training reviews.
# The row is indexed before the column so that only the needed record is
# read, instead of materialising the whole 'content' column for each lookup.

print(classifier(split_datasets["train"][0]["content"]),
      classifier(split_datasets["train"][1]["content"]),
      classifier(split_datasets["train"][2]["content"]))

[{'label': 'NEGATIVE', 'score': 0.8474008440971375}] [{'label': 'NEGATIVE', 'score': 0.9199402928352356}] [{'label': 'NEGATIVE', 'score': 0.7423567175865173}]


В одном случае из трёх предобученная модель ошиблась: третий отзыв по содержанию положительный (благодарность специалисту), но получил метку NEGATIVE.

In [None]:
# Run the pretrained classifier over every review of the test split.
# Reviews can be longer than the model accepts, so the inputs are truncated.
# NOTE(review): 512 is the usual limit for BERT-style models; confirm against
# the checkpoint's configuration.
data = split_datasets["test"]["content"]
raw_predictions = classifier(data, truncation=True, max_length=512)

In [None]:
# Show only the first few predictions; displaying the full list would put
# one entry per test review into the notebook output
raw_predictions[:10]