In [1]:
from transformers import AutoTokenizer, T5ForConditionalGeneration, AutoModelForSeq2SeqLM, MarianMTModel, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, GPT2Tokenizer
from datasets import  Dataset, load_dataset
import pandas as pd
import torch, gc
import numpy as np
import evaluate
import os
import mlflow

2023-06-03 12:21:28.451392: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-03 12:21:28.905844: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include:/usr/lib/cuda/lib64:
2023-06-03 12:21:28.905895: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include:/usr/lib/cuda/lib64:


In [2]:
model = T5ForConditionalGeneration.from_pretrained('ai-forever/ruT5-base')
tokenizer = AutoTokenizer.from_pretrained('ai-forever/ruT5-base')

In [3]:
raw_dataset = load_dataset('IlyaGusev/ru_turbo_alpaca')

raw_dataset = raw_dataset.remove_columns(['input', 'alternative_output', 'label', 'all_labels', 'agreement', 'overlap'])
raw_dataset = raw_dataset['train'].train_test_split(0.2, 0.8)

No config specified, defaulting to: ru_turbo_alpaca/default
Found cached dataset ru_turbo_alpaca (/home/alan-robotics/.cache/huggingface/datasets/IlyaGusev___ru_turbo_alpaca/default/0.0.1/a2a1f5b065b9e34022f6bc402785c2f5fa791930917ce4f1b8d4e634def7496d)


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
def tokenized_function(example):
    tokenized_snippet = tokenizer(example['output'], truncation=True, max_length=512)
    example['labels'] = tokenized_snippet['input_ids']
    return tokenizer(example['instruction'], truncation=True, max_length=512)


dataset = raw_dataset.map(tokenized_function, batched=True)

dataset = dataset.remove_columns(['instruction', 'output'])

  0%|          | 0/24 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [5]:
gc.collect()
torch.cuda.empty_cache()

In [6]:
bleu = evaluate.load('bleu')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)

    #prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    #result["gen_len"] = np.mean(prediction_lens)
    #{k: round(v, 4) for k, v in result.items()}
    return result

In [7]:
os.environ["MLFLOW_EXPERIMENT_NAME"] = "ruT5-trainer"
os.environ["MLFLOW_FLATTEN_PARAMS"] = "1"

In [8]:
batch_size = 4
num_train_epochs = 2
logging_steps = len(dataset['train']) // batch_size


args = Seq2SeqTrainingArguments(
    output_dir='ruT5-finetuned',
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True
    )

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model,
    args,
    data_collator,
    dataset['train'],
    dataset['test'],
    tokenizer,
    compute_metrics=compute_metrics
)

import os
os.environ["WANDB_DISABLED"] = "true"

In [12]:
trainer.train()

  0%|          | 0/11930 [00:00<?, ?it/s]

{'loss': 2.3775, 'learning_rate': 0.0, 'epoch': 0.08}
{'loss': 2.3516, 'learning_rate': 0.0, 'epoch': 0.17}
{'loss': 2.3957, 'learning_rate': 0.0, 'epoch': 0.25}
{'loss': 2.3747, 'learning_rate': 0.0, 'epoch': 0.34}
{'loss': 2.3941, 'learning_rate': 0.0, 'epoch': 0.42}
{'loss': 2.4017, 'learning_rate': 0.0, 'epoch': 0.5}
{'loss': 2.3643, 'learning_rate': 0.0, 'epoch': 0.59}
{'loss': 2.3541, 'learning_rate': 0.0, 'epoch': 0.67}
{'loss': 2.3853, 'learning_rate': 0.0, 'epoch': 0.75}
{'loss': 2.4126, 'learning_rate': 0.0, 'epoch': 0.84}
{'loss': 2.3964, 'learning_rate': 0.0, 'epoch': 0.92}


  0%|          | 0/1492 [00:00<?, ?it/s]

Trainer is attempting to log a value of "[0.4204769548268746, 0.16815782024933237, 0.09123486939902314, 0.053833417320567276]" of type <class 'list'> for key "eval_precisions" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "[0.4204769548268746, 0.16815782024933237, 0.09123486939902314, 0.053833417320567276]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 2.1888267993927, 'eval_bleu': 0.012522647648255015, 'eval_precisions': [0.4204769548268746, 0.16815782024933237, 0.09123486939902314, 0.053833417320567276], 'eval_brevity_penalty': 0.09173353783512943, 'eval_length_ratio': 0.2950838529384898, 'eval_translation_length': 87220, 'eval_reference_length': 295577, 'eval_runtime': 279.6369, 'eval_samples_per_second': 21.331, 'eval_steps_per_second': 5.335, 'epoch': 1.0}
{'loss': 2.3695, 'learning_rate': 0.0, 'epoch': 1.01}
{'loss': 2.3983, 'learning_rate': 0.0, 'epoch': 1.09}
{'loss': 2.3917, 'learning_rate': 0.0, 'epoch': 1.17}
{'loss': 2.4065, 'learning_rate': 0.0, 'epoch': 1.26}
{'loss': 2.4005, 'learning_rate': 0.0, 'epoch': 1.34}
{'loss': 2.377, 'learning_rate': 0.0, 'epoch': 1.42}
{'loss': 2.3727, 'learning_rate': 0.0, 'epoch': 1.51}
{'loss': 2.3663, 'learning_rate': 0.0, 'epoch': 1.59}
{'loss': 2.3635, 'learning_rate': 0.0, 'epoch': 1.68}
{'loss': 2.3841, 'learning_rate': 0.0, 'epoch': 1.76}
{'loss': 2.3749, 'learning_rat

  0%|          | 0/1492 [00:00<?, ?it/s]

Trainer is attempting to log a value of "[0.4204769548268746, 0.16815782024933237, 0.09123486939902314, 0.053833417320567276]" of type <class 'list'> for key "eval_precisions" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "[0.4204769548268746, 0.16815782024933237, 0.09123486939902314, 0.053833417320567276]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 2.1888267993927, 'eval_bleu': 0.012522647648255015, 'eval_precisions': [0.4204769548268746, 0.16815782024933237, 0.09123486939902314, 0.053833417320567276], 'eval_brevity_penalty': 0.09173353783512943, 'eval_length_ratio': 0.2950838529384898, 'eval_translation_length': 87220, 'eval_reference_length': 295577, 'eval_runtime': 275.6395, 'eval_samples_per_second': 21.641, 'eval_steps_per_second': 5.413, 'epoch': 2.0}
{'train_runtime': 1696.2786, 'train_samples_per_second': 28.129, 'train_steps_per_second': 7.033, 'train_loss': 2.382459232813155, 'epoch': 2.0}


TrainOutput(global_step=11930, training_loss=2.382459232813155, metrics={'train_runtime': 1696.2786, 'train_samples_per_second': 28.129, 'train_steps_per_second': 7.033, 'train_loss': 2.382459232813155, 'epoch': 2.0})

In [14]:
mlflow.end_run() 

In [None]:
%%sh
git config --global user.email "alanrbtx@gmail.com"
git config --global user.name "alanrbtx"
git add mlruns
git commit -m 'Add MLFlow run'
git push https://github.com/alanrbtx/AIsaacChat.git

In [17]:
sentence = tokenizer('Как приготовить лазанью пошагово?', return_tensors='pt').to('cuda')

res = model.generate(**sentence, max_length=100, early_stopping=True)
print(tokenizer.decode(res[0], skip_special_tokens=True))

Для приготовления лазаньи вам понадобятся: - 2 яйца - 2 ст. ложки оливкового масла - 2 ст. ложки оливкового масла - 2 ст. ложки оливкового масла - 2 ст. ложки оливкового масла - 1 ст. ложка оливкового масла - 1 ст. ложка оливкового масла - 1 ст. ложка оливкового масла - 1 ст. ложка оливкового масла - 1 ст. ложка оливков


In [None]:
model.push_to_hub('AlanRobotics/instruct-T5')
tokenizer.push_to_hub('AlanRobotics/instruct-T5')