### Проверка работы модели на новых данных, полученных в результате парсинга онлайн-библиотеки

In [4]:
import pandas as pd
import torch
import torch.nn as nn
import transformers
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW

In [5]:
# Task prefixes; translate_prefix is prepended to each title before it is sent to the model.
translate_prefix = 'translate ru-en: ' # The prefix for the translation task
faculty_classification_prefix = 'classify_faculty: ' # The prefix for the task of defining the faculty
direction_classification_prefix = 'classify_direction: ' # The prefix for the task of determining the direction

# Token length that model inputs are truncated/padded to when tokenized in generate().
optimal_max_length = 100

In [3]:
pip install transformers==4.45.2

Collecting transformers==4.45.2
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20 (from transformers==4.45.2)
  Downloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.45.2-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m63.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
  

In [6]:
def generate(model, tokenizer, input_text):
    """Run the model on one prefixed input string and return the decoded text.

    The function itself is task-agnostic: the task is selected by the prefix
    the caller puts at the start of ``input_text``.

    Args:
        model: Seq2seq model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used both to encode the input and decode the output.
        input_text: Full input string, task prefix included
            (e.g. ``translate_prefix + title``).

    Returns:
        The first returned sequence decoded to a string, special tokens removed.
    """
    model.eval()
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=optimal_max_length,
    )
    # The original call also passed temperature/top_p/top_k. Those options only
    # take effect when sampling is enabled, which it was not, so they were
    # ignored by beam search. They are removed and do_sample=False is stated
    # explicitly; the output is unchanged assuming the model's generation
    # config does not itself enable sampling.
    output_tokens = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=optimal_max_length,  # same value (100) that was hard-coded before
        num_beams=5,
        do_sample=False,
        num_return_sequences=1,
    )
    return tokenizer.decode(output_tokens[0], skip_special_tokens=True)

In [7]:
def translate_excel(model, tokenizer, df):
    """Translate the 'name' value of every row of ``df``.

    Each title is prefixed with ``translate_prefix`` and passed to ``generate``.

    Returns:
        A list of translated strings, in the same order as the rows of ``df``.
    """
    # Lazy generator: each title is read right before it is translated.
    titles = (df.iloc[row_pos]['name'] for row_pos in range(len(df)))
    return [
        generate(model, tokenizer, translate_prefix + title)
        for title in titles
    ]

In [8]:
# Sanity check: show which transformers version is actually loaded in this kernel.
transformers.__version__

'4.45.2'

In [9]:
# NOTE: despite its name, model_name holds the path to a local checkpoint file.
model_name = '/content/multiT5-3tasks-titles_scientific_articles.pth'
tokenizer = T5Tokenizer.from_pretrained("cointegrated/rut5-base-multitask")
# The checkpoint is used directly as a model object later in the notebook, i.e.
# it is a whole pickled model rather than a state_dict, so weights_only=False is
# needed. Passing it explicitly also avoids torch.load's FutureWarning about the
# default changing.
# SECURITY: unpickling can execute arbitrary code -- only load checkpoints that
# come from a trusted source.
model = torch.load(model_name, map_location=torch.device('cpu'), weights_only=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/828k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
  model = torch.load(model_name, map_location=torch.device('cpu'))


In [10]:
# Load the article titles that the model is evaluated on.
test_csv = pd.read_csv('/content/titles_scientific_articles_Leninka.csv')

In [11]:
# Inspect columns, row count and non-null counts of the loaded titles.
test_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    20000 non-null  object
dtypes: object(1)
memory usage: 156.4+ KB


In [15]:
# Number of titles to process. It is defined in this cell too because this cell
# comes before the translation cell in the notebook; without it a fresh
# "Restart & Run All" fails here with NameError. Keep it equal to the `size`
# used for translation so both columns of the result table have the same length.
size = 10
# Take the 'name' column and slice before converting, so only the needed rows
# become Python strings (previously all rows of all columns were converted and flattened).
rus_title = test_csv['name'].iloc[:size].tolist()

In [13]:
size = 10  # how many titles from the top of the file to translate
eng_trans = translate_excel(model=model, tokenizer=tokenizer, df=test_csv.iloc[:size])



Вывод результатов в DataFrame

In [16]:
# Pair each Russian title with its translation; the bare last expression renders the table.
df_trans = pd.DataFrame({'name': rus_title,
                         'translation': eng_trans})
df_trans

Unnamed: 0,name,translation
0,"Онкоиммунология, гемобластозы","Oncoimmunology, hemoblastosis"
1,Изменение функции гипофиз-гонадной системы у б...,Changes in hypophysic-gonad system function in...
2,Влияние некоторых фармакологических препаратов...,Effects of some pharmacological drugs on neuro...
3,Повышение малыми дозами тиреоидных гормонов ус...,Improving small doses of thyroid hormones to r...
4,Потребление ГЛС в рамках программы ОНЛС в 2008 г.,GLS consumption within the framework of the UN...
5,Экспрессия селенсодержащей глутатионпероксидаз...,Expression of selenium-containing glutathion p...
6,Нейроиммунологические аспекты патогенеза детск...,Neuroimmunological aspects of pathogenesis of ...
7,Проблемы внедрения достижений фармакогеномики,Introducing pharmacogenomics achievements: cur...
8,"Поиск и изучение микробных субстанций, ингибир...",Search and study of microbial substrates inhib...
9,Андроген-зависимое влияние м-холинолитика мета...,Androgen-dependent effect of metamisile m-chol...


In [24]:
# Dump the rows as nested lists to read the full, untruncated titles and translations.
df_trans.to_numpy().tolist()

[['Онкоиммунология, гемобластозы', 'Oncoimmunology, hemoblastosis'],
 ['Изменение функции гипофиз-гонадной системы у больных хорионкарциномой матки под влиянием аутогемохимиотерапии',
  'Changes in hypophysic-gonad system function in patients with chorion carcinoma under the influence of autohemochymic therapy'],
 ['Влияние некоторых фармакологических препаратов на активность ферментов обмена нейропептидов при стрессе',
  'Effects of some pharmacological drugs on neuropeptide exchange enzymes in stress'],
 ['Повышение малыми дозами тиреоидных гормонов устойчивости организма к стрессорным воздействиям различной интенсивности и сложности',
  'Improving small doses of thyroid hormones to resistance to stressors of various intensity and complexity'],
 ['Потребление ГЛС в рамках программы ОНЛС в 2008 г.',
  'GLS consumption within the framework of the UNLS programme in 2008'],
 ['Экспрессия селенсодержащей глутатионпероксидазы при канцерогенном действии тетрахлорметана',
  'Expression of se