In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 28.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 13.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 61.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.8.1 tokenizers-0.12.1 transformers-4.21.1


## Обучение модели для ответов на медицинские запросы.
Датасет: https://www.kaggle.com/datasets/coolonce/recipes-and-interpretation-dim

Модель: GPT2

In [3]:
import re

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM

In [4]:
device = "cuda:0" 

In [5]:
df_rec = pd.read_csv('/content/drive/MyDrive/NLP_course_project/all_recepies_inter.csv', sep='\t')
df_rec.head()

Unnamed: 0.1,Unnamed: 0,name,composition,cooking_type,Инструкции,dish_type,Дата,photo,source,composition_inter
0,0,рассольник классический с перловкой и солеными...,"[{'Перловка': 0.1, 'unit': 'стак. (200 мл)'}, ...","варка,жарка",Подготовить указанные ингредиенты для приготов...,первое,05.06.2015,photo_1000menu_1.jpg,https://1000.menu/cooking/33395-rassolnik-s-pe...,"[{'product_id': 4253, 'name_source': 'Перловая..."
1,1,Суп пюре из белокочаной капусты,"[{'Капуста белокочанная': 50.0, 'unit': 'гр'},...",варка,"Необходимые ингредиенты\r\nНарезаем лук, морко...",первое,27.06.2015,photo_1000menu_2.jpg,https://1000.menu/cooking/25399-sup-pure-iz-be...,"[{'product_id': 2286, 'name_source': 'Капуста ..."
2,2,Постные щи из квашеной капусты,"[{'Капуста квашеная': 116.7, 'unit': 'гр'}, {'...","варка,жарка,тушение","Честно признаюсь, у меня не было репы на момен...",первое,12.02.2013,photo_1000menu_3.jpg,https://1000.menu/cooking/5159-postnje-shchi,"[{'product_id': 0, 'name_source': 'Капуста ква..."
3,3,Тюря- простой суп быстро и вкусно,"[{'Квас': 0.2, 'unit': 'л'}, {'Лук репчатый': ...",сырое,"\r\nНачинаем мы приготовление тюри с того, что...",первое,02.03.2011,photo_1000menu_4.jpg,https://1000.menu/cooking/5085-turya,"[{'product_id': 0, 'name_source': 'Квас', 'uni..."
4,4,Фасолевый суп из красной фасоли,"[{'Вода': 0.3, 'unit': 'л'}, {'Картошка': 0.3,...",варка,Подготовить ингредиенты. Для приготовления суп...,первое,28.01.2013,photo_1000menu_5.jpg,https://1000.menu/cooking/38765-fasolevyi-sup-...,"[{'product_id': 828, 'name_source': 'Вода', 'u..."


In [6]:
def build_text_files(data_json, dest_path):
    f = open(dest_path, 'w', encoding='utf-8')
    data = ''
    for texts in data_json:
        summary = str(texts).strip()
        summary = re.sub(r"\s", " ", summary)
        data += summary + "  "
    f.write(data)

In [7]:
train_rec, test_rec = train_test_split(df_rec['Инструкции'], test_size=0.15)

build_text_files(train_rec,'train_rec.txt')
build_text_files(test_rec,'test_rec.txt')

In [8]:
tokenizer_rec = AutoTokenizer.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2")
model_rec = AutoModelForCausalLM.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2")

Downloading config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/1.63M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading pytorch_model.bin:   0%|          | 0.00/526M [00:00<?, ?B/s]

In [9]:
def load_dataset(train_path, test_path, tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)

    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, test_dataset, data_collator

In [10]:
train_dataset_rec, test_dataset_rec, data_collator_rec = load_dataset('train_rec.txt',
                                                          'test_rec.txt',
                                                          tokenizer_rec)



In [11]:
training_args = TrainingArguments(
    output_dir="gpt2_rec", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=5, # number of training epochs
    per_device_train_batch_size=4, # batch size for training
    per_device_eval_batch_size=4,  # batch size for evaluation
    eval_steps = 400, # Number of update steps between two evaluations.
    save_steps=10000, # after # steps model is saved
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    )

In [12]:
trainer_rec = Trainer(
    model=model_rec,
    args=training_args,
    data_collator=data_collator_rec,
    train_dataset=train_dataset_rec,
    eval_dataset=test_dataset_rec
)

In [13]:
trainer_rec.train()

***** Running training *****
  Num examples = 33972
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 42465


Step,Training Loss
500,2.4883
1000,2.4276
1500,2.3622
2000,2.3338
2500,2.3151
3000,2.2954
3500,2.2753
4000,2.257
4500,2.2407
5000,2.2097


Saving model checkpoint to gpt2_rec/checkpoint-10000
Configuration saved in gpt2_rec/checkpoint-10000/config.json
Model weights saved in gpt2_rec/checkpoint-10000/pytorch_model.bin
Saving model checkpoint to gpt2_rec/checkpoint-20000
Configuration saved in gpt2_rec/checkpoint-20000/config.json
Model weights saved in gpt2_rec/checkpoint-20000/pytorch_model.bin
Saving model checkpoint to gpt2_rec/checkpoint-30000
Configuration saved in gpt2_rec/checkpoint-30000/config.json
Model weights saved in gpt2_rec/checkpoint-30000/pytorch_model.bin
Saving model checkpoint to gpt2_rec/checkpoint-40000
Configuration saved in gpt2_rec/checkpoint-40000/config.json
Model weights saved in gpt2_rec/checkpoint-40000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=42465, training_loss=1.9394723935106484, metrics={'train_runtime': 8298.6643, 'train_samples_per_second': 20.468, 'train_steps_per_second': 5.117, 'total_flos': 1.109576613888e+16, 'train_loss': 1.9394723935106484, 'epoch': 5.0})

In [14]:
def get_recipe(text):
  prefix = text
  tokens = tokenizer_rec(prefix, return_tensors='pt').to(device)

  size = tokens['input_ids'].shape[1]
  output = model_rec.generate(
    **tokens, 
    #end_token=end_token_id,
    do_sample=False,
    max_length=100, 
    repetition_penalty=5., 
    temperature=0.5,
    num_beams=10,
    length_penalty=0.1    
  )

  return tokenizer_rec.decode(output[0])

get_recipe('Жареная картошка с мясом')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Жареная картошка с мясом и овощами.  1. Разогрейте духовку до 180 градусов. Смажьте маслом форму для маффинов или положите в каждое отверстие по бумажной формочке.  2. В миске миксером взбейте размягченное сливочное масло, сахар и ваниль до состояния крема. Добавьте яичные желтки и снова хорошо взбейте. Затем вмешайте сухие ингредиенты (муку, разрыхлитель'

In [15]:
tokenizer_rec.save_pretrained('/content/drive/MyDrive/NLP_course_project/tokenizer_rec')
model_rec.save_pretrained('/content/drive/MyDrive/NLP_course_project/model_rec')

tokenizer config file saved in /content/drive/MyDrive/NLP_course_project/tokenizer_rec/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/NLP_course_project/tokenizer_rec/special_tokens_map.json
Configuration saved in /content/drive/MyDrive/NLP_course_project/model_rec/config.json
Model weights saved in /content/drive/MyDrive/NLP_course_project/model_rec/pytorch_model.bin
