In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 35.9 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 13.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 42.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.8.1 tokenizers-0.12.1 transformers-4.21.1


## Обучение модели для ответов на медицинские запросы.
Модель: GPT2

Датасет: медицинские вопросы ответы

In [3]:
import re

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM

In [4]:
device = "cuda:0" 

In [5]:
df_med = pd.read_csv('/content/drive/MyDrive/NLP_course_project/medical_qa_ru_data.csv')
df_med.head()

Unnamed: 0,date,categ,theme,desc,ans,spec10
0,"8 Октября 2017, 11:55",Оториноларингология,Применение Ларипронта.,"Ларипронт 20 талеток,через каждые 2-3 часа.Оче...",Что вы им лечите? Длительность приема Ларипрон...,Отоларинголог
1,"20 Февраля 2019, 13:24",Акушерство,Беременность,"Здравствуйте, я на 7-8 неделе беременности. С ...","Здравствуйте, это может быть признаком раннего...",
2,"17 Марта 2015, 18:31",Другое,гинекология,Здравствуйте месячные должны придти 23 марта в...,Выполните исследование хгч,
3,"13 Января 2019, 19:38",Терапия,Занятия спорта после сдачи крови,"Завтра иду с утра сдавать кровь ТТГ, Т4СВ, Кал...","Можно.;\nЗдравствуйте , да, попейте сладкого ч...",Терапевт
4,"28 Ноября 2017, 21:58",Другое,Таблетки,Мне прописали пить Аллохол. Врач написала пить...,Препарат принимается после еды. Уточните это ...,


In [6]:
df_med['sum'] = df_med['desc'] + df_med['ans']

In [7]:
tokenizer_med = AutoTokenizer.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2")
model_med = AutoModelForCausalLM.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2")

Downloading config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/1.63M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading pytorch_model.bin:   0%|          | 0.00/526M [00:00<?, ?B/s]

In [8]:
def build_text_files(data_json, dest_path):
    f = open(dest_path, 'w', encoding='utf-8')
    data = ''
    for texts in data_json:
        summary = str(texts).strip()
        summary = re.sub(r"\s", " ", summary)
        data += summary + "  "
    f.write(data)

In [9]:
train_med, test_med= train_test_split(df_med['sum'][:30000], test_size=0.15)

build_text_files(train_med,'train_med.txt')
build_text_files(test_med,'test_med.txt')

In [10]:
def load_dataset(train_path, test_path, tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)

    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, test_dataset, data_collator

In [11]:
train_dataset_med, test_dataset_med, data_collator_med = load_dataset('train_med.txt',
                                                                      'test_med.txt',
                                                                      tokenizer_med)



In [12]:
training_args = TrainingArguments(
    output_dir="gpt2_med", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=5, # number of training epochs
    per_device_train_batch_size=4, # batch size for training
    per_device_eval_batch_size=4,  # batch size for evaluation
    eval_steps = 400, # Number of update steps between two evaluations.
    save_steps=10000, # after # steps model is saved
    warmup_steps=2000,# number of warmup steps for learning rate scheduler
    )

In [13]:
trainer_med = Trainer(
    model=model_med,
    args=training_args,
    data_collator=data_collator_med,
    train_dataset=train_dataset_med,
    eval_dataset=test_dataset_med
)

In [14]:
trainer_med.train()

***** Running training *****
  Num examples = 32117
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 40150


Step,Training Loss
500,3.8287
1000,3.6445
1500,3.5827
2000,3.5343
2500,3.5048
3000,3.5
3500,3.4675
4000,3.4329
4500,3.4347
5000,3.4158


Saving model checkpoint to gpt2_med/checkpoint-10000
Configuration saved in gpt2_med/checkpoint-10000/config.json
Model weights saved in gpt2_med/checkpoint-10000/pytorch_model.bin
Saving model checkpoint to gpt2_med/checkpoint-20000
Configuration saved in gpt2_med/checkpoint-20000/config.json
Model weights saved in gpt2_med/checkpoint-20000/pytorch_model.bin
Saving model checkpoint to gpt2_med/checkpoint-30000
Configuration saved in gpt2_med/checkpoint-30000/config.json
Model weights saved in gpt2_med/checkpoint-30000/pytorch_model.bin
Saving model checkpoint to gpt2_med/checkpoint-40000
Configuration saved in gpt2_med/checkpoint-40000/config.json
Model weights saved in gpt2_med/checkpoint-40000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=40150, training_loss=2.9717467174791308, metrics={'train_runtime': 7418.438, 'train_samples_per_second': 21.647, 'train_steps_per_second': 5.412, 'total_flos': 1.048989523968e+16, 'train_loss': 2.9717467174791308, 'epoch': 5.0})

In [22]:
def get_medicine(text):
  prefix = text
  tokens = tokenizer_med(prefix, return_tensors='pt').to(device)

  size = tokens['input_ids'].shape[1]
  output = model_med.generate(
    **tokens, 
    #end_token=end_token_id,
    do_sample=False,
    max_length=30, 
    repetition_penalty=5., 
    temperature=0.5,
    num_beams=10,
    length_penalty=1.5    
  )

  return tokenizer_med.decode(output[0])

get_medicine('Болит спина')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Болит спина,подскажите что это может быть?Возможен остеохондроз пояснично-крестцового отдела позвоночника,покажитесь'

In [23]:
tokenizer_med.save_pretrained('/content/drive/MyDrive/NLP_course_project/tokenizer_med')
model_med.save_pretrained('/content/drive/MyDrive/NLP_course_project/model_med')

tokenizer config file saved in /content/drive/MyDrive/NLP_course_project/tokenizer_med/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/NLP_course_project/tokenizer_med/special_tokens_map.json
Configuration saved in /content/drive/MyDrive/NLP_course_project/model_med/config.json
Model weights saved in /content/drive/MyDrive/NLP_course_project/model_med/pytorch_model.bin
