# Finetune ruGPT3Small on essays

## Подготавливаем окружение

In [None]:
!pip3 install urllib3==1.25.4

In [None]:
!pip3 install transformers==2.8.0

In [None]:
# Download pretrain_transformers.py, the script executed by the fine-tuning cell further down.
!wget https://raw.githubusercontent.com/buvanenko/finetune_rugpt3/main/pretrain_transformers.py

In [None]:
# Download generate_transformers.py, the script executed by the final (generation) cell.
!wget https://raw.githubusercontent.com/buvanenko/finetune_rugpt3/main/generate_transformers.py

In [None]:
%%writefile setup.sh
# Clone NVIDIA apex and build it with its C++ and CUDA extensions.
# NOTE(review): presumably needed for the --fp16 flag of the training cell — confirm
# against pretrain_transformers.py.

# Abort on the first failing command instead of continuing with a broken state.
set -e

# CUDA toolkit used to compile the apex extensions.
export CUDA_HOME=/usr/local/cuda-10.1

# Re-running the notebook must not fail because ./apex already exists.
[ -d apex ] || git clone https://github.com/NVIDIA/apex
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

In [None]:
# Execute the setup.sh file written by the previous cell (clones and installs NVIDIA apex).
!sh setup.sh

## Пихаем данные для настройки
Для примера настроим модель на датасете со школьными сочинениями.

In [None]:
# Download the example corpus (essays.txt) into the working directory.
!wget https://raw.githubusercontent.com/buvanenko/finetune_rugpt3/main/essays.txt

In [None]:
# Path to the raw corpus file; the data-preparation cell below opens and reads it.
data_path = "essays.txt"

## Подготавливаем данные для настройки

In [None]:
import numpy as np
import random
import nltk

# Number of distinct topics to hold out for validation.
valid_size = 5
# Two topic strings count as "the same topic" when their edit distance is
# strictly below this value.
max_topic_distance = 20

with open(data_path, "r", encoding="utf-8") as file:
    text = file.read()

# Parse the corpus. Records are separated by "</s>"; a record is kept only if
# it contains both the topic marker and exactly one essay marker.
topics = []
all_essays = []
for line in text.split("</s>"):
    if "Тема:" in line and "Сочинение:" in line:
        essay_text = line.split("Сочинение:")
        # More than one essay marker makes the topic/essay boundary ambiguous,
        # so such records are skipped.
        if len(essay_text) == 2:
            # The chunk cannot contain "</s>" any more (it was the split
            # separator), so only the opening tag has to be removed.
            topic = essay_text[0].replace("<s>", " ").strip()
            essay_text = essay_text[1].replace("<s>", " ").strip()
            essay_text = f"Сочинение: {essay_text}"
            essay_res = f"<s>{topic}\n{essay_text}</s>"
            all_essays.append(essay_res)
            topics.append(topic)


def is_same_topic(valid_topic, topic):
    """Return True when `topic` is a near-duplicate of `valid_topic`.

    The two strings are compared as prefixes and as remainders of one
    another, so that a topic which merely extends (or is extended by) a
    held-out topic is also kept out of the training set.

    NOTE: the threshold is absolute. An edit distance never exceeds the
    length of the longer operand, so a topic shorter than
    `max_topic_distance` characters matches every other topic.
    """
    return (
        nltk.edit_distance(valid_topic, topic[:len(valid_topic)]) < max_topic_distance
        or nltk.edit_distance(valid_topic[:len(topic)], topic) < max_topic_distance
        or nltk.edit_distance(valid_topic[len(topic):], topic) < max_topic_distance
        or nltk.edit_distance(valid_topic, topic[len(valid_topic):]) < max_topic_distance
    )


random.seed(1234)
np.random.seed(1234)

# Sort before sampling: the iteration order of a set of strings depends on the
# interpreter's hash seed, so seeding the RNG alone does not make the split
# reproducible across kernel restarts.
unique_topics = sorted(set(topics))

# Sample without replacement so the held-out topics are distinct; the min()
# keeps this working when the corpus has fewer than `valid_size` topics.
valid_topics = random.sample(unique_topics, min(valid_size, len(unique_topics)))

# Decide once per distinct topic (not once per essay) whether it is held out;
# any() stops at the first matching validation topic.
held_out_topics = {
    topic
    for topic in unique_topics
    if any(is_same_topic(valid_topic, topic) for valid_topic in valid_topics)
}

train = []
valid = []
for topic, essay in zip(topics, all_essays):
    if topic in held_out_topics:
        valid.append(essay)
    else:
        train.append(essay)

print(f'{len(valid)}, {len(train)}')

with open("train.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(train))

with open("valid.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(valid))

## Начинаем настраивать модель
Этот код скачает и настроит модель. Получившаяся моделька будет сохранена в папке, указанной в аргументе output_dir.

In [None]:
# Run the downloaded pretrain_transformers.py on the files written by the
# data-preparation cell: train.txt for training, valid.txt for evaluation.
# The starting checkpoint is sberbank-ai/rugpt3small_based_on_gpt2 and the
# output directory is essays_model.
# NOTE(review): the exact meaning of each flag is defined by
# pretrain_transformers.py, which is not part of this notebook — confirm there
# (in particular that --fp16 relies on the apex install from setup.sh).
!python pretrain_transformers.py \
    --output_dir=essays_model \
    --model_type=gpt2 \
    --model_name_or_path=sberbank-ai/rugpt3small_based_on_gpt2 \
    --do_train \
    --train_data_file=train.txt \
    --do_eval \
    --fp16 \
    --eval_data_file=valid.txt \
    --per_gpu_train_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --num_train_epochs 5 \
    --block_size 2048 \
    --overwrite_output_dir

## Проверяем!

In [None]:
# Run the downloaded generate_transformers.py against the essays_model
# directory produced by the fine-tuning cell.
# NOTE(review): --k / --p / --length are presumably the top-k, top-p and output
# length sampling settings — verify against generate_transformers.py.
!python generate_transformers.py \
    --model_type=gpt2 \
    --model_name_or_path=essays_model \
    --k=5 \
    --p=0.95 \
    --length=500