<a href="https://colab.research.google.com/github/artem-konevskikh/random-colab-notebooks/blob/main/rugpt3_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RuGPT3 Finetuning and text generation

Made by [Artem Konevskikh](https://aiculedssul.net)

In [None]:
#@title Install transformers
!pip install transformers 
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

import torch

# Device that models and input tensors are moved to in later cells.
# Prefer the first CUDA GPU, but fall back to the CPU when no GPU is available
# so that the later `.to(DEVICE)` calls still work on a CPU-only runtime.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
#@title Mount Google Drive
#@markdown Mount Google Drive to save/load finetuned models

from google.colab import drive

# Location inside the Colab filesystem where the Drive contents are attached.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

---

# Finetuning

You can skip this section if you already have a finetuned model.

In [None]:
#@title Load RuGPT3-small model
# Name of the pretrained checkpoint; both the tokenizer and the model are loaded from it.
model_name_or_path = "sberbank-ai/rugpt3small_based_on_gpt2"

tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)

# Load the model first, then move it to the device selected in the setup cell.
model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
model = model.to(DEVICE)

In [None]:
#@title Prepare dataset
dataset_path = "/content/dataset.txt" #@param {"type": "string"}

# Collator for batching examples; mlm=False disables masked-language-model masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training examples are read from the text file at `dataset_path`
# using the loaded tokenizer and a block size of 64.
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=dataset_path,
    block_size=64,
)

In [6]:
#@title Set Params
#@markdown The output directory where model will be saved (you can store it on the drive to reuse it later)
model_dir = "/content/drive/MyDrive/AI/rugpt3" #@param {"type": "string"}
#@markdown Overwrite the content of the output directory
overwrite_output_dir = True #@param {"type": "boolean"}
#@markdown Number of training epochs
num_train_epochs = 80 #@param {"type": "integer"}
#@markdown Batch size for training
per_device_train_batch_size = 32 #@param {"type": "integer"}
#@markdown Batch size for evaluation
per_device_eval_batch_size = 32 #@param {"type": "integer"}
#@markdown Number of warmup steps for learning rate scheduler
warmup_steps = 10 #@param {"type": "integer"}
#@markdown To make "virtual" batch size larger
gradient_accumulation_steps = 16 #@param {"type": "integer"}
#@markdown Learning rate (set smaller learning rate for smaller datasets)
lr = 0.00001 #@param {type:"slider", min:1e-5, max:1e-4, step:4.5e-5}

# Training configuration assembled from the form values above.
# Note: `output_dir` here is the local "./output/" directory; `model_dir` is not
# used in this cell, only by the later save step.
training_args = TrainingArguments(
    output_dir="./output/",
    overwrite_output_dir=overwrite_output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    warmup_steps=warmup_steps,
    gradient_accumulation_steps=gradient_accumulation_steps,
)

# AdamW over all model parameters, using the learning rate chosen in the form.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

# The optimizers tuple is (optimizer, lr scheduler); the scheduler slot is left as None.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    optimizers=(optimizer, None),
)

In [None]:
#@title Run Finetuning
#@markdown This will run the finetuning and save the model after that

# Run training with the configured Trainer; the returned value is kept but not displayed.
train_result = trainer.train()

# Save the trained model into the directory chosen in the parameters cell.
trainer.save_model(output_dir=model_dir)

---

# Generate with finetuned model

In [None]:
#@title Load finetuned model
#@markdown The directory where finetuned model is stored
model_dir = "/content/drive/MyDrive/AI/rugpt3" #@param {"type": "string"}

# The tokenizer is loaded from the base checkpoint name, while the model itself
# is loaded from `model_dir`.
model_name_or_path = "sberbank-ai/rugpt3small_based_on_gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)

model = GPT2LMHeadModel.from_pretrained(model_dir)
model = model.to(DEVICE)



In [None]:
#@title Generate
#@markdown Prompt to continue
text = '' #@param {"type": "string"}
#@markdown Max length of the generated text
max_length = 100 #@param {"type": "integer"}
#@markdown Temperature. Best results in range 0.8-2
temperature = 0.8  #@param {type:"slider", min:0, max:2, step:0.1}

# An empty prompt encodes to zero tokens, so there is nothing to continue from.
# In that case pass None and let generate() start from the model's BOS token
# (assumes the model config defines bos_token_id -- TODO confirm for this checkpoint).
if text:
    input_ids = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
else:
    input_ids = None

# Sample a continuation using the form values above; previously these were
# ignored in favour of hard-coded temperature=1.3 and max_length=30.
out = model.generate(
    input_ids,
    do_sample=True,
    temperature=temperature,
    max_length=max_length,
)

# Only one sequence is generated, so decode the first (and only) row.
generated_text = tokenizer.decode(out[0])
print(generated_text)