In [None]:
!pip install peft
!pip install jsonlines

In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
import torch.utils.data as data
import math
import copy
import os
import re
import random
import shutil
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, PrefixTuningConfig, TaskType
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm


2024-03-09 17:24:58.944244: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-09 17:24:58.944353: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-09 17:24:59.073650: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
device = "cuda"
model_name_or_path = "ai-forever/rugpt3medium_based_on_gpt2"
tokenizer_name_or_path = "ai-forever/rugpt3medium_based_on_gpt2"


max_length = 256
lr = 3e-5
num_epochs = 20
iter_steps = 5000
batch_size = 4
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)

# Dataset

In [None]:
dataset = load_dataset("IlyaGusev/stihi_ru")

In [5]:
from collections import Counter
labels = dataset['train']['author']
label_counts = Counter(labels)


In [6]:
k = 10  
top_k = label_counts.most_common(k)
for value, count in top_k:
    print(f"{value}: {count}")

None: 927782
Носильщик: 10811
Зевс: 8477
Станислав Прохоренко: 6788
Анатолий Смоляр: 4466
Май Тирас: 4258
Феликс Кац: 4188
Халида Шариф: 3930
Сергей Носов 8: 3687
Качалов Игорь: 3651


In [7]:
dataset = dataset['train'][:100000]

In [8]:
train_data = []
test_data = []
for full_txt in dataset['text']:
    full_txt = '<s>'+full_txt+'</s>'
    if random.random()<0.1:
        for i in range(len(full_txt)//2000+1):
            txt = full_txt[2000*i:2000*(i+1)]
            tokenized_txt = tokenizer(txt)['input_ids']
            if len(tokenized_txt) > 100:
                test_data.append(tokenized_txt)
    else:
        for i in range(len(full_txt)//2000+1):
            txt = full_txt[2000*i:2000*(i+1)]
            tokenized_txt = tokenizer(txt)['input_ids']
            if len(tokenized_txt) > 100:
                train_data.append(tokenized_txt)

In [9]:
len(train_data), len(test_data)

(68694, 7545)

In [10]:
class MyDataLoader:
    def __init__(self, data1,  batch_size):
        self.batch_size = batch_size
        self.data =  data1
    
    def __iter__(self):
        return self
    
    def __next__(self):
        x = []
        y = []
        attention_mask = []
        
        batch = [self.data[i] for i in np.random.choice(len(self.data),self.batch_size)]
        
        for input_ids in batch:
            split_token = torch.randint(100, len(input_ids), (1,))
            x_item = input_ids[split_token-max_length:split_token]
            attention_mask_item = [0]*(max_length - len(x_item)) + [1]*len(x_item)
            x_item = [0]*(max_length - len(x_item)) + x_item
            x.append(x_item)
            attention_mask.append(attention_mask_item)
        
        output = {'input_ids': torch.tensor(x).to(device),
                  'attention_mask': torch.tensor(attention_mask).to(device)} 

        return output

    
train_dataloader = MyDataLoader(train_data, batch_size=batch_size)
test_dataloader = MyDataLoader(test_data, batch_size=batch_size)

# Training

In [None]:
peft_config = PrefixTuningConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, num_virtual_tokens=15)

model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
model = model.to(device)

In [12]:
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

In [13]:
batch  = next(iter(train_dataloader))
batch = {k: v.to(device) for k, v in batch.items()}
outputs = model(**batch, labels=batch["input_ids"])
print(tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)[0])

['ustustustustustustustustustustustustustustustustustustustustustustustustabustustust Tra Tra Tra Tra Tra Tra Tra Tra Traйте Tra,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, с,,, с, с,, с,,,,,,,,,,,, с с,, с, с,,, -, с - - - - - -::::::::::::: к к к к к к к к к к к к к к к к к к к к к к к к к к к к к к к к к к к к к к к к к к к к к к к к к к кombombombomb кombombombombombombomb от от от от от от от от от от от,,,,,,,,,,,,,,,,, должны, должны должны должны должны должны должны должны должны должны должны \n не \n))) \n)))))))) \n \n \n))) \n']


In [14]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for _ in range(iter_steps):
        batch  = next(iter(train_dataloader))
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch, labels=batch["input_ids"])
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    model.eval()
    eval_loss = 0
    eval_preds = []
    for _ in range(iter_steps):
        batch  = next(iter(test_dataloader))
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch, labels=batch["input_ids"])
        loss = outputs.loss
        eval_loss += loss.detach().float()
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    eval_epoch_loss = eval_loss / (iter_steps)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / (iter_steps)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
    
    batch  = next(iter(train_dataloader))
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch, labels=batch["input_ids"])
    print(tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)[0])
    print()
    print()

epoch=0: train_ppl=tensor(6537.7910, device='cuda:0') train_epoch_loss=tensor(8.7854, device='cuda:0') eval_ppl=tensor(7.7637, device='cuda:0') eval_epoch_loss=tensor(2.0495, device='cuda:0')
.,,,, много,,,ник,,рину, себеслед, что
,али
, кто,,

ис,би,, не глазахпу,


,,ится, головойенно,
езда,,ёттиь,

 меняка,

неца,ел,
но,,

 в,
 с


 в


epoch=1: train_ppl=tensor(8.3800, device='cuda:0') train_epoch_loss=tensor(2.1258, device='cuda:0') eval_ppl=tensor(6.1479, device='cuda:0') eval_epoch_loss=tensor(1.8161, device='cuda:0')

,

,,
,
,лю,,рил,

 в, глазахет,у,
ить,,
И, и,
алила,,
Их, мной,

 вка,л
, глазахле,И тебя,ой,
Иалось,
,нув,И в, тобой.
И в дня,шьИ меняку,
И не мнойчною..Ижно меня.но
Иишь ври,ки




epoch=2: train_ppl=tensor(7.0820, device='cuda:0') train_epoch_loss=tensor(1.9576, device='cuda:0') eval_ppl=tensor(5.4332, device='cuda:0') eval_epoch_loss=tensor(1.6925, device='cuda:0')



epoch=3: train_ppl=tensor(6.3734, device='cuda:0') train_epoch_loss=tensor(1.8521, device='c

KeyboardInterrupt: 

In [None]:
from huggingface_hub import login
login('')

In [None]:
peft_model_id = "aanosov/rugpt3med_PREFIX_CAUSAL_cont256_20ep_5000st_15tokens_stihi"
model.push_to_hub(peft_model_id , use_auth_token=True)

# Testing

In [25]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
peft_model_id = "aanosov/rugpt3med_PREFIX_CAUSAL_cont256_20ep_5000st_15tokens_stihi"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
model = PeftModel.from_pretrained(model, peft_model_id)
model = model.to(device)

In [26]:
sentence = "Прошла гроза, умылось небо,\nОт туч остались облака,\n"
context_tokens = tokenizer.encode(sentence, add_special_tokens=False)
context = torch.tensor(context_tokens, dtype=torch.long).to(device)
num_samples = 1
context = context.unsqueeze(0).repeat(num_samples, 1)
generated = context

length = 100
temperature = 0.8
with torch.no_grad():
    for _ in range(length):
        outputs = model(generated)
        next_token_logits = outputs[0][:, -1, :] / temperature
        next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1) 
        generated = torch.cat((generated, next_token), dim=1)

out = generated
out = out[:, len(context_tokens):].tolist()
for o in out:
    text = tokenizer.decode(o, clean_up_tokenization_spaces=True)

print(sentence+'|||'+text)

Прошла гроза, умылось небо,
От туч остались облака,
|||
Прошлась по морю вода,
На земле,
Озеро разобилось и лепить,
Темно, звёзды теперь бликует,
Эту ночь,
Остаётся пустеет там,
Все равно свет стоит,
И луна, и звёзды,
Врагу не найти,
Видно, что по ней амчуга
Находится наводнение,
И дырявый барометр
Пасмурно небо,
И


In [28]:
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
model = model.to(device)

In [29]:
sentence = "Прошла гроза, умылось небо,\nОт туч остались облака,\n"
context_tokens = tokenizer.encode(sentence, add_special_tokens=False)
context = torch.tensor(context_tokens, dtype=torch.long).to(device)
num_samples = 1
context = context.unsqueeze(0).repeat(num_samples, 1)
generated = context

length = 100
temperature = 0.8
with torch.no_grad():
    for _ in range(length):
        outputs = model(generated)
        next_token_logits = outputs[0][:, -1, :] / temperature
        next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1) 
        generated = torch.cat((generated, next_token), dim=1)

out = generated
out = out[:, len(context_tokens):].tolist()
for o in out:
    text = tokenizer.decode(o, clean_up_tokenization_spaces=True)

print(sentence+'|||'+text)

Прошла гроза, умылось небо,
От туч остались облака,
|||Чтоб быть все равно не позабыть,
Как я тебя люблю. (с)


28036259	cooke	2016-06-09 13:00:00	"Красное и черное" 

Одна из самых дорогих кинокартин, которые я когда-либо видел. Фильм снят по мотивам одноименного рассказа Мариво.



























# CharF

In [None]:
!pip install jsonlines
!pip install peft

In [None]:
dataset = load_dataset("IlyaGusev/stihi_ru")
dataset = dataset['train'][100000:101000]

In [5]:
import random
from sklearn.metrics import f1_score
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import numpy as np
import torch
batch_size = 1
max_length = 256
device='cuda'

In [148]:
peft_model_id = "aanosov/rugpt3med_PREFIX_CAUSAL_cont256_20ep_5000st_15tokens_stihi"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, peft_model_id)
model = model.to(device)

In [82]:
test_data = []
for full_txt in dataset['text']:
    full_txt = '<s>'+full_txt+'</s>'
    if len(full_txt) > max_length:
        test_data.append(full_txt)

In [85]:
class MyDataLoader:
    def __init__(self, data1,  batch_size):
        self.batch_size = batch_size
        self.data =  data1
    
    def __iter__(self):
        return self
    
    def __next__(self):
        x = []
        
        batch = [self.data[i] for i in np.random.choice(len(self.data),self.batch_size)]
        
        for input_txt in batch:
            split_token = torch.randint(max_length, len(input_txt), (1,))
            x_item = input_txt[split_token-max_length:split_token]
            x.append(x_item)

        return x

test_dataloader = MyDataLoader(test_data, batch_size=batch_size)

In [90]:
a = next(iter(test_dataloader))[0]
a

'чально в сердце ложь.\nПосей любовь внутри себя.\nУслышишь музыку дождя.\nЛюбовь в тебе, любовь во мне.\nОдной мы связкой связаны.\nГде есть я, там будешь ты.\nЕсли сердца единой ниткой связаны.\nБегу я впереди себя.\nСначала делаю.\nА думаю потом,\nспасибо солнышку'

In [92]:
for i in range(1, 10):
    print(a[0:i], '||', a[i])

ч || а
ча || л
чал || ь
чаль || н
чальн || о
чально ||  
чально  || в
чально в ||  
чально в  || с


## Finetuned model

In [151]:
predictions = []
answers = []
for _ in range(100):
    txt = next(iter(test_dataloader))[0]
    for i in range(1, len(txt)):
        task = txt[0:i]
        answer = txt[i]
        tokens = tokenizer(task, add_special_tokens=False, return_tensors="pt")
        for k, v in tokens.items():
            tokens[k]=v.to(device)
        output = tokenizer.decode(model.generate(**tokens, max_new_tokens=1)[0][-1])
        predictions.append(output[0])
        answers.append(answer)
print(f1_score(answers, predictions, average='micro'))

0.24301960784313725


## Original model

In [152]:
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
model = model.to(device)

In [155]:
predictions = []
answers = []
for _ in range(100):
    txt = next(iter(test_dataloader))[0]
    for i in range(1, len(txt)):
        task = txt[0:i]
        answer = txt[i]
        tokens = tokenizer(task, add_special_tokens=False, return_tensors="pt")
        for k, v in tokens.items():
            tokens[k]=v.to(device)
        output = tokenizer.decode(model.generate(**tokens, max_new_tokens=1)[0][-1])
        predictions.append(output[0])
        answers.append(answer)
print(f1_score(answers, predictions, average='micro'))

0.2826666666666667
