In [None]:
# Install the Hugging Face `transformers` package into the runtime.
# The version is not pinned; the run recorded below resolved it to
# transformers 4.21.0 (with tokenizers 0.12.1 and huggingface-hub 0.8.1).
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 33.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 21.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 21.9 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninsta

In [None]:
import numpy as np
import pandas as pd
import re
import random

import torch
from tqdm.notebook import tqdm
import transformers

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from transformers import GPT2Tokenizer
# Load the tokenizer that belongs to the ruGPT-3 medium checkpoint.
# On first use its files (vocab.json, merges.txt, config.json) are downloaded,
# as the progress lines below the cell show.
tokenizer = GPT2Tokenizer.from_pretrained('sberbank-ai/rugpt3medium_based_on_gpt2')

Downloading vocab.json:   0%|          | 0.00/1.54M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

В этой ячейке указываем путь к файлу с подготовленным текстом ⬇

In [None]:
import re

# Read the prepared corpus from the mounted Drive folder and squeeze every run
# of two or more newlines into a single newline, so each record sits on one line.
with open('/content/drive/MyDrive/lost/lost.txt', encoding='utf8') as corpus_file:
    text = re.sub('\n{2,}', '\n', corpus_file.read())

# Quick look at the beginning of the cleaned text.
print(text[:300])

"Twitter подал заявление в SEC, они назначили собрание акционеров для голосования по сделке Илона Маска о приобретении за 44 миллиарда долларов на 13 сентября. По сути твиттеряне пытаются подтвердить сделку в сентябре, т.к. основное слушание в суде назначено на октябрь.  "
"Мы привыкли к тому, что ц


In [None]:
# Encode the entire corpus in one call and keep the ids as a NumPy array,
# which the later cells slice and reshape into batches.
tokens = np.array(tokenizer.encode(text, add_special_tokens=True))
print(len(tokens))
tokens[:10]

1097058


array([    6,    56, 12179, 13875,  8348,   282,   715, 23004,    16,
         745])

In [None]:
# Cut the token stream into 15 equal, contiguous chunks. Every fifth chunk
# (indices 0, 5, 10) is held out for validation; the other 12 chunks, in their
# original order, form the training split. Tokens beyond 15 * l (fewer than 15
# of them) are left out of both splits.
n_chunks = 15
l = len(tokens) // n_chunks
chunks = [tokens[k * l: (k + 1) * l] for k in range(n_chunks)]

# np.concatenate joins the array slices directly instead of pushing every
# token through a Python list first, which is slow and memory-hungry for a
# corpus of this size. The resulting arrays are identical.
train = np.concatenate([chunk for k, chunk in enumerate(chunks) if k % 5 > 0])
test = np.concatenate([chunk for k, chunk in enumerate(chunks) if k % 5 == 0])

print(len(tokens), len(train), len(test))

1097058 877644 219411


In [None]:
import gc
# Run a full garbage-collection pass. The number shown below the cell is the
# return value of gc.collect(), i.e. how many unreachable objects it found.
# NOTE(review): presumably done to release memory before the model is loaded
# in the next cell — confirm it is still needed.
gc.collect()

237

In [None]:
# Only the model class is imported here. The optimizer used for training is
# torch.optim.AdamW; transformers' own AdamW is deprecated in favour of it and
# is no longer exported by recent transformers releases, so importing it would
# break this cell on an up-to-date install.
from transformers import GPT2LMHeadModel

# Load the pretrained ruGPT-3 medium weights with a language-modelling head.
# Attention maps and hidden states are switched off so the forward pass only
# returns what training needs.
model = GPT2LMHeadModel.from_pretrained(
    'sberbank-ai/rugpt3medium_based_on_gpt2',
    output_attentions = False,
    output_hidden_states = False,
)

# Move the weights to the selected device; the trailing ';' hides the model repr.
model.to(device);

Downloading pytorch_model.bin:   0%|          | 0.00/1.61G [00:00<?, ?B/s]

In [None]:
# --- Training hyper-parameters ---
batch_size = 4   # sequences per optimisation step
max_len = 256    # tokens per sequence
epochs = 8       # passes over the training split

# Number of whole batches in each split (integer division, so a trailing
# partial batch is not counted).
tokens_per_batch = batch_size * max_len
n_train = len(train) // tokens_per_batch
n_test = len(test) // tokens_per_batch
print(n_train, n_test)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8)

# One scheduler step is taken per training batch, so the schedule spans every
# batch of every epoch.
total_steps = epochs * n_train
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # Default value in run_glue.py
    num_training_steps=total_steps,
)


def accuracy(y_true, logits):
    """Return the next-token prediction accuracy for one batch.

    Parameters
    ----------
    y_true : torch.Tensor
        Token ids of shape (batch, seq_len).
    logits : torch.Tensor
        Model scores of shape (batch, seq_len, vocab). The scores at
        position ``t`` are the prediction for the token at ``t + 1``.

    Returns
    -------
    numpy.ndarray
        0-d float array: the fraction of positions where the arg-max
        prediction equals the actual next token.
    """
    predicted = torch.argmax(logits, dim=2)
    # Shift along the sequence axis (dim 1). Slicing dim 0 instead would
    # compare the predictions for one sequence with the tokens of the next
    # sequence in the batch.
    hits = y_true[:, 1:] == predicted[:, :-1]
    return torch.mean(hits.float()).detach().cpu().numpy()

857 214


In [None]:
def prep_tensors(x, i, batch_size=batch_size, max_len=max_len):
    """Return the i-th batch of token ids as a tensor on ``device``.

    ``x`` is a flat array of token ids. The batch is the contiguous window of
    ``batch_size * max_len`` ids starting at ``i * batch_size * max_len``,
    reshaped to ``(batch_size, max_len)``.
    """
    window = batch_size * max_len
    start = i * window
    chunk = x[start: start + window].reshape(batch_size, max_len)
    return torch.tensor(chunk).to(device)


# Fine-tuning loop: each epoch makes one pass over the training batches with
# gradient updates, then one pass over the held-out batches without gradients.
for epoch in range(1, epochs+1):
    print(f'epoch {epoch}/{epochs} : training')

    train_loss = []
    train_acc = []
    model.train()
    pbar = tqdm(range(n_train))
    for i in pbar:
        batch_ids = prep_tensors(train, i)

        model.zero_grad()
        # The inputs double as labels (causal language modelling). No attention
        # mask is passed because every row is a full max_len window without
        # padding.
        outputs = model(batch_ids,
                        token_type_ids=None,
                        labels=batch_ids)
        # Read the fields by name: unpacking .values() only works while the
        # output holds exactly three entries in a fixed order.
        loss, logits = outputs.loss, outputs.logits

        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        train_loss.append(loss.item())
        train_acc.append(accuracy(batch_ids, logits))
        # Show the running mean over the batches seen so far in this epoch.
        pbar.set_description(f'acc {np.mean(train_acc):.4f} loss {np.mean(train_loss):.4f}', refresh=True)


    print(f'epoch {epoch}/{epochs} : validation')
    model.eval()
    val_acc = []
    val_loss = []
    pbar = tqdm(range(n_test))
    for i in pbar:
        batch_ids = prep_tensors(test, i)
        with torch.no_grad():
            outputs = model(batch_ids,
                            token_type_ids=None,
                            labels=batch_ids)
        loss, logits = outputs.loss, outputs.logits

        val_loss.append(loss.item())
        val_acc.append(accuracy(batch_ids, logits))
        pbar.set_description(f'acc {np.mean(val_acc):.4f} loss {np.mean(val_loss):.4f}', refresh=True)


epoch 1/8 : training


  0%|          | 0/857 [00:00<?, ?it/s]

epoch 1/8 : validation


  0%|          | 0/214 [00:00<?, ?it/s]

epoch 2/8 : training


  0%|          | 0/857 [00:00<?, ?it/s]

epoch 2/8 : validation


  0%|          | 0/214 [00:00<?, ?it/s]

epoch 3/8 : training


  0%|          | 0/857 [00:00<?, ?it/s]

epoch 3/8 : validation


  0%|          | 0/214 [00:00<?, ?it/s]

epoch 4/8 : training


  0%|          | 0/857 [00:00<?, ?it/s]

epoch 4/8 : validation


  0%|          | 0/214 [00:00<?, ?it/s]

epoch 5/8 : training


  0%|          | 0/857 [00:00<?, ?it/s]

epoch 5/8 : validation


  0%|          | 0/214 [00:00<?, ?it/s]

epoch 6/8 : training


  0%|          | 0/857 [00:00<?, ?it/s]

epoch 6/8 : validation


  0%|          | 0/214 [00:00<?, ?it/s]

epoch 7/8 : training


  0%|          | 0/857 [00:00<?, ?it/s]

epoch 7/8 : validation


  0%|          | 0/214 [00:00<?, ?it/s]

epoch 8/8 : training


  0%|          | 0/857 [00:00<?, ?it/s]

epoch 8/8 : validation


  0%|          | 0/214 [00:00<?, ?it/s]

Сохраняем веса модели и при необходимости копируем их на Google Drive

In [None]:
# Write the fine-tuned weights (the state dict only, not the whole model
# object) to the file 'lost_state_dict' in the current working directory.
torch.save(model.state_dict(), 'lost_state_dict')

In [None]:
# Optional: copy the saved weights file into the Drive folder that also holds
# the corpus. NOTE(review): requires Google Drive to be mounted at
# /content/drive — no mount cell is visible in this notebook.
!cp -r lost_state_dict /content/drive/MyDrive/lost/lost_state_dict