In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00

In [None]:
import os
import csv
import json
from io import StringIO

from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

import torch
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split

In [None]:
# Root of the mounted Google Drive; the data file and all saved
# tokenizer/model directories are looked up and stored here.
GOOGLE_DRIVE_DIR = "/content/drive/MyDrive/"

# Name prefixes of the saved directories; a numeric suffix is appended
# when saving (e.g. "my_tg_model_3").
TOKEN_DIR_NAME, MODEL_DIR_NAME = "my_tg_token", "my_tg_model"

# Checkpoint used for both the tokenizer and the model when no previously
# saved directory is found.
DEFAULT_TOKEN = DEFAULT_MODEL = "tinkoff-ai/ruDialoGPT-medium"

# File name of the training CSV inside GOOGLE_DRIVE_DIR.
DATA_FILE_NAME = "data.csv"

In [None]:
def find_last_directory(base_dir, base_path=None):
    """Find the saved directory with the highest numeric suffix.

    Scans ``base_path`` for directories named ``<base_dir>_<number>`` and
    picks the one with the largest ``<number>``.

    Args:
        base_dir: Name prefix of the directories to look for (the part
            before the trailing ``_<number>``).
        base_path: Directory to scan. Defaults to ``GOOGLE_DRIVE_DIR``.

    Returns:
        A ``(path, number)`` tuple. When no matching directory exists the
        result is ``(None, 0)``, so callers can always unpack two values and
        test the path for truthiness.

    Raises:
        OSError: If ``base_path`` cannot be listed (e.g. it does not exist).
    """
    if base_path is None:
        base_path = GOOGLE_DRIVE_DIR

    prefix = f"{base_dir}_"
    max_epoch = -1
    max_epoch_path = None

    for item in os.listdir(base_path):
        if not item.startswith(prefix):
            continue

        candidate = os.path.join(base_path, item)
        # Only directories are valid checkpoints; a stray file sharing the
        # prefix must not be handed to from_pretrained().
        if not os.path.isdir(candidate):
            continue

        try:
            epoch = int(item.split('_')[-1])
        except ValueError:
            # The suffix is not a number, so this is not one of our saves.
            continue

        if epoch > max_epoch:
            max_epoch, max_epoch_path = epoch, candidate

    if max_epoch_path is None:
        return None, 0
    return max_epoch_path, max_epoch

In [None]:
# Mount Google Drive under /content/drive so that the paths built from
# GOOGLE_DRIVE_DIR become accessible.
from google.colab import drive
from google.colab import files
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Full path of the training CSV on the mounted Drive.
data_file_path = os.path.join(GOOGLE_DRIVE_DIR, DATA_FILE_NAME)

# Fail fast, before any model is loaded, if the data file is missing.
# FileNotFoundError is still an Exception subclass, so a broad handler
# keeps working, while callers can now catch the specific error.
if not os.path.isfile(data_file_path):
    raise FileNotFoundError(f"Файл {data_file_path} не найден. Выполнение прервано.")

In [None]:
# Look for the most recently saved tokenizer. The lookup may signal
# "nothing found" with a bare None; unpacking that directly would raise
# TypeError and the fallback below would never run, so normalise it first.
found_token = find_last_directory(TOKEN_DIR_NAME)
token_dir, last_token = found_token if found_token is not None else (None, 0)

# No saved tokenizer yet: start from the default checkpoint at version 0.
if not token_dir:
    token_dir = DEFAULT_TOKEN
    last_token = 0

tokenizer = AutoTokenizer.from_pretrained(token_dir)

print(f"Токенайзер загружен из {token_dir}")

Токенайзер загружен из /content/drive/MyDrive/my_tg_token_0


In [None]:
# Look for the most recently saved model. The lookup may signal
# "nothing found" with a bare None; unpacking that directly would raise
# TypeError and the fallback below would never run, so normalise it first.
found_model = find_last_directory(MODEL_DIR_NAME)
model_dir, last_epoch = found_model if found_model is not None else (None, 0)

# No saved model yet: start from the default checkpoint at epoch 0.
if not model_dir:
    model_dir = DEFAULT_MODEL
    last_epoch = 0

model = AutoModelForCausalLM.from_pretrained(model_dir)

print(f"Модель загружена из {model_dir}")

Модель загружена из /content/drive/MyDrive/my_tg_model_0


In [None]:
# Read the whole CSV into memory as text. newline='' leaves line endings
# untouched so the csv module can handle newlines inside quoted fields.
with open(data_file_path, mode='r', encoding='utf-8', newline='') as file:
    csv_data = file.read()

def csv_to_sequences(csv_str):
    """Convert CSV text into a list of speaker-tagged dialogue strings.

    The first row is treated as a header and skipped. For every following
    row the first cell is prefixed with ``@@ПЕРВЫЙ@@`` and the second with
    ``@@ВТОРОЙ@@``; cells that are empty or equal to the literal string
    ``"None"`` are left out. The remaining parts are joined with a single
    space. Cells beyond the second are ignored.

    Rows that contribute no text at all (blank lines, rows whose cells are
    all empty/"None") are skipped so they do not turn into empty training
    samples.

    Args:
        csv_str: Comma-separated text with ``"`` as the quote character.

    Returns:
        A list of strings, one per row that contained any text. An empty
        input (not even a header line) yields an empty list.
    """
    speaker_tags = ("@@ПЕРВЫЙ@@", "@@ВТОРОЙ@@")
    csv_reader = csv.reader(StringIO(csv_str), delimiter=',', quotechar='"')

    # Skip the header; the default avoids StopIteration on empty input.
    if next(csv_reader, None) is None:
        return []

    sequences = []
    for row in csv_reader:
        sequence_parts = [
            tag + cell
            for tag, cell in zip(speaker_tags, row)
            if cell and cell != "None"
        ]
        if sequence_parts:
            sequences.append(' '.join(sequence_parts))
    return sequences


# Turn the raw CSV text into tagged dialogue strings and report how many
# were produced.
sequences = csv_to_sequences(csv_str=csv_data)

print(f"Общее количество последовательностей в файле данных: {len(sequences)}")

Общее количество последовательностей в файле данных: 434488


In [None]:
# Maximum number of sequences used for training; only the most recent
# (last) ones are kept.
seq_to_train = 60000

if len(sequences) > seq_to_train:
    # Slice by the configured limit rather than a repeated literal, so that
    # changing seq_to_train actually changes the amount of data kept.
    # A positive start index also behaves correctly for seq_to_train == 0,
    # where a negative slice [-0:] would keep everything.
    sequences = sequences[len(sequences) - seq_to_train:]
    print(f"Оставляем только последние {seq_to_train} записей")

Оставляем только последние 60000 записей


In [None]:
tokenized_data = tokenizer(sequences, truncation=True, padding='max_length', max_length=300, return_tensors="pt")

last_token += 1
token_save_dir = f"/content/{TOKEN_DIR_NAME}_{last_token}/"

tokenizer.save_pretrained(token_save_dir)
print(f"Токенайзер сохранен в {token_save_dir}")

!cp -r {token_save_dir} {GOOGLE_DRIVE_DIR}
print(f"Копия токенайзера сохранена в {GOOGLE_DRIVE_DIR}")

Токенайзер сохранен в /content/my_tg_token_1/
Копия токенайзера сохранена в /content/drive/MyDrive/


In [None]:
# Fixed seed for the train/validation partition. Without it every run draws
# a different split, so a run resumed from a saved checkpoint would validate
# on samples the model was already trained on. The partition is only stable
# across runs as long as the tokenized data itself is the same.
SPLIT_SEED = 42

# Each dataset item is an (input_ids, attention_mask) pair.
dataset = TensorDataset(tokenized_data["input_ids"], tokenized_data["attention_mask"])

# 80% of the samples go to training, the remainder to validation.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Training batches are reshuffled every epoch; validation order is fixed.
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [None]:
# Замораживаем все параметры модели
for param in model.parameters():
    param.requires_grad = False

# Размораживаем параметры головы модели
for param in model.lm_head.parameters():
    param.requires_grad = True

In [None]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Only the head's parameters are handed to the optimizer.
optimizer = optim.AdamW(params=model.lm_head.parameters(), lr=5e-5)

# Plateau scheduler in 'min' mode, i.e. for a metric that should decrease.
scheduler = ReduceLROnPlateau(optimizer, mode='min')

In [None]:
# Release cached GPU memory held by the allocator before moving the model.
torch.cuda.empty_cache()

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Last expression of the cell: moves the model and displays its structure.
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50261, 1024)
    (wpe): Embedding(2048, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50261, bias=False)
)

In [None]:
last_epoch += 1
num_epochs = 5

steps_output = 50

train_losses = []
val_losses = []

model.train()
for epoch in range(last_epoch, last_epoch + num_epochs):
    # Тренировочный цикл
    total_train_loss = 0
    for step, batch in enumerate(train_dataloader):
        inputs, masks = batch
        inputs = inputs.to(device)
        masks = masks.to(device)
        outputs = model(inputs, attention_mask=masks, labels=inputs)
        loss = outputs.loss
        total_train_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if step != 0 and step % steps_output == 0:
            avg_train_loss = total_train_loss / steps_output
            print(f"Train - Epoch: {epoch}, Step: {step}, Loss: {avg_train_loss}")
            train_losses.append(avg_train_loss)
            total_train_loss = 0

    # Валидационный цикл
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            inputs, masks = batch
            inputs = inputs.to(device)
            masks = masks.to(device)
            outputs = model(inputs, attention_mask=masks, labels=inputs)
            total_val_loss += outputs.loss.item()

    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f"Validation - Epoch: {epoch}, Avg Loss: {avg_val_loss}")

    val_losses.append(avg_val_loss)

    model.save_pretrained(f"/content/my_tg_model_{epoch}")
    !cp -r /content/my_tg_model_{epoch}/ /content/drive/MyDrive/

    with open(f'/content/losses_{epoch}.json', 'w') as f:
        json.dump({"train_losses": train_losses, "val_losses": val_losses}, f)
    !cp -r /content/losses_{epoch}.json /content/drive/MyDrive/losses_{epoch}.json

    model.train()

Train - Epoch: 1, Step: 50, Loss: 0.27341154105961324
Train - Epoch: 1, Step: 100, Loss: 0.2821882022870705
Train - Epoch: 1, Step: 150, Loss: 0.2840199191123247
Train - Epoch: 1, Step: 200, Loss: 0.27065788842737676
Train - Epoch: 1, Step: 250, Loss: 0.24455145407474901
Train - Epoch: 1, Step: 300, Loss: 0.26568292509764435
Train - Epoch: 1, Step: 350, Loss: 0.260847893844948
Train - Epoch: 1, Step: 400, Loss: 0.2758875744492616
Train - Epoch: 1, Step: 450, Loss: 0.27446812161009576
Train - Epoch: 1, Step: 500, Loss: 0.2856096728917328
Train - Epoch: 1, Step: 550, Loss: 0.2615465265880064
Train - Epoch: 1, Step: 600, Loss: 0.2608611107204433
Train - Epoch: 1, Step: 650, Loss: 0.2999555806070566
Train - Epoch: 1, Step: 700, Loss: 0.2479170253324446
Train - Epoch: 1, Step: 750, Loss: 0.27138410177081823
Train - Epoch: 1, Step: 800, Loss: 0.3085041203722358
Train - Epoch: 1, Step: 850, Loss: 0.24847603791467918
Train - Epoch: 1, Step: 900, Loss: 0.23770365219563247
Train - Epoch: 1, Step