In [1]:
import dill
import nltk
import torch
import pymorphy3
import pandas as pd
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import f1_score, accuracy_score
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
from collections import Counter
from itertools import chain

In [None]:
import torch.nn as nn

class TransformerBlock(nn.Module):
    def __init__(self, embed, hidden, num_heads=2):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed, num_heads, batch_first=True)
        self.norm1 = nn.LayerNorm(embed)
        self.norm2 = nn.LayerNorm(embed)
        self.ff = nn.Sequential(
            nn.Linear(embed, hidden),
            nn.GELU(),                 # GELU вместо ReLU
            nn.Dropout(0.1),
            nn.Linear(hidden, embed)
        )
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.1)

    def forward(self, x, mask=None):
        # MultiheadAttention принимает key_padding_mask (True = игнорировать токен!)
        attn_out, _ = self.attn(x, x, x, key_padding_mask=~mask.bool() if mask is not None else None)
        x = self.norm1(x + self.dropout1(attn_out))
        x = self.norm2(x + self.dropout2(self.ff(x)))
        return x

class Transformer_Max(nn.Module):
    def __init__(self, vocab_size, embed=128, num_classes=17, hidden=1024, num_heads=2, num_layers=2, max_len=500):
        super().__init__()
        self.tokens = nn.Embedding(vocab_size, embed)
        self.poses = nn.Embedding(max_len, embed)
        self.transes = nn.ModuleList([
            TransformerBlock(embed, hidden, num_heads) for _ in range(num_layers)
        ])
        self.fc = nn.Linear(embed, num_classes)

    def forward(self, x, mask=None):
        batch, seq_len = x.shape
        pos = torch.arange(0, seq_len, dtype=torch.long, device = x.device).unsqueeze(0).expand_as(x)
        x = self.tokens(x) + self.poses(pos)

        # маскируем паддинги
        mask_with_cls = None if mask is None else mask

        # Пропускаем через блоки
        for trans in self.transes:
            x = trans(x, mask=None if mask is None else mask_with_cls)

        x, _ = x.max(dim=1)
        return self.fc(x)

class Transformer(nn.Module):
    def __init__(self, vocab_size, embed=128, num_classes=17, hidden=1024, num_heads=2, num_layers=2, max_len=500):
        super().__init__()
        self.tokens = nn.Embedding(vocab_size+1, embed)
        self.poses = nn.Embedding(max_len+1, embed)
        self.transes = nn.ModuleList([
            TransformerBlock(embed, hidden, num_heads) for _ in range(num_layers)
        ])
        self.fc = nn.Linear(embed, num_classes)
        self.cls = vocab_size

    def forward(self, x, mask=None):
        batch, seq_len = x.shape
        clses = torch.full((batch, 1), self.cls, dtype=torch.long, device=x.device)
        x = torch.cat((clses, x), dim=1)
        pos = torch.arange(0, seq_len+1, dtype=torch.long, device=x.device).unsqueeze(0).expand_as(x)
        x = self.tokens(x) + self.poses(pos)

        # маскируем паддинги
        mask_with_cls = None if mask is None else torch.cat([torch.ones((batch,1), device=x.device), mask], dim=1)

        # Пропускаем через блоки
        for trans in self.transes:
            x = trans(x, mask=None if mask is None else mask_with_cls)

        CLS = x[:, 0, :]
        return self.fc(CLS)

import dill

class Transformer_mix(nn.Module):
    def __init__(self, embed=1024, num_classes=17, hidden=2048, num_heads=16, num_layers=8, max_len=500, path_to_vocab=None, q99=120, classes=None, max_segments=100):
        super().__init__()
        self.vocab = dill.load(open(path_to_vocab, 'rb'))
        self.vocab_size = len(self.vocab)
        self.tokens = nn.Embedding(self.vocab_size+1, embed, padding_idx=0)
        self.poses = nn.Embedding(max_len+1, embed)
        self.segments = nn.Embedding(max_segments, embed)
        self.embed_dropout = nn.Dropout(0.1)
        self.transes = nn.ModuleList(
            TransformerBlock(embed, hidden, num_heads) for _ in range(num_layers)
        )
        self.layernorm = nn.LayerNorm(embed)
        self.fc = nn.Sequential(
            nn.Linear(embed*3, embed*6),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(embed*6, num_classes)
        )
        self.cls = self.vocab_size
        self.q99 = q99
        self.sep_id = self.vocab['<SEP>']
        self.classes = classes

    def predict(self, text, device):
        # Индексы токенов
        indices_tr = [self.vocab.get(word, self.vocab["<UNK>"]) for word in text.split()]
        indices_tr_q99 = indices_tr[:q99]  # обрезка до q99
        # Паддинг
        pad_len = q99 - len(indices_tr_q99)
        indices_tr_q99 = indices_tr_q99 + [self.vocab["<PAD>"]] * pad_len
        # Маска: 1 = реальный токен, 0 = паддинг
        mask = [1] * (len(indices_tr[:q99])) + [0] * pad_len #indices_tr хранит предложения без падинга, так что по его длине максируем [1] и [0] падинги, но отсекаем по q99, если длинный попался
        # В тензоры
        x = torch.tensor(indices_tr_q99, dtype=torch.long).unsqueeze(0).to(device)  # (1, q99)
        mask = torch.tensor(mask, dtype=torch.long).unsqueeze(0).to(device)       # (1, q99)
        # Предсказание
        with torch.no_grad():
            logits = self.forward(x, mask)
            pred = torch.argmax(logits, dim=1).item()
        print(classes[pred])
        return pred

    def forward(self, x, mask=None, return_hidden=False):
        batch, seq = x.shape
        clses = torch.full((batch, 1), self.cls, dtype=torch.long, device=x.device)
        x = torch.cat([clses, x], dim=1)

        pos = torch.arange(0, seq+1, device=x.device).unsqueeze(0).expand(batch, -1)
        # сегменты (находим <SEP>, делаем cumsum)
        #sep_mask = (x == self.sep_id).int()       # где стоят <SEP>, будет 1
        #segments = sep_mask.cumsum(dim=1) % 2 # нумерация сегментов (0,1,2,...) кумулятивной суммой, то есть при каждом попадании единицы будет +1 к сумме, далее значения будут на 1 больше
        #segments = segments.clamp(max=self.segments.num_embeddings - 1) #модель рассчитана на 10 предложений, такой передан параметр, поэтому все предложения после 9 будут иметь индекс 9

        # складываем эмбеддинги
        #x = self.embed_dropout(self.tokens(x) + self.poses(pos) + self.segments(segments))
        x = self.embed_dropout(self.tokens(x) + self.poses(pos))
        # маскируем паддинги
        mask_with_cls = None if mask is None else torch.cat([torch.ones((batch,1), device=x.device), mask], dim=1)

        # Пропускаем через блоки
        for trans in self.transes:
            x = trans(x, mask=None if mask is None else mask_with_cls)
            
        x = self.layernorm(x)

        if return_hidden:
            return x[:, 1:, :]
        
        if mask is not None:
            x_masked = x[:,1:,:] * mask.unsqueeze(-1)   # обнуляем паддинги
            mean_pooling = x_masked.sum(1) / mask.sum(1, keepdim=True).clamp(min=1)
            x_masked_for_max = x[:, 1:, :].masked_fill(mask.unsqueeze(-1) == 0, -1e9) #шумы, прошедшие через модель, по сути фейки, нельзя, чтобы выбрались как максимум вместо реальных данных
            max_pooling, _ = x_masked_for_max.max(1)
        else:
            mean_pooling = x[:,1:,:].mean(1)
            max_pooling, _ = x[:,1:,:].max(1)

        CLS = x[:,0,:]
        out = torch.cat([CLS, mean_pooling, max_pooling], dim=1)
        return self.fc(out)

class Classificator(nn.Module):
    def __init__(self, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim // 2, num_classes)
        )

    def forward(self, x):
        x = self.ff(x)
        return x

from sklearn.metrics import f1_score, accuracy_score

def train(model, model_name:str, optimizer, loss_fn, train_loader, test_loader, scheduler, epochs, device):
    model, loss_fn = model.to(device), loss_fn.to(device)
    losses, f1s, accs = [], [], []
    best_acc, idx, best_idx, grad_norm = 0, 0, 0, 0
    for epoch in range(epochs):
        model.train()
        for x, mask, y in train_loader:
            x, mask, y = x.to(device), mask.to(device), y.to(device, dtype=torch.long)
            pred = model(x, mask)

            loss = loss_fn(pred, y)
            optimizer.zero_grad()
            loss.backward()
            # ограничиваем градиенты перед шагом оптимизатора
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        f1, acc, val_loss = validate(model, loss_fn, test_loader, device)
        model.train()
        scheduler.step(val_loss)
        
        f1s.append(f1)
        accs.append(acc)
        losses.append(val_loss)
        idx += 1
        if acc>best_acc:
          torch.save(model.state_dict(), f'weights_{model_name}.pt')
          best_acc = acc
          best_idx = idx
          print(f"✅ Saved new best model to {f'weights_{model_name}.pt'}")
        print(f"Epoch {epoch+1}/{epochs} | loss={val_loss:.4f}, f1={f1:.4f}, acc={acc:.4f}")
    print(f'best_epoch {best_idx} with acc {best_acc}')
    return losses, f1s, accs


def validate(model, loss_fn, test_loader, device):
    model.eval()
    y_true, y_pred = [], []
    loss_sum, total = 0, 0

    with torch.no_grad():
        for x, mask, y in test_loader:
            x, mask, y = x.to(device), mask.to(device), y.to(device, dtype=torch.long)
            pred = model(x, mask)

            loss = loss_fn(pred, y)
            loss_sum += loss.item()

            preds = pred.argmax(dim=1)
            y_pred.extend(preds.cpu().numpy())
            y_true.extend(y.cpu().numpy())
            total += y.size(0)

    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average="macro")
    return f1, acc, loss_sum / len(test_loader)


In [6]:
with open('anek_2ch_vocab_20000.pkl', 'rb') as file:
    vocab_small = dill.load(file)
def tensoring(xtr,xte,ytr,yte, vocab):    
    sents_tr = [x.split() for x in xtr.tolist()]
    sents_te = [x.split() for x in xte.tolist()]

    indices_tr = [[vocab.get(word, vocab["<UNK>"]) for word in sent] for sent in sents_tr]
    indices_te = [[vocab.get(word, vocab["<UNK>"]) for word in sent] for sent in sents_te]

    length_tr = [len(sent.split()) for sent in xtr.to_list()]
    q99 = int(pd.DataFrame(length_tr).quantile(0.99))
    print(f"q99 длины = {q99}")
    indices_tr_q99 = [sent[:q99] for sent in indices_tr if len(sent) > 0]
    indices_te_q99 = [sent[:q99] for sent in indices_te if len(sent) > 0]

    seqs_tr = [torch.tensor(seq) for seq in indices_tr_q99]
    seqs_te = [torch.tensor(seq) for seq in indices_te_q99]

    padded_tr = pad_sequence(seqs_tr, batch_first=True, padding_value=vocab["<PAD>"])
    padded_te = pad_sequence(seqs_te, batch_first=True, padding_value=vocab["<PAD>"])

    ytr_tensor = torch.tensor(ytr.values, dtype=torch.long)
    yte_tensor = torch.tensor(yte.values, dtype=torch.long)

    # attention_mask: 1 для токена, 0 для паддинга
    tr_mask = (padded_tr != 0).long()
    te_mask = (padded_te != 0).long()

    train_set2 = TensorDataset(padded_tr, tr_mask, ytr_tensor)
    test_set2 = TensorDataset(padded_te, te_mask, yte_tensor)

    train_loader2 = DataLoader(train_set2, batch_size=100)
    test_loader2 = DataLoader(test_set2, batch_size=100)
    return train_loader2, test_loader2
def preprocess(text):
    # токенизация (только слова, без пунктуации)
    tokens = word_tokenize(text, language="russian")
    # лемматизация + удаление стоп-слов и чисел
    sep_tokens = {".", "?", "!", "..."}
    lemmas = []
    for token in tokens:
        if token in sep_tokens:
            lemmas.append("<SEP>")
        elif token.isalpha() and token not in stop_words:
            lemmas.append(morph.parse(token)[0].normal_form)

    return " ".join(lemmas)
def prepare():
    df = pd.read_csv('m18_jokes_dataset.csv')

    df['anekdot'] = df['text'].apply(preprocess)
    df.drop(['text'], axis=1, inplace=True)

    df = df[~df['anekdot'].isna()]
    df = df[df['anekdot'].str.strip() != ""]

    le = LabelEncoder()
    df['temi'] = le.fit_transform(df['theme'])
    df.drop(['theme'],axis=1, inplace=True)
    xtr, xte, ytr, yte = train_test_split(df['anekdot'], df['temi'], test_size=.1, random_state=42, stratify=df['temi'], shuffle=True)
    return xtr, xte, ytr, yte
morph = pymorphy3.MorphAnalyzer()
stop_words = set(stopwords.words("russian")) - {'не', 'ну', 'вот'}

In [13]:
xtr, xte, ytr, yte = prepare()
with open('anek_2ch_vocab_20000.pkl', 'rb') as file:
    vocab_small = dill.load(file)
# if '<MASK>' not in vocab_small:
#     vocab_small['<MASK>'] = len(vocab_small)
train_loader3, test_loader3 = tensoring(xtr,xte,ytr,yte, vocab_small)

q99 длины = 119


  q99 = int(pd.DataFrame(length_tr).quantile(0.99))


In [None]:
import random
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import dill
import pandas as pd
from torch.cuda.amp import autocast, GradScaler
import math

with open('df_2ch.pkl', 'rb') as f:
    df_2ch = dill.load(f)

with open('xtr_with_2ch.pkl', 'rb') as f:
    xtr_with_2ch = dill.load(f)

with open('xte.pkl', 'rb') as f:
    xte = dill.load(f)

with open('anek_2ch_vocab_20000.pkl', 'rb') as file:
    vocab_small = dill.load(file)
    
# with open('anek_2ch_vocab.pkl', 'rb') as file:
#     vocab_with_2ch = dill.load(file)

if '<MASK>' not in vocab_small:
    vocab_small['<MASK>'] = len(vocab_small)
mask_id = vocab_small['<MASK>']
pad_id = vocab_small['<PAD>']
sep_id = vocab_small['<SEP>']
cls_id = len(vocab_small)

def mask_tokens(batch_ids, mask_prob=0.15):
    """
    На вход подаётся батч индексов (batch_size, seq_len)
    Возвращает:
      - masked: тот же тензор, где 15% токенов заменены на <MASK>
      - labels: тот же тензор, но все НЕзамаскированные позиции = -100 (игнорируются в CrossEntropyLoss)
    """
    device = batch_ids.device     # берём устройство входного батча
    masked = batch_ids.clone()    # копия входов
    labels = batch_ids.clone()    # копия входов для обучения
    rand = torch.rand(batch_ids.shape, device=device)    # равномерное распределение [0,1)
    # приводим идентификаторы к тензорам на том же устройстве
    pad_t = torch.tensor(pad_id, device=device)
    cls_t = torch.tensor(cls_id, device=device)
    sep_t = torch.tensor(sep_id, device=device)

    # Выбираем токены, которые заменим на MASK (15%)
    mask_arr = (rand < mask_prob) & (batch_ids != pad_t) & (batch_ids != cls_t) & (batch_ids != sep_t)

    # распределение как в BERT
    # 80% токенов <MASK>
    mask_mask = (torch.rand(batch_ids.shape, device=device) < 0.8) & mask_arr
    masked[mask_mask] = mask_id

    # 10% случайные токены
    rand_replace = (torch.rand(batch_ids.shape, device=device) < 0.1) & mask_arr & ~mask_mask
    random_words = torch.randint(len(vocab_small), batch_ids.shape, device=device)
    masked[rand_replace] = random_words[rand_replace]

    # 10% остаются как есть (уже handled автоматически)

    # Все не замаскированные метки делаем -100 (чтобы не учитывались в loss)
    labels[~mask_arr] = -100

    return masked, labels  # форма (batch_size, seq_len)

# Формируем корпус для самопредобучения: анекдоты + 2ch
#texts_unsup = pd.concat([df_2ch, df['anekdot']]).drop_duplicates().tolist()

# Замена на индексы
tokenized_unsup = xtr_with_2ch.copy()#[[vocab_small.get(token, vocab_small['<UNK>']) for token in text]for text in df_with_2ch]
val_set = xte.copy()
# Обрезание до q99 и тензоризация
lengths = [len(sent) for sent in tokenized_unsup]
quantile99 = int(pd.DataFrame(lengths).quantile(0.99))
tokenized_unsup = [torch.tensor(sent[:quantile99]) for sent in tokenized_unsup if len(sent)>0]
val_set = [torch.tensor(sent[:quantile99]) for sent in val_set if len(sent)>0]
# Паддинг до максимальной длины внутри батча
padded_unsup = pad_sequence(tokenized_unsup, batch_first=True, padding_value=pad_id)
padded_val = pad_sequence(val_set, batch_first=True, padding_value=pad_id)

# (batch_size, seq_len) теперь все строки одинаковой длины

# Attention mask: 1 для токенов, 0 для паддингов
mask_unsup = (padded_unsup != pad_id).long()
mask_val = (padded_val != pad_id).long()
# (batch_size, seq_len)

# Делаем датасет
unsup_set = TensorDataset(padded_unsup, mask_unsup)
unsup_loader = DataLoader(unsup_set, batch_size=32, shuffle=True, num_workers=2)
val_tenset = TensorDataset(padded_val, mask_val)
val_loader = DataLoader(val_tenset, batch_size=32, num_workers=2)
# В каждом батче:
#   x.shape = (64, seq_len)
#   mask.shape = (64, seq_len)

# Добавляем новую "голову" для MLM. У модели добавится второй выход. Первый так и остается для классификации, второй для восстановления замаскированных токенов

# У модели Transformer_mix выход из self.fc имеет вид (batch_size, num_classes)
# Но нам нужен выход по токенам для каждого токена предсказать слово.
# Поэтому добавляем линейный слой fc_mlm, который будет восстанавливать исходный токен.
# Его вход скрытое представление каждого токена (embed_dim), выход размер словаря (len(vocab)).
model_class_mix2 = Transformer_mix(len(vocab_small), q99=quantile99, path_to_vocab='anek_2ch_vocab_20000.pkl')
embed = model_class_mix2.tokens.embedding_dim
model_class_mix2.fc_mlm = nn.Sequential(nn.Dropout(0.1), nn.Linear(embed, len(vocab_small)))
# аккуратно копируем веса (без CLS-вектора)
with torch.no_grad():
    model_class_mix2.fc_mlm[1].weight.copy_(model_class_mix2.tokens.weight[:-1]) # копируем веса эмбеддингов словаря из этапа "чтения модели" в веса этапа "угадывания токенов", потому что это действие обратное пониманию. Linear лежит вторым в Sequential

for p in model_class_mix2.fc.parameters():
    p.requires_grad = False # замораживаем веса для классификатора, мы обучаем вторую голову

# fc[0].in_features // 3 потому что в модели fc принимает concat из CLS + mean + max
#   (embed * 3), так что одно "ядро" embed
# Размеры:
#   logits.shape = (batch_size, seq_len, vocab_size)
# теперь модель высчитывает два результата, она годится и для классификации, и для генерации текста, по желанию можно вызвать результат model.fc через model(x, attn_mask), это основной выход, потому что он указан в return, или вернуть model.fc_mlm через через model(x, attn_mask, return_hidden=True)

mlm_criterion = nn.CrossEntropyLoss(ignore_index=-100)  # не считаем loss на не-масках
no_decay = ["bias", "LayerNorm.weight", "layernorm.weight", "norm.weight"]
param_groups = [
    {"params": [p for n,p in model_class_mix2.named_parameters()
                if not any(nd in n for nd in no_decay)],
     "weight_decay": 5e-5},
    {"params": [p for n,p in model_class_mix2.named_parameters()
                if any(nd in n for nd in no_decay)],
     "weight_decay": 0.0},
]
optimizer = torch.optim.AdamW(param_groups, lr=5e-5, betas=(0.9, 0.98), eps=1e-8)

# Обучение Masked Language Model
epochs = 16
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")

  quantile99 = int(pd.DataFrame(lengths).quantile(0.99))


In [None]:
from transformers import get_linear_schedule_with_warmup
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=4000, num_training_steps=len(unsup_loader)*epochs
)

def train_mlm(model, loss_fn, optimizer, train_loader, val_loader, device, scheduler, epochs=5):
    
    # Настройка scheduler
    # num_training_steps = len(train_loader) * epochs
    # num_warmup_steps = int(num_training_steps * warmup_ratio)
    # scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)
    
    # Подготовка 
    model.to(device)
    model.train()
    best_val_acc = 0

    # Предсоздание маскированной валидации (фиксированной). Мы должны использовать одинаково замаскированные входы на треине и валидации для стабильности обучения
    val_data = []
    for x, attn_mask in val_loader:
        x, attn_mask = x.to(device), attn_mask.to(device)
        masked_x, labels = mask_tokens(x)
        val_data.append((x, attn_mask, masked_x, labels))

    for epoch in range(epochs):
        total_loss, total_acc, grad_norm_sum = 0, 0, 0

        for x, attn_mask in tqdm(train_loader, desc=f"MLM Epoch {epoch+1}/{epochs}"):
            x, attn_mask = x.to(device), attn_mask.to(device)
            masked_x, labels = mask_tokens(x)

            # Forward
            hiddens = model(masked_x, attn_mask, return_hidden=True)
            logits = model.fc_mlm(hiddens)
            loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))

            optimizer.zero_grad()
            loss.backward()
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            grad_norm_sum += grad_norm.item()
            optimizer.step()
            scheduler.step()

            # Метрики 
            total_loss += loss.item()
            with torch.no_grad():
                preds = logits.argmax(dim=-1)
                mask_pos = labels != -100
                total_acc += (preds[mask_pos] == labels[mask_pos]).float().mean().item()
        print("lr=", optimizer.param_groups[0]["lr"])
        # Усреднение
        avg_loss = total_loss / len(train_loader)
        avg_acc = total_acc / len(train_loader)
        avg_grad = grad_norm_sum / len(train_loader)
        print(f"Train | Epoch {epoch+1}/{epochs} | Loss: {avg_loss:.4f} | MaskedAcc: {avg_acc:.4f} | GradNorm: {avg_grad:.3f}")

        # Валидация
        val_loss, val_acc = validate_mlm(model, loss_fn, val_data, device)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'weights_mlm_1024.pt')
            print(f"✅ Saved new best model (val_acc={val_acc:.4f})")

    print(f"\nTraining finished. Best val_acc: {best_val_acc:.4f}, Final_loss {val_loss}")
    
def validate_mlm(model, loss_fn, val_data, device):
    model.eval()
    total_loss, total_acc = 0, 0

    with torch.no_grad():
        for x, attn_mask, masked_x, labels in tqdm(val_data, desc="Validating"):
            hiddens = model(masked_x, attn_mask, return_hidden=True)
            logits = model.fc_mlm(hiddens)
            loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))
            total_loss += loss.item()

            preds = logits.argmax(dim=-1)
            mask_pos = labels != -100
            total_acc += (preds[mask_pos] == labels[mask_pos]).float().mean().item()

    avg_loss = total_loss / len(val_data)
    avg_acc = total_acc / len(val_data)
    print(f"Validation | Loss: {avg_loss:.4f} | MaskedAcc: {avg_acc:.4f}")
    model.train()
    return avg_loss, avg_acc

In [12]:
train_mlm(model_class_mix2, mlm_criterion, optimizer, unsup_loader, val_loader, device, scheduler, epochs)

MLM Epoch 1/16: 100%|██████████| 11234/11234 [26:09<00:00,  7.16it/s]


lr= 4.7941892753095415e-05
Train | Epoch 1/16 | Loss: 40.0899 | MaskedAcc: 0.1874 | GradNorm: 89.572


Validating: 100%|██████████| 54/54 [00:03<00:00, 17.92it/s]


Validation | Loss: 16.7842 | MaskedAcc: 0.3065
✅ Saved new best model (val_acc=0.3065)


MLM Epoch 2/16: 100%|██████████| 11234/11234 [27:45<00:00,  6.75it/s]


lr= 4.474576656955572e-05
Train | Epoch 2/16 | Loss: 12.2023 | MaskedAcc: 0.2311 | GradNorm: 38.699


Validating: 100%|██████████| 54/54 [00:03<00:00, 17.86it/s]


Validation | Loss: 6.0039 | MaskedAcc: 0.3083
✅ Saved new best model (val_acc=0.3083)


MLM Epoch 3/16: 100%|██████████| 11234/11234 [27:39<00:00,  6.77it/s]


lr= 4.154964038601602e-05
Train | Epoch 3/16 | Loss: 5.9601 | MaskedAcc: 0.3040 | GradNorm: 23.891


Validating: 100%|██████████| 54/54 [00:03<00:00, 17.94it/s]


Validation | Loss: 5.8351 | MaskedAcc: 0.3093
✅ Saved new best model (val_acc=0.3093)


MLM Epoch 4/16: 100%|██████████| 11234/11234 [27:42<00:00,  6.76it/s]


lr= 3.835351420247633e-05
Train | Epoch 4/16 | Loss: 5.8094 | MaskedAcc: 0.3071 | GradNorm: 19.184


Validating: 100%|██████████| 54/54 [00:02<00:00, 18.36it/s]


Validation | Loss: 5.7066 | MaskedAcc: 0.3075


MLM Epoch 5/16: 100%|██████████| 11234/11234 [27:36<00:00,  6.78it/s]


lr= 3.515738801893664e-05
Train | Epoch 5/16 | Loss: 5.7423 | MaskedAcc: 0.3088 | GradNorm: 17.310


Validating: 100%|██████████| 54/54 [00:03<00:00, 17.96it/s]


Validation | Loss: 5.6294 | MaskedAcc: 0.3063


MLM Epoch 6/16: 100%|██████████| 11234/11234 [27:36<00:00,  6.78it/s]


lr= 3.1961261835396946e-05
Train | Epoch 6/16 | Loss: 5.6771 | MaskedAcc: 0.3110 | GradNorm: 16.886


Validating: 100%|██████████| 54/54 [00:02<00:00, 18.24it/s]


Validation | Loss: 5.5630 | MaskedAcc: 0.3106
✅ Saved new best model (val_acc=0.3106)


MLM Epoch 7/16: 100%|██████████| 11234/11234 [27:41<00:00,  6.76it/s]


lr= 2.876513565185725e-05
Train | Epoch 7/16 | Loss: 5.6368 | MaskedAcc: 0.3117 | GradNorm: 17.200


Validating: 100%|██████████| 54/54 [00:03<00:00, 17.97it/s]


Validation | Loss: 5.4825 | MaskedAcc: 0.3120
✅ Saved new best model (val_acc=0.3120)


MLM Epoch 8/16: 100%|██████████| 11234/11234 [27:36<00:00,  6.78it/s]


lr= 2.5569009468317557e-05
Train | Epoch 8/16 | Loss: 5.5830 | MaskedAcc: 0.3144 | GradNorm: 15.704


Validating: 100%|██████████| 54/54 [00:03<00:00, 17.92it/s]


Validation | Loss: 5.4316 | MaskedAcc: 0.3100


MLM Epoch 9/16: 100%|██████████| 11234/11234 [27:37<00:00,  6.78it/s]


lr= 2.237288328477786e-05
Train | Epoch 9/16 | Loss: 5.5377 | MaskedAcc: 0.3157 | GradNorm: 15.628


Validating: 100%|██████████| 54/54 [00:03<00:00, 17.99it/s]


Validation | Loss: 5.3688 | MaskedAcc: 0.3167
✅ Saved new best model (val_acc=0.3167)


MLM Epoch 10/16: 100%|██████████| 11234/11234 [27:40<00:00,  6.77it/s]


lr= 1.9176757101238165e-05
Train | Epoch 10/16 | Loss: 5.5047 | MaskedAcc: 0.3163 | GradNorm: 15.197


Validating: 100%|██████████| 54/54 [00:02<00:00, 18.10it/s]


Validation | Loss: 5.3109 | MaskedAcc: 0.3190
✅ Saved new best model (val_acc=0.3190)


MLM Epoch 11/16: 100%|██████████| 11234/11234 [27:37<00:00,  6.78it/s]


lr= 1.5980630917698473e-05
Train | Epoch 11/16 | Loss: 5.4697 | MaskedAcc: 0.3178 | GradNorm: 15.197


Validating: 100%|██████████| 54/54 [00:02<00:00, 18.21it/s]


Validation | Loss: 5.2691 | MaskedAcc: 0.3208
✅ Saved new best model (val_acc=0.3208)


MLM Epoch 12/16: 100%|██████████| 11234/11234 [27:41<00:00,  6.76it/s]


lr= 1.2784504734158779e-05
Train | Epoch 12/16 | Loss: 5.4487 | MaskedAcc: 0.3178 | GradNorm: 15.088


Validating: 100%|██████████| 54/54 [00:02<00:00, 18.04it/s]


Validation | Loss: 5.2397 | MaskedAcc: 0.3203


MLM Epoch 13/16: 100%|██████████| 11234/11234 [27:43<00:00,  6.75it/s]


lr= 9.588378550619083e-06
Train | Epoch 13/16 | Loss: 5.4266 | MaskedAcc: 0.3186 | GradNorm: 15.163


Validating: 100%|██████████| 54/54 [00:02<00:00, 18.12it/s]


Validation | Loss: 5.2078 | MaskedAcc: 0.3218
✅ Saved new best model (val_acc=0.3218)


MLM Epoch 14/16: 100%|██████████| 11234/11234 [27:43<00:00,  6.75it/s]


lr= 6.392252367079389e-06
Train | Epoch 14/16 | Loss: 5.4059 | MaskedAcc: 0.3190 | GradNorm: 15.414


Validating: 100%|██████████| 54/54 [00:03<00:00, 17.98it/s]


Validation | Loss: 5.1882 | MaskedAcc: 0.3252
✅ Saved new best model (val_acc=0.3252)


MLM Epoch 15/16: 100%|██████████| 11234/11234 [27:38<00:00,  6.77it/s]


lr= 3.1961261835396947e-06
Train | Epoch 15/16 | Loss: 5.3908 | MaskedAcc: 0.3193 | GradNorm: 15.288


Validating: 100%|██████████| 54/54 [00:03<00:00, 17.92it/s]


Validation | Loss: 5.1692 | MaskedAcc: 0.3251


MLM Epoch 16/16: 100%|██████████| 11234/11234 [27:38<00:00,  6.77it/s]


lr= 0.0
Train | Epoch 16/16 | Loss: 5.3763 | MaskedAcc: 0.3200 | GradNorm: 15.589


Validating: 100%|██████████| 54/54 [00:02<00:00, 18.19it/s]

Validation | Loss: 5.1578 | MaskedAcc: 0.3232

Training finished. Best val_acc: 0.3252, Final_loss 5.157759339721115





In [None]:
with open('anek_2ch_vocab_20000.pkl', 'rb') as file:
    vocab_small = dill.load(file)
# if '<MASK>' not in vocab_small:
#     vocab_small['<MASK>'] = len(vocab_small)
with open('xtr_with_2ch.pkl', 'rb') as file:
    xtr_with_2ch = dill.load(file)
lengths = [len(sent) for sent in xtr_with_2ch]
quantile99 = int(pd.DataFrame(lengths).quantile(0.99))
model_class_mix2 = Transformer_mix(len(vocab_small), q99=quantile99, path_to_vocab='anek_2ch_vocab_20000.pkl')
# embed = model_class_mix2.tokens.embedding_dim
# model_class_mix2.fc_mlm = nn.Linear(embed, len(vocab_small))
# аккуратно копируем веса (без CLS-вектора)
# with torch.no_grad():
#     model_class_mix2.fc_mlm.weight.copy_(model_class_mix2.tokens.weight[:-1]) # копируем веса эмбеддингов словаря из этапа "чтения модели" в веса этапа "угадывания токенов", потому что это действие обратное пониманию
# Загружаем предобученные веса
# Загрузить state_dict
state = torch.load("weights_mlm_1024.pt", map_location="cpu")

# Удалить последний (<MASK>) вектор из весов, потому что эта голова была временной для обучения MLM, для классификации она не нужна и в оригинальном классе ее нет
for key in ["tokens.weight", "fc_mlm.weight", "fc_mlm.bias"]:
    if key in state:
        if key.endswith(".weight"):
            state[key] = state[key][:-1, :]
        elif key.endswith(".bias"):
            state[key] = state[key][:-1]
model_class_mix2.load_state_dict(state, strict=False)
print("✅ Загружены предобученные MLM-веса")
# Замораживаем эмбеддинги
for p in model_class_mix2.tokens.parameters():
    p.requires_grad = False
for p in model_class_mix2.poses.parameters():
    p.requires_grad = False
# Размораживаем классификатор fc
for p in model_class_mix2.fc.parameters():
    p.requires_grad = True
# Размораживаем последние 3 слоя энкодера
for layer in model_class_mix2.transes[-3:]:
    for p in layer.parameters():
        p.requires_grad = True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loss_fn_mix = nn.CrossEntropyLoss()
optimizer_mix = torch.optim.Adam(model_class_mix2.parameters(), lr=1e-5, weight_decay=1e-4) #L2 регуляризация
scheduler_mix = ReduceLROnPlateau(optimizer_mix, mode='min', factor=0.5, patience=1)
epochs = 50
losses_mix, f1s_mix, accs_mix = train(model_class_mix2, 'mlm_classification_20000_1024', optimizer_mix, loss_fn_mix, train_loader3, test_loader3, scheduler_mix, epochs, device)

  quantile99 = int(pd.DataFrame(lengths).quantile(0.99))


✅ Загружены предобученные MLM-веса
✅ Saved new best model to weights_mlm_classification_20000_1024.pt
Epoch 1/50 | loss=1.8464, f1=0.4096, acc=0.4459
✅ Saved new best model to weights_mlm_classification_20000_1024.pt
Epoch 2/50 | loss=1.5415, f1=0.5141, acc=0.5359
✅ Saved new best model to weights_mlm_classification_20000_1024.pt
Epoch 3/50 | loss=1.4523, f1=0.5501, acc=0.5688
✅ Saved new best model to weights_mlm_classification_20000_1024.pt
Epoch 4/50 | loss=1.3913, f1=0.5789, acc=0.5976
✅ Saved new best model to weights_mlm_classification_20000_1024.pt
Epoch 5/50 | loss=1.3659, f1=0.5909, acc=0.6088
✅ Saved new best model to weights_mlm_classification_20000_1024.pt
Epoch 6/50 | loss=1.3684, f1=0.5971, acc=0.6141
✅ Saved new best model to weights_mlm_classification_20000_1024.pt
Epoch 7/50 | loss=1.3841, f1=0.6032, acc=0.6194
Epoch 8/50 | loss=1.4031, f1=0.5987, acc=0.6118
Epoch 9/50 | loss=1.4215, f1=0.6001, acc=0.6106
Epoch 10/50 | loss=1.4373, f1=0.5934, acc=0.6041
Epoch 11/50 | l

In [None]:
import torch
import torch.nn as nn
import dill
import pandas as pd

# Загружаем словарь
with open('anek_2ch_vocab_20000.pkl', 'rb') as file:
    vocab_small_mlm = dill.load(file)
if '<MASK>' not in vocab_small_mlm:
    vocab_small_mlm['<MASK>'] = len(vocab_small_mlm)

lengths = [len(sent) for sent in dill.load(open('xtr_with_2ch.pkl', 'rb'))]
q99 = int(pd.DataFrame(lengths).quantile(0.99))

model_mlm = Transformer_mix(
    vocab_size=len(vocab_small_mlm),
    embed=1024,
    hidden=2048,
    num_heads=16,
    num_layers=8,
    q99=q99,
    path_to_vocab='anek_2ch_vocab_20000.pkl'
)

# Добавляем голову MLM 
embed_dim = model_mlm.tokens.embedding_dim
model_mlm.fc_mlm = nn.Sequential(
    nn.Dropout(0.1),
    nn.Linear(embed_dim, len(vocab_small_mlm))
)

# Загружаем веса MLM
state = torch.load("weights_mlm_1024.pt", map_location="cpu")
model_mlm.load_state_dict(state, strict=False)
model_mlm.eval()

import re

def inf_preprocess(text):
    # заменяем спецтокены временными плейсхолдерами
    text = re.sub(r"<MASK>", " MASKTOKEN ", text)
    text = re.sub(r"<SEP>", " SEPTOKEN ", text)

    tokens = word_tokenize(text, language="russian")
    sep_tokens = {".", "?", "!", "..."}
    lemmas = []

    for token in tokens:
        # возвращаем спецтокены обратно
        if token == "MASKTOKEN":
            lemmas.append("<MASK>")
        elif token == "SEPTOKEN":
            lemmas.append("<SEP>")
        elif token.isalpha() and token not in stop_words:
            lemmas.append(morph.parse(token)[0].normal_form)

    return " ".join(lemmas)

# Функция инференса
def mlm_infer(model, input_text, vocab, device, top_k=5):
    model.eval()
    with torch.no_grad():
        text = inf_preprocess(input_text)
        tokens = text.split()
        ids = [vocab.get(tok, vocab["<UNK>"]) for tok in tokens]
        x = torch.tensor(ids).unsqueeze(0).to(device)
        mask = (x != vocab["<PAD>"]).long()

        hiddens = model(x, mask, return_hidden=True)
        logits = model.fc_mlm(hiddens)
        probs = torch.softmax(logits, dim=-1)

        inv_vocab = {v: k for k, v in vocab.items()}
        restored = []
        candidates = []
        for i, token in enumerate(tokens):
            if token == "<MASK>":
                top_preds = torch.topk(probs[0, i], k=top_k)
                candidates = [inv_vocab[idx.item()] for idx in top_preds.indices if idx != vocab['<UNK>']]
                restored.append(f"[{', '.join(candidates)}]")
            elif token == "<SEP>":  
                restored.append('.')
            else:
                restored.append(token)
        preprocessed_answer = " ".join(restored)
        return re.sub(r"<MASK>", str(candidates), input_text)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_mlm = model_mlm.to(device)

  q99 = int(pd.DataFrame(lengths).quantile(0.99))


In [7]:
text = "Я сегодня отдохнул хорошо. Я завтра буду работать весь <MASK> чтобы закончить проект"
text2 = "Прошлый <MASK> был ужасен, я ничего не добился с января по декабрь"
text3 = 'Какой хороший <MASK>. Я заценил и поставил высокую <MASK>'
text4 = 'на горе стоит статуя у статуи нету <MASK>'
text5 = 'Я смотрел новый <MASK> в кинотеатре.'
text6 = 'Сегодня по телевизору опять покажут новый <MASK>'
print(mlm_infer(model_mlm, text, vocab_small_mlm, device))

Я сегодня отдохнул хорошо. Я завтра буду работать весь ['день', 'быть', 'не', 'мир'] чтобы закончить проект


In [40]:
model_classification = Transformer_mix(len(vocab_small), path_to_vocab='anek_2ch_vocab_20000.pkl')
model_classification.load_state_dict(torch.load('weights_mlm_classification_20000_1024.pt'))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loss_fn_mix = nn.CrossEntropyLoss()
model_classification = model_classification.to(device)
classes = {0: 'aforizmi',
            1: 'meditsinskie',
            2: 'narodnie',
            3: 'poshlie-i-intimnie',
            4: 'pro-alkogolikov',
            5: 'pro-armiu',
            6: 'pro-detey',
            7: 'pro-evreev',
            8: 'pro-militsiyu',
            9: 'pro-mugchin',
            10: 'pro-novih-russkih',
            11: 'pro-semyu',
            12: 'pro-studentov',
            13: 'pro-vovochku',
            14: 'raznie',
            15: 'shkolnie-i-pro-shkolu',
            16: 'tsitati'}
anek = 'Черчилль спрашивает Сталина что бы вы с вашей армией сделали с Гитлером, будь он у вас в руках? Сталин отвечает раскалил бы кочергу докрасна и засунул бы холодным концом ему в задницу. -А почему холодным, товарищ Сталин? -Чтобы вы, господин Черчилль, не помогли ему ее вытащить'
anek2 = 'Приходит пациент к доктору и говорит: доктор, хуй чешется. Доктор отвечает: мой чаще. -Нет, мой'
anek3 = "— Вовочка, ты почему не сделал домашнее задание? — А я в шахматы играл с папой."
model_classification.predict(anek3, device)

tsitati


16

In [11]:
df = pd.read_csv('m18_jokes_dataset.csv')
data = df['text'].sample(1000)
preds_mix = [model_classification.predict(text, device="cuda") for text in data]
d = dict(zip(data, [classes[pred] for pred in preds_mix]))
dd = pd.DataFrame(list(d.items()), columns=["текст", "тема"])

aforizmi
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
pro-alkogolikov
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
pro-detey
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
pro-militsiyu
tsitati
tsitati
tsitati
tsitati
pro-detey
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
pro-alkogolikov
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
shkolnie-i-pro-shkolu
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
aforizmi
tsitati
shkolnie-i-pro-shkolu
tsitati
tsitati
narodnie
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati
tsitati

Модель обучилась на разговорном языке с форума 2ch и теперь склонна выбирать анекдотам цитаты, потому что жанр этого языка похож на 2ch.
Так что нужно обучить еще пару эпох с заниженным весом для tsitati

In [None]:
mask = ytr != 16 # "tsitati"
xtr_sub, ytr_sub = xtr[mask], ytr[mask]
train_loader_balanced, _ = tensoring(xtr_sub, xte, ytr_sub, yte, vocab_small) #собрали датасет без цитатных анекдотов, модель сдвинется от склонности присваивать цитаты
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Разморозка нужных частей
for p in model_classification.tokens.parameters():
    p.requires_grad = False
for p in model_classification.poses.parameters():
    p.requires_grad = False
for p in model_classification.fc.parameters():
    p.requires_grad = True
for layer in model_classification.transes[-3:]:
    for p in layer.parameters():
        p.requires_grad = True

# Веса классов только по новым классам
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(ytr_sub),  
    y=ytr_sub
)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device) # у нас 16 новых сбалансированных весов
class_weights = torch.cat([class_weights, torch.tensor([0.0], device=device)]) # Поэтому добавим нулевой вес цитатам, иначе не совпадет размерность, классов то 17 изначально

# Оптимизатор и loss
loss_fn_mix = nn.CrossEntropyLoss(weight=class_weights)
optimizer_mix = torch.optim.Adam(model_classification.parameters(), lr=1e-5, weight_decay=1e-4)
scheduler_mix = ReduceLROnPlateau(optimizer_mix, mode='min', factor=0.5, patience=1)

# Обучение
epochs = 5
losses_mix, f1s_mix, accs_mix = train(
    model_classification,
    'mlm_classification_20000_1024_no_tsitati',
    optimizer_mix,
    loss_fn_mix,
    train_loader_balanced,  # новый loader
    test_loader3,
    scheduler_mix,
    epochs,
    device
)

  q99 = int(pd.DataFrame(length_tr).quantile(0.99))


q99 длины = 120
✅ Saved new best model to weights_mlm_classification_20000_1024_no_tsitati.pt
Epoch 1/5 | loss=1.3638, f1=0.5669, acc=0.5847
✅ Saved new best model to weights_mlm_classification_20000_1024_no_tsitati.pt
Epoch 2/5 | loss=1.4086, f1=0.5653, acc=0.5859
Epoch 3/5 | loss=1.4334, f1=0.5623, acc=0.5841
Epoch 4/5 | loss=1.4817, f1=0.5581, acc=0.5806
Epoch 5/5 | loss=1.5069, f1=0.5594, acc=0.5806
best_epoch 2 with acc 0.5858823529411765


In [None]:
model_classification.load_state_dict(torch.load('weights_mlm_classification_20000_1024_no_tsitati.pt'))
df = pd.read_csv('m18_jokes_dataset.csv')
data = df['text'].sample(1000)
preds_mix = [model_classification.predict(text, device="cuda") for text in data]
d = dict(zip(data, [classes[pred] for pred in preds_mix]))
dd = pd.DataFrame(list(d.items()), columns=["текст", "тема"])
# склонность присваивать цитаты ушла

aforizmi
shkolnie-i-pro-shkolu
shkolnie-i-pro-shkolu
raznie
narodnie
pro-alkogolikov
pro-alkogolikov
pro-vovochku
raznie
narodnie
raznie
pro-alkogolikov
raznie
narodnie
aforizmi
raznie
pro-alkogolikov
raznie
pro-alkogolikov
raznie
pro-militsiyu
narodnie
raznie
shkolnie-i-pro-shkolu
poshlie-i-intimnie
aforizmi
raznie
raznie
pro-militsiyu
pro-studentov
raznie
pro-detey
narodnie
pro-mugchin
pro-alkogolikov
aforizmi
raznie
pro-alkogolikov
aforizmi
shkolnie-i-pro-shkolu
raznie
raznie
pro-armiu
aforizmi
pro-alkogolikov
raznie
raznie
shkolnie-i-pro-shkolu
raznie
pro-studentov
narodnie
pro-militsiyu
pro-detey
shkolnie-i-pro-shkolu
raznie
pro-alkogolikov
pro-alkogolikov
pro-studentov
raznie
narodnie
pro-alkogolikov
pro-alkogolikov
aforizmi
pro-studentov
narodnie
pro-militsiyu
raznie
pro-alkogolikov
poshlie-i-intimnie
pro-alkogolikov
pro-alkogolikov
narodnie
narodnie
pro-alkogolikov
aforizmi
pro-alkogolikov
pro-alkogolikov
narodnie
raznie
raznie
pro-militsiyu
pro-militsiyu
poshlie-i-intimnie
shk

In [43]:
dd['тема'].value_counts()

тема
raznie                   259
pro-alkogolikov          231
narodnie                 135
pro-militsiyu             84
aforizmi                  83
shkolnie-i-pro-shkolu     66
poshlie-i-intimnie        53
pro-studentov             36
pro-detey                 25
pro-armiu                 13
meditsinskie               5
pro-vovochku               4
tsitati                    3
pro-mugchin                2
pro-evreev                 1
Name: count, dtype: int64

In [None]:
data = ['Черчилль спрашивает Сталина что бы вы с вашими солдатами сделали с Гитлером, будь он у вас в руках? Сталин отвечает раскалил бы кочергу докрасна и засунул бы холодным концом ему в задницу. -А почему холодным, товарищ Сталин? -Чтобы вы, господин Черчилль, не помогли ему ее вытащить',
        'Приходит пациент к доктору и говорит: доктор, хуй чешется. Доктор отвечает: мой чаще. -Нет, мой',
        "— Вовочка, ты почему не сделал школьное домашнее задание? — А я в шахматы играл с папой."]
[model_classification.predict(text, device="cuda") for text in data]

narodnie
meditsinskie
pro-alkogolikov


[2, 1, 4]

In [7]:
import torch
import dill
import pandas as pd
import re
from nltk.tokenize import word_tokenize

class CustomTokenizer:
    def __init__(self, vocab_path):
        self.vocab = dill.load(open(vocab_path, "rb"))
        self.pad_token = "<PAD>"
        self.unk_token = "<UNK>"
        self.sep_token = "<SEP>"
        self.cls_token = "<CLS>"
        self.pad_token_id = self.vocab[self.pad_token]
        self.unk_token_id = self.vocab[self.unk_token]
        self.sep_token_id = self.vocab[self.sep_token]
        self.cls_token_id = len(self.vocab)

    def preprocess(self, text):
        import nltk
        import pymorphy3
        from nltk.corpus import stopwords
        nltk.download("stopwords")
        morph = pymorphy3.MorphAnalyzer()
        stop_words = set(stopwords.words("russian")) - {'не', 'ну', 'вот'}
        tokens = word_tokenize(text, language="russian")
        sep_tokens = {".", "?", "!", "..."}
        lemmas = []
        for token in tokens:
            if token in sep_tokens:
                lemmas.append("<SEP>")
            elif token.isalpha() and token not in stop_words:
                lemmas.append(morph.parse(token)[0].normal_form)
        return " ".join(lemmas)

    def __call__(self, texts, max_length=None, padding=True, truncation=True, return_tensors="pt"):
        if isinstance(texts, pd.DataFrame):
            texts = texts.iloc[:, 0].tolist()
        elif isinstance(texts, pd.Series):
            texts = texts.tolist()
        elif not isinstance(texts, list):
            raise TypeError("На вход должен подаваться list, Series или DataFrame с одним столбцом")

        processed = [self.preprocess(t) for t in texts]
        lengths = [len(s.split()) for s in processed]
        quantile99 = int(pd.DataFrame(lengths).quantile(0.99))
        max_length = max_length or quantile99

        token_ids = []
        attn_masks = []

        for text in processed:
            ids = [self.vocab.get(tok, self.unk_token_id) for tok in text.split()]
            if truncation and len(ids) > max_length:
                ids = ids[:max_length]
            if padding:
                pad_len = max_length - len(ids)
                ids = ids + [self.pad_token_id] * pad_len
            mask = [1 if i != self.pad_token_id else 0 for i in ids]
            token_ids.append(ids)
            attn_masks.append(mask)

        return {
            "input_ids": torch.tensor(token_ids),
            "attention_mask": torch.tensor(attn_masks)
        }

In [34]:
import torch
import dill
import pandas as pd
import re
from nltk.tokenize import word_tokenize

class CustomTokenizer_MLM:
    def __init__(self, vocab_path):
        self.vocab = dill.load(open(vocab_path, "rb"))
        self.vocab['<MASK>'] = len(self.vocab)
        self.pad_token = "<PAD>"
        self.unk_token = "<UNK>"
        self.mask_token = "<MASK>"
        self.sep_token = "<SEP>"
        self.cls_token = "<CLS>"
        self.pad_token_id = self.vocab[self.pad_token]
        self.unk_token_id = self.vocab[self.unk_token]
        self.mask_token_id = self.vocab[self.mask_token]
        self.sep_token_id = self.vocab[self.sep_token]
        self.cls_token_id = len(self.vocab)

    def preprocess(self, text):
        import nltk
        import pymorphy3
        from nltk.corpus import stopwords
        nltk.download("stopwords")
        morph = pymorphy3.MorphAnalyzer()
        stop_words = set(stopwords.words("russian")) - {'не', 'ну', 'вот'}
        text = re.sub(r"<MASK>", " MASKTOKEN ", text)
        text = re.sub(r"<SEP>", " SEPTOKEN ", text)
        tokens = word_tokenize(text, language="russian")
        sep_tokens = {".", "?", "!", "..."}
        lemmas = []
        for token in tokens:
            if token == "MASKTOKEN":
                lemmas.append("<MASK>")
            elif token in sep_tokens or token == "SEPTOKEN":
                lemmas.append("<SEP>")
            elif token.isalpha() and token not in stop_words:
                lemmas.append(morph.parse(token)[0].normal_form)
        return " ".join(lemmas)

    def __call__(self, texts, max_length=None, padding=True, truncation=True, return_tensors="pt"):
        if isinstance(texts, pd.DataFrame):
            texts = texts.iloc[:, 0].tolist()
        elif isinstance(texts, pd.Series):
            texts = texts.tolist()
        elif not isinstance(texts, list):
            raise TypeError("На вход должен подаваться list, Series или DataFrame с одним столбцом")

        processed = [self.preprocess(t) for t in texts]
        lengths = [len(s.split()) for s in processed]
        quantile99 = int(pd.DataFrame(lengths).quantile(0.99))
        max_length = max_length or quantile99

        token_ids = []
        attn_masks = []

        for text in processed:
            ids = [self.vocab.get(tok, self.unk_token_id) for tok in text.split()]
            if truncation and len(ids) > max_length:
                ids = ids[:max_length]
            if padding:
                pad_len = max_length - len(ids)
                ids = ids + [self.pad_token_id] * pad_len
            mask = [1 if i != self.pad_token_id else 0 for i in ids]
            token_ids.append(ids)
            attn_masks.append(mask)

        return {
            "input_ids": torch.tensor(token_ids),
            "attention_mask": torch.tensor(attn_masks)
        }

In [36]:
tokenizer = CustomTokenizer("anek_2ch_vocab_20000.pkl")
batch = tokenizer(["Я сегодня спал весь день.", "Хороший день!"])
print(batch["input_ids"], batch["attention_mask"])

tensor([[  18,  318, 3340,   26,   62],
        [  47,   62,    2,    0,    0]]) tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 0, 0]])


[nltk_data] Downloading package stopwords to C:\Users\wa-
[nltk_data]     Arakelyan_P\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\wa-
[nltk_data]     Arakelyan_P\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  quantile99 = int(pd.DataFrame(lengths).quantile(0.99))
