In [1]:
import re
import math
import random
from collections import Counter
from tqdm.auto import tqdm
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device  # bare expression: the notebook displays the selected device

device(type='cuda')

In [7]:
# Load the review CSV from Google Drive.
# NOTE(review): the path lives under /content/drive, so the cell that calls
# drive.mount has to be executed before this one on a fresh kernel.
df = pd.read_csv(
    "/content/drive/MyDrive/colab/Video_Games_5_part0.csv",
    on_bad_lines="skip"  # malformed CSV rows are dropped instead of raising
)
# Keep only the rows that have both a review text and a rating.
df = df.dropna(subset=["reviewText", "overall"])

# Number of reviews per rating value, printed to show the class imbalance.
class_counts = df['overall'].value_counts()
print(f"Распределение классов:\n{class_counts}")

Распределение классов:
overall
5.0    284435
4.0     88407
3.0     46546
1.0     29138
2.0     22830
Name: count, dtype: int64


In [6]:
# Mount Google Drive at /content/drive.
# NOTE(review): this import belongs with the other imports at the top, and the
# mount must run before any cell that reads from /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Features are the raw review strings, targets are the integer ratings.
X = df["reviewText"].astype(str)
y = df["overall"].astype(int)

# Step 1: hold out 15% of all rows as the test set (stratified by rating).
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=0.15,
    random_state=42,
    stratify=y
)

# Step 2: split the remaining 85% so that validation is 0.15 / 0.85 of it,
# i.e. 15% of the full data; the rest (70% of the full data) is the train set.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.15 / 0.85,
    random_state=42,
    stratify=y_temp
)
print("Done")

Done


Я скачал предобученные векторы fastText (в формате word2vec) и оставил векторы только для тех слов, которые встречаются в train.

In [None]:
# Cache the filtered embedding table together with its dimensionality.
# NOTE(review): `word2vec` and `dim` are not defined by any earlier cell visible
# in this notebook, so this cell fails on a fresh kernel (Restart & Run All).
# NOTE(review): the file is written to /content/ft_subset.pkl, while a later
# cell loads /content/drive/MyDrive/colab/ft_subset.pkl - presumably the file
# was copied to Drive by hand; confirm.
import pickle
with open("/content/ft_subset.pkl", "wb") as f:
    pickle.dump((word2vec, dim), f, protocol=pickle.HIGHEST_PROTOCOL)

2. Word2Vec + LSTM (с кастомным лоссом)

$$
\textbf{Идея 1 (учёт упорядоченности классов через soft-label).}\quad
q_{y,i}=\frac{\exp(-\tau\,|i-y|)}{\sum\limits_{j=1}^{5}\exp(-\tau\,|j-y|)},\qquad
L_1(x,y)=\sum_{i=1}^{5} q_{y,i}\,(-\log p_i).
$$

$$
\textbf{Идея 2 (учёт несбалансированности классов).}\quad
n_k=\#\{(x_j,y_j)\ \text{в train}:\ y_j=k\},\qquad
\tilde w_k=\frac{n}{n_k},\qquad
w_k=\frac{\tilde w_k}{\frac{1}{5}\sum\limits_{r=1}^{5}\tilde w_r},\qquad
L(x,y)= w_y \cdot L_1(x,y)=w_y\sum_{i=1}^{5} q_{y,i}\,(-\log p_i).
$$


In [9]:
import pickle

# Load the cached pair (word -> vector lookup, vector dimensionality).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that you produced yourself.
with open("/content/drive/MyDrive/colab/ft_subset.pkl", "rb") as f:
    word2vec, dim = pickle.load(f)
print(dim)

300


embedding hits: 98649 / 166416


Для слишком большого числа слов нет эмбеддинга, поэтому я добавлю CNN по буквам: на вход LSTM будет подаваться вектор, полученный конкатенацией эмбеддинга слова и выхода char-CNN.

Токенизация

In [10]:
def simple_word_tokenize(text: str):
    """Lower-case ``text`` and split it into tokens made of ``a-z``, ``0-9`` and ``'``.

    Every run of any other character is treated as a separator.

    Returns:
        A list of token strings; empty when ``text`` has no allowed character.
    """
    cleaned = re.sub(r"[^a-z0-9']+", " ", text.lower())
    return cleaned.split()

PAD = "<pad>"  # padding token, always index 0
UNK = "<unk>"  # out-of-vocabulary token, always index 1

def build_word_vocab(texts, min_freq=2, max_vocab=200000):
    """Build a word vocabulary from an iterable of raw texts.

    Args:
        texts: iterable of strings; each one is split with ``simple_word_tokenize``.
        min_freq: tokens seen fewer than this many times are left out.
        max_vocab: at most this many tokens are kept (besides PAD and UNK).

    Returns:
        ``(stoi, itos, counts)``: token -> index, index -> token, and the
        ``Counter`` with the frequency of every token that was seen.
        Indices follow descending frequency; ties keep first-seen order.
    """
    counts = Counter()
    for text in texts:
        counts.update(simple_word_tokenize(text))

    # most_common() is a stable sort by descending count, so filtering it
    # afterwards gives the same order as filtering first and sorting second.
    frequent = [tok for tok, n in counts.most_common() if n >= min_freq]

    stoi = {PAD: 0, UNK: 1}
    for tok in frequent[:max_vocab]:
        # setdefault never overwrites the reserved special tokens
        stoi.setdefault(tok, len(stoi))

    # indices are 0..len-1 in insertion order, so enumerate() inverts the map
    itos = dict(enumerate(stoi))
    return stoi, itos, counts

CPAD = "<cpad>"  # character padding symbol, always index 0
CUNK = "<cunk>"  # unknown character symbol, always index 1

def build_char_vocab():
    """Build the fixed character vocabulary: a-z, 0-9 and the apostrophe.

    Returns:
        ``(stoi, itos)``: symbol -> index and index -> symbol. CPAD is 0,
        CUNK is 1, and the characters follow in alphabet order from index 2.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789'"
    stoi = {CPAD: 0, CUNK: 1}
    first_free = len(stoi)
    stoi.update((ch, idx) for idx, ch in enumerate(alphabet, start=first_free))
    itos = {idx: sym for sym, idx in stoi.items()}
    return stoi, itos

# The word vocabulary is built from the training split only.
word_stoi, word_itos, word_freq = build_word_vocab(X_train.tolist(), min_freq=2, max_vocab=200000)
char_stoi, char_itos = build_char_vocab()

# Displayed by the notebook: (word vocabulary size, character vocabulary size).
len(word_stoi), len(char_stoi)

(91016, 39)

Матрица эмбеддингов

In [11]:
def get_vec(word):
    """Look up ``word`` in the module-level ``word2vec`` table.

    Returns:
        The stored vector as a ``float32`` NumPy array, or ``None`` when the
        word is absent or the lookup/conversion raises ``TypeError``
        (for example, when ``word`` is unhashable).
    """
    try:
        if word not in word2vec:
            return None
        return np.asarray(word2vec[word], dtype=np.float32)
    except TypeError:
        # best-effort lookup: a key the table cannot handle counts as "missing"
        return None

emb_dim = dim

# Rows that get no pretrained vector (including <unk>) keep small Gaussian noise.
# The noise comes from a dedicated, seeded generator instead of the global
# NumPy state: the embedding layer is frozen later, so these random rows are
# never trained and directly affect the results - they must be reproducible.
embedding_matrix = (
    np.random.default_rng(42)
    .normal(0, 0.05, size=(len(word_stoi), emb_dim))
    .astype(np.float32)
)
embedding_matrix[word_stoi[PAD]] = 0.0  # the padding row is all zeros

# Overwrite the random rows with pretrained vectors wherever one exists.
hits = 0
for w, idx in word_stoi.items():
    if w in (PAD, UNK):
        continue  # special tokens never have a pretrained vector
    v = get_vec(w)
    # the shape check guards against vectors of an unexpected dimensionality
    if v is not None and v.shape[0] == emb_dim:
        embedding_matrix[idx] = v
        hits += 1

# Displayed by the notebook: (words with a pretrained vector, vocabulary size).
hits, len(word_stoi)


(66436, 91016)

Паддинг слов и символов

In [12]:
class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes a review on access and encodes it as ids.

    Each item is ``(word_ids, char_ids, label)``:
      * ``word_ids`` - one vocabulary index per token;
      * ``char_ids`` - one list of character indices per token;
      * ``label``    - the rating shifted from 1..5 to 0..4.
    """

    def __init__(self, texts, labels, word_stoi, char_stoi):
        self.word_stoi = word_stoi
        self.char_stoi = char_stoi
        # Materialize both inputs as plain lists so __getitem__ indexes by position.
        self.texts = [*texts]
        self.labels = [*labels]

    def __len__(self):
        return len(self.texts)

    def encode_word(self, w):
        """Return the vocabulary index of ``w`` (the UNK index when unknown)."""
        unk_id = self.word_stoi[UNK]
        return self.word_stoi.get(w, unk_id)

    def encode_chars(self, w):
        """Return the character indices of ``w`` (CUNK index for unknown chars)."""
        unk_id = self.char_stoi[CUNK]
        return [self.char_stoi.get(ch, unk_id) for ch in w]

    def __getitem__(self, i):
        # A review without any token is represented by a single UNK token.
        # NOTE(review): that placeholder is char-encoded as the literal
        # characters of UNK, not as a single CUNK - confirm this is intended.
        tokens = simple_word_tokenize(self.texts[i]) or [UNK]
        label = int(self.labels[i]) - 1  # 1..5 -> 0..4
        return (
            list(map(self.encode_word, tokens)),
            list(map(self.encode_chars, tokens)),
            label,
        )

def make_collate_fn(max_tokens=200, max_char_len=16):
    """Create a ``collate_fn`` that pads a batch of dataset items.

    Args:
        max_tokens: every review is truncated to this many tokens.
        max_char_len: every token is truncated/padded to this many characters.

    Returns:
        A function ``collate(batch)`` where ``batch`` is a list of
        ``(word_ids, char_ids, label)`` items. It returns four long tensors:
          * ``xw``      (B, T)               word ids, padded with ``word_stoi[PAD]``;
          * ``xc``      (B, T, max_char_len) char ids, padded with ``char_stoi[CPAD]``;
          * ``lengths`` (B,)                 token count of each review after truncation;
          * ``ys``      (B,)                 labels,
        with ``T`` the longest (truncated) review in the batch.

    The padded batch is assembled as nested Python lists and converted to a
    tensor once, rather than allocating one tiny tensor per token.
    """
    def collate(batch):
        ys = torch.tensor([b[2] for b in batch], dtype=torch.long)

        word_seqs = [b[0][:max_tokens] for b in batch]
        char_seqs = [b[1][:max_tokens] for b in batch]

        seq_lens = [len(s) for s in word_seqs]
        lengths = torch.tensor(seq_lens, dtype=torch.long)
        T = max(seq_lens, default=0)

        pad_id = word_stoi[PAD]
        cpad_id = char_stoi[CPAD]
        # Row used for padded token positions; torch.tensor copies the data,
        # so sharing this one list object between positions is safe.
        blank_token = [cpad_id] * max_char_len

        xw_rows, xc_rows = [], []
        for ws, cs in zip(word_seqs, char_seqs):
            xw_rows.append(list(ws) + [pad_id] * (T - len(ws)))

            token_rows = []
            for ch_list in cs:
                chars = list(ch_list[:max_char_len])
                chars.extend([cpad_id] * (max_char_len - len(chars)))
                token_rows.append(chars)
            token_rows.extend(blank_token for _ in range(T - len(token_rows)))
            xc_rows.append(token_rows)

        # reshape() pins the exact shapes when a dimension is 0 (empty batch,
        # T == 0 or max_char_len == 0), where nested lists lose trailing dims.
        xw = torch.tensor(xw_rows, dtype=torch.long).reshape(len(batch), T)
        xc = torch.tensor(xc_rows, dtype=torch.long).reshape(len(batch), T, max_char_len)

        return xw, xc, lengths, ys
    return collate

# One dataset per split; all three share the vocabularies built from train.
train_ds = ReviewDataset(X_train, y_train, word_stoi, char_stoi)
test_ds  = ReviewDataset(X_test, y_test, word_stoi, char_stoi)
val_ds = ReviewDataset(X_val, y_val, word_stoi, char_stoi)

# Reviews are cut to 200 tokens and tokens to 16 characters when batching.
collate_fn = make_collate_fn(max_tokens=200, max_char_len=16)

# NOTE(review): this is a plain variable that is never passed to any DataLoader
# below, so it has no effect.
persistent_workers=True

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, num_workers=0, collate_fn=collate_fn, pin_memory=True)
test_loader  = DataLoader(test_ds, batch_size=128, shuffle=False, num_workers=0, collate_fn=collate_fn, pin_memory=True)
val_loader = DataLoader(val_ds, batch_size=128, shuffle=False, num_workers=0, collate_fn=collate_fn, pin_memory=True)


char-CNN

In [13]:
class CharCNN(nn.Module):
    """Character-level encoder: embed chars, 1-D convolve, max-pool over positions.

    Input:  ``xc`` of shape (B, T, W) with character indices.
    Output: tensor of shape (B, T, out_dim), one feature vector per token.
    """

    def __init__(self, char_vocab_size, char_emb_dim=32, out_dim=64, kernel_size=3, dropout=0.1):
        super().__init__()
        self.char_emb = nn.Embedding(
            char_vocab_size,
            char_emb_dim,
            padding_idx=char_stoi[CPAD],
        )
        # padding = kernel_size // 2 keeps the width W unchanged for odd kernels
        self.conv = nn.Conv1d(
            in_channels=char_emb_dim,
            out_channels=out_dim,
            kernel_size=kernel_size,
            padding=kernel_size // 2,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, xc):
        n_batch, n_tokens, n_chars = xc.shape

        embedded = self.char_emb(xc)                                   # (B, T, W, E)
        # fold batch and token axes together: every token is one conv sample
        per_token = embedded.view(n_batch * n_tokens, n_chars, -1)     # (B*T, W, E)
        activated = F.relu(self.conv(per_token.transpose(1, 2)))       # (B*T, out_dim, W)
        pooled, _ = activated.max(dim=2)                               # (B*T, out_dim)

        return self.dropout(pooled).view(n_batch, n_tokens, -1)        # (B, T, out_dim)

Word2Vec + char-CNN -> BiLSTM

In [14]:
class W2VCharBiLSTM(nn.Module):
    """Review classifier: frozen word vectors + char-CNN features -> BiLSTM -> linear.

    Args:
        embedding_matrix: NumPy array of shape (V, D) copied into the word embedding.
        char_vocab_size: number of symbols in the character vocabulary.
        char_emb_dim: size of the character embeddings inside the char-CNN.
        char_out_dim: size of the per-token char-CNN feature vector.
        lstm_hidden: hidden size of each LSTM direction.
        lstm_layers: number of stacked LSTM layers.
        num_classes: number of output classes.
        dropout: dropout probability, shared by the char-CNN, the LSTM input
            and the classifier input.
    """

    def __init__(self, embedding_matrix, char_vocab_size,
                 char_emb_dim=32, char_out_dim=64,
                 lstm_hidden=256, lstm_layers=1,
                 num_classes=5, dropout=0.2):
        super().__init__()
        vocab_size, word_dim = embedding_matrix.shape

        # Word embeddings are initialised from the given matrix and kept frozen.
        self.word_emb = nn.Embedding(vocab_size, word_dim, padding_idx=word_stoi[PAD])
        pretrained = torch.from_numpy(embedding_matrix)
        self.word_emb.weight.data.copy_(pretrained)
        self.word_emb.weight.requires_grad = False

        self.charcnn = CharCNN(
            char_vocab_size,
            char_emb_dim=char_emb_dim,
            out_dim=char_out_dim,
            dropout=dropout,
        )

        self.lstm = nn.LSTM(
            input_size=word_dim + char_out_dim,
            hidden_size=lstm_hidden,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(2 * lstm_hidden, num_classes)

    def forward(self, xw, xc, lengths):
        """Return logits of shape (B, num_classes).

        xw: (B, T) word ids, xc: (B, T, W) char ids, lengths: (B,) true lengths.
        """
        token_repr = torch.cat(
            [self.word_emb(xw), self.charcnn(xc)],   # (B,T,D) and (B,T,C)
            dim=-1,
        )                                            # (B,T,D+C)
        token_repr = self.dropout(token_repr)

        # Packing makes the LSTM skip padded positions; lengths must be on CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            token_repr, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)

        # h_n: (num_layers*2, B, H). Take the last layer, both directions:
        # index -2 is the forward pass, index -1 the backward pass.
        summary = torch.cat((h_n[-2], h_n[-1]), dim=-1)   # (B,2H)

        return self.fc(self.dropout(summary))              # (B,num_classes)


Лосс

In [15]:
class OrdinalImbalanceLoss(nn.Module):
    """Soft-label cross-entropy for ordered classes, weighted by inverse class frequency.

    For a true class ``y`` the target distribution is
    ``q[y, i] = softmax_i(-tau * |i - y|)``, so nearby classes are penalised
    less than distant ones. Each sample's loss is multiplied by ``class_w[y]``.

    Args:
        class_counts: number of training samples per class (index = class id).
        num_classes: number of classes.
        tau: sharpness of the soft labels (larger -> closer to one-hot).
        eps: small constant guarding the divisions.
        normalize_weights: if true, rescale the class weights to mean 1.
    """

    def __init__(self, class_counts, num_classes=5, tau=1.0, eps=1e-8, normalize_weights=True):
        super().__init__()

        # Inverse-frequency weights n / n_k.
        total = float(sum(class_counts))
        inv_freq = [total / (count + eps) for count in class_counts]
        weights = torch.tensor(inv_freq, dtype=torch.float32)
        if normalize_weights:
            weights = weights / weights.mean().clamp_min(eps)
        self.register_buffer("class_w", weights)

        # dist[y, i] = |i - y|, built by broadcasting instead of nested loops.
        classes = torch.arange(num_classes, dtype=torch.float32)
        dist = (classes.unsqueeze(0) - classes.unsqueeze(1)).abs()
        self.register_buffer("q", torch.softmax(-tau * dist, dim=1))

    def forward(self, logits, y):
        """Return the mean weighted soft-label loss for ``logits`` (B, C) and ``y`` (B,)."""
        log_probs = F.log_softmax(logits, dim=1)            # (B, C)
        soft_targets = self.q[y]                            # (B, C)
        per_sample = -(soft_targets * log_probs).sum(dim=1)  # (B,)
        return (self.class_w[y] * per_sample).mean()


Посчитаем количество примеров каждого класса в train

In [16]:
# Training samples per class; ratings 1..5 are shifted to indices 0..4.
counts = np.bincount((y_train.values - 1), minlength=5).tolist()
counts  # bare expression: displayed by the notebook

[20396, 15981, 32582, 61885, 199104]

Модель

In [17]:
# Build the classifier from the pretrained embedding matrix and move it to `device`.
model = W2VCharBiLSTM(
    embedding_matrix=embedding_matrix,
    char_vocab_size=len(char_stoi),
    char_emb_dim=32,
    char_out_dim=64,
    lstm_hidden=256,
    lstm_layers=1,
    dropout=0.2
).to(device)

# The loss gets the per-class training counts so it can weight rare classes up.
criterion = OrdinalImbalanceLoss(class_counts=counts, num_classes=5).to(device)
# Only parameters with requires_grad=True are handed to the optimizer.
optimizer = torch.optim.Adam([p for p in model.parameters() if p.requires_grad], lr=1e-3)


Код для одной эпохи

In [19]:
# Per-epoch history, appended to by the training loop.
train_losses, train_accs = [], []
val_losses, val_accs = [], []

def run_epoch_tqdm(model, loader, train=True):
    """Run one pass over ``loader`` with a progress bar.

    Uses the module-level ``criterion``, ``optimizer`` and ``device``.

    Args:
        model: callable as ``model(xw, xc, lengths)`` returning logits (B, C).
        loader: iterable of ``(xw, xc, lengths, y)`` batches.
        train: if true, the model is put in training mode and updated;
            otherwise it is put in eval mode and autograd is disabled.

    Returns:
        ``(epoch_loss, epoch_acc)``: the sample-weighted mean loss and the
        accuracy over the samples that were actually processed.
    """
    train = bool(train)
    model.train(train)

    # Running totals replace the growing prediction lists that were
    # re-concatenated on every batch just to refresh the progress bar.
    total_loss = 0.0
    n_seen = 0
    n_correct = 0

    pbar = tqdm(loader, unit="batch", leave=False)
    # Gradients are only needed when training; disabling autograd during
    # evaluation avoids building a graph for every batch.
    with torch.set_grad_enabled(train):
        for xw, xc, lengths, y in pbar:
            xw, xc, lengths, y = xw.to(device), xc.to(device), lengths.to(device), y.to(device)

            if train:
                optimizer.zero_grad(set_to_none=True)

            logits = model(xw, xc, lengths)
            loss = criterion(logits, y)

            if train:
                loss.backward()
                # clip the global gradient norm to keep LSTM updates stable
                torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
                optimizer.step()

            bs = y.size(0)
            # the criterion returns a batch mean, so weight it by the batch size
            total_loss += loss.item() * bs
            n_seen += bs
            n_correct += int((torch.argmax(logits, dim=1) == y).sum().item())

            denom = max(1, n_seen)
            pbar.set_postfix(loss=f"{total_loss / denom:.4f}", acc=f"{n_correct / denom:.4f}")

    denom = max(1, n_seen)  # guards against an empty loader
    epoch_loss = total_loss / denom
    epoch_acc = n_correct / denom
    return epoch_loss, epoch_acc

Обучение

In [20]:
num_epochs = 5

# Each epoch: one training pass, then one validation pass without weight updates.
for epoch in range(1, num_epochs + 1):
    tr_loss, tr_acc = run_epoch_tqdm(model, train_loader, train=True)
    va_loss, va_acc = run_epoch_tqdm(model, val_loader, train=False)

    # Keep the per-epoch history in the module-level lists.
    train_losses.append(tr_loss); train_accs.append(tr_acc)
    val_losses.append(va_loss);   val_accs.append(va_acc)

    print(f"epoch={epoch} train_loss={tr_loss:.4f} train_acc={tr_acc:.4f} val_loss={va_loss:.4f} val_acc={va_acc:.4f}")


  0%|          | 0/5156 [00:00<?, ?batch/s]

  0%|          | 0/553 [00:00<?, ?batch/s]

epoch=1 train_loss=0.6749 train_acc=0.5144 val_loss=0.6501 val_acc=0.5544


  0%|          | 0/5156 [00:00<?, ?batch/s]

  0%|          | 0/553 [00:00<?, ?batch/s]

epoch=2 train_loss=0.6477 train_acc=0.5965 val_loss=0.6418 val_acc=0.6376


  0%|          | 0/5156 [00:00<?, ?batch/s]

  0%|          | 0/553 [00:00<?, ?batch/s]

epoch=3 train_loss=0.6408 train_acc=0.6145 val_loss=0.6426 val_acc=0.6459


  0%|          | 0/5156 [00:00<?, ?batch/s]

  0%|          | 0/553 [00:00<?, ?batch/s]

epoch=4 train_loss=0.6356 train_acc=0.6270 val_loss=0.6360 val_acc=0.6376


  0%|          | 0/5156 [00:00<?, ?batch/s]

  0%|          | 0/553 [00:00<?, ?batch/s]

epoch=5 train_loss=0.6315 train_acc=0.6385 val_loss=0.6364 val_acc=0.6200


epoch=1 train_loss=0.6749 train_acc=0.5144 val_loss=0.6501 val_acc=0.5544

epoch=2 train_loss=0.6477 train_acc=0.5965 val_loss=0.6418 val_acc=0.6376

epoch=3 train_loss=0.6408 train_acc=0.6145 val_loss=0.6426 val_acc=0.6459

epoch=4 train_loss=0.6356 train_acc=0.6270 val_loss=0.6360 val_acc=0.6376

epoch=5 train_loss=0.6315 train_acc=0.6385 val_loss=0.6364 val_acc=0.6200


In [21]:
# Final evaluation on the held-out test split.
model.eval()

all_true = []
all_pred = []

# No gradients are needed for inference.
with torch.no_grad():
    for xw, xc, lengths, y in test_loader:
        xw, xc, lengths = xw.to(device), xc.to(device), lengths.to(device)

        logits = model(xw, xc, lengths)
        pred = torch.argmax(logits, dim=1).cpu().numpy()

        all_pred.append(pred)
        all_true.append(y.numpy())  # y was never moved to the device

# Shift class indices 0..4 back to ratings 1..5 for the report.
y_true = np.concatenate(all_true) + 1
y_pred = np.concatenate(all_pred) + 1

print(classification_report(y_true, y_pred, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))
# Mean absolute error between true and predicted ratings, in rating units.
mae_torch = (torch.as_tensor(y_true) - torch.as_tensor(y_pred)).abs().float().mean().item()
print("MAE:", mae_torch)

              precision    recall  f1-score   support

           1     0.4906    0.8083    0.6106      4371
           2     0.2790    0.3356    0.3047      3424
           3     0.3461    0.3760    0.3604      6982
           4     0.3698    0.4994    0.4249     13261
           5     0.8884    0.7057    0.7866     42666

    accuracy                         0.6229     70704
   macro avg     0.4748    0.5450    0.4974     70704
weighted avg     0.6835    0.6229    0.6425     70704

Confusion matrix:
 [[ 3533   511   180    69    78]
 [ 1483  1149   571   172    49]
 [  953  1482  2625  1507   415]
 [  403   550  2446  6622  3240]
 [  829   426  1762  9539 30110]]
MAE: 0.4994342625141144
