Этап 0-1. Подготовка данных и окружения

In [2]:
import random
import numpy as np
import torch

def set_seed(seed=1):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch,
    in that order; when CUDA is available, all CUDA devices are seeded too.

    Args:
        seed: Seed value passed to each generator (default ``1``).
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Fix all RNG seeds up front so the rest of the notebook starts from a known state.
set_seed(1)

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE  # bare expression: the cell displays the selected device


device(type='cuda')

In [7]:
import os
# Text files with the train/validation/test splits, relative to the notebook's
# working directory (apparently one text sample per line — see the preview below).
train_path = "./data/splits/train.txt"
val_path   = "./data/splits/val.txt"
test_path  = "./data/splits/test.txt"

def count_lines(path):
    """Return the number of lines in the text file at ``path``.

    The file is decoded as UTF-8; bytes that cannot be decoded are ignored
    instead of raising an error.
    """
    total = 0
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        for _line in handle:
            total += 1
    return total

def head(path, n=5):
    """Print up to the first ``n`` lines of the text file at ``path``.

    Each line is printed with surrounding whitespace stripped. The file is
    decoded as UTF-8 and undecodable bytes are ignored. Nothing is returned.

    Args:
        path: Path of the text file to preview.
        n: Maximum number of lines to print (default ``5``).
    """
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        for _ in range(n):
            row = handle.readline()
            if not row:  # end of file reached before n lines were read
                break
            print(row.strip())

# Report the number of lines in every split, then preview the training data.
for label, split_path in (("train:", train_path), ("val  :", val_path), ("test :", test_path)):
    print(label, count_lines(split_path))
print("\n--- train examples ---")
head(train_path, n=5)


train: 1278086
val  : 159761
test : 159761

--- train examples ---
non- fail. ali found cappachino mix. now, to mail rin a cappachino. this might not fit in the mail box.
i am at a cool party. talking about twitter. but work tomorrow... mirka says hi. maria too.
procastinated by doing up a new playlist for tcc. now off to nalinas for teh 18th birthday bash. 2forever
essay limit 2000 words. my count: 1999.
is excited


Vocab + токенизация

In [17]:
from src.tokenizer_utils import build_vocab, encode, decode
# Vocabulary limits passed to build_vocab (presumably a cap on the vocabulary
# size and a minimum token frequency — confirm in src.tokenizer_utils).
MAX_VOCAB_SIZE = 20000
MIN_FREQ = 3
# The vocabulary is built from the training split only.
token2id, id2token = build_vocab(train_path, max_vocab_size=MAX_VOCAB_SIZE, min_freq=MIN_FREQ)
vocab_size = len(token2id)

# Ids of the special tokens; these four keys must be present in token2id.
pad_id = token2id["<pad>"]
bos_id = token2id["<bos>"]
eos_id = token2id["<eos>"]
unk_id = token2id["<unk>"]

print("vocab_size:", vocab_size)
print("pad/bos/eos/unk:", pad_id, bos_id, eos_id, unk_id)

# Round-trip check: encode a sample sentence with BOS/EOS requested, then decode it back.
sample = "i am going to learn something new"
ids = encode(sample, token2id, add_bos=True, add_eos=True)
print("encoded:", ids[:20])  # show at most the first 20 ids
print("decoded:", decode(ids, id2token))


vocab_size: 20000
pad/bos/eos/unk: 0 2 3 1
encoded: [2, 4, 60, 44, 5, 874, 204, 70, 3]
decoded: i am going to learn something new


Dataset + DataLoader

In [19]:
from torch.utils.data import DataLoader
from src.next_token_dataset import NextTokenDataset, collate_fn

MAX_LEN = 64
BATCH_SIZE = 128


def _pad_collate(batch):
    """Collate ``batch`` with the project ``collate_fn`` and the vocabulary's ``pad_id``."""
    return collate_fn(batch, pad_id)


def _make_loader(dataset, shuffle):
    """Wrap ``dataset`` in a DataLoader using BATCH_SIZE and the padding collate."""
    return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=shuffle, collate_fn=_pad_collate)


# One dataset per split, all built with the same vocabulary and MAX_LEN.
train_ds = NextTokenDataset(train_path, token2id, max_len=MAX_LEN)
val_ds = NextTokenDataset(val_path, token2id, max_len=MAX_LEN)
test_ds = NextTokenDataset(test_path, token2id, max_len=MAX_LEN)

# Only the training loader shuffles; validation and test keep file order.
train_loader = _make_loader(train_ds, shuffle=True)
val_loader = _make_loader(val_ds, shuffle=False)
test_loader = _make_loader(test_ds, shuffle=False)

# Sanity check: pull a single training batch and show tensor shapes and dtypes.
x, y = next(iter(train_loader))
for caption, tensor in (("input:", x), ("target:", y)):
    print(caption, tensor.shape, tensor.dtype)


input: torch.Size([128, 31]) torch.int64
target: torch.Size([128, 31]) torch.int64


Этап 2. LSTM модель

In [20]:
from src.lstm_model import LSTMNextToken

# Embedding and hidden dimensions passed to the model constructor.
EMB_DIM = 128
HIDDEN_DIM = 128

# Build the model and move it to the selected device.
model = LSTMNextToken(vocab_size=vocab_size, emb_dim=EMB_DIM, hidden_dim=HIDDEN_DIM, pad_id=pad_id).to(DEVICE)

# Total number of elements across all model parameters.
n_params = sum(p.numel() for p in model.parameters())
print(f"params: {n_params}")

params: 5272096


Этап 3. Обучение + сохранение + Rouge метрика для LSTM + Тестирование LSTM

In [21]:
from src.lstm_train import train_model

# Training hyperparameters and the checkpoint location.
n_epoch = 5
lr = 1e-3
save_path = "./models/lstm_best.pth"

# NOTE(review): train_model lives in src.lstm_train and is not visible here.
# Judging by the logged output it reports train/val loss and validation ROUGE
# per epoch and a "Best val Rouge2" at the end — confirm there which metric
# selects the checkpoint written to save_path.
train_model(
    model,
    train_loader,
    val_loader,
    token2id,
    id2token,
    n_epochs=n_epoch,
    lr=lr,
    save_path=save_path,
    rouge_batches=32,  # presumably the number of validation batches scored with ROUGE — TODO confirm
    device=DEVICE,
)

100%|██████████| 9986/9986 [08:21<00:00, 19.93it/s]  


Epoch 1 | Train Loss: 5.3333 | Val Loss: 5.0652 | Val r-1: 0.0657 | Val r-2: 0.0088
-------Пример----------
PROMPT: exams
TARGET: here
PRED  : 
-------Пример----------
PROMPT: headache. not
TARGET: nice.
PRED  : sure
-------Пример----------
PROMPT: has an upset
TARGET: 
PRED  : for


100%|██████████| 9986/9986 [04:05<00:00, 40.61it/s]


Epoch 2 | Train Loss: 4.9890 | Val Loss: 4.9502 | Val r-1: 0.0742 | Val r-2: 0.0115


100%|██████████| 9986/9986 [03:53<00:00, 42.84it/s]


Epoch 3 | Train Loss: 4.8928 | Val Loss: 4.8951 | Val r-1: 0.0773 | Val r-2: 0.0106


100%|██████████| 9986/9986 [03:53<00:00, 42.74it/s]


Epoch 4 | Train Loss: 4.8358 | Val Loss: 4.8625 | Val r-1: 0.0803 | Val r-2: 0.0119


100%|██████████| 9986/9986 [03:59<00:00, 41.76it/s]


Epoch 5 | Train Loss: 4.7964 | Val Loss: 4.8390 | Val r-1: 0.0804 | Val r-2: 0.0114
-------Пример----------
PROMPT: exams
TARGET: here
PRED  : 
-------Пример----------
PROMPT: headache. not
TARGET: nice.
PRED  : a
-------Пример----------
PROMPT: has an upset
TARGET: 
PRED  : 
Best val Rouge2: 0.011892443165574178


In [22]:
import torch
from src.eval_lstm import evaluate_rouge
from src.lstm_model import LSTMNextToken

# Rebuild the architecture and load the weights stored at `save_path`
# (presumably the checkpoint written by train_model above).
best = LSTMNextToken(vocab_size=vocab_size, emb_dim=EMB_DIM, hidden_dim=HIDDEN_DIM, pad_id=pad_id).to(DEVICE)
best.load_state_dict(torch.load(save_path, map_location=DEVICE))
# A freshly constructed module starts in training mode; switch to inference
# mode before scoring, exactly as the later generation cells do after loading
# the same checkpoint.
best.eval()

# Score the same checkpoint on validation and test with identical settings.
# NOTE(review): max_batches presumably caps the number of batches scored and q
# presumably sets the share of each sample held out as the target — confirm in
# src.eval_lstm.
val_r1, val_r2 = evaluate_rouge(best, val_loader, token2id, id2token, max_batches=100, q=0.25)
test_r1, test_r2 = evaluate_rouge(best, test_loader, token2id, id2token, max_batches=100, q=0.25)

print(f"LSTM VAL : ROUGE-1={val_r1:.4f} ROUGE-2={val_r2:.4f}")
print(f"LSTM TEST: ROUGE-1={test_r1:.4f} ROUGE-2={test_r2:.4f}")


LSTM VAL : ROUGE-1=0.0766 ROUGE-2=0.0110
LSTM TEST: ROUGE-1=0.0773 ROUGE-2=0.0123


In [265]:
import torch
import torch.nn.functional as F
from src.lstm_model import LSTMNextToken
from src.tokenizer_utils import encode, decode

# NOTE(review): DEVICE is re-defined here with the same expression as in the first cell.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Rebinds `model` to a fresh instance loaded from the saved checkpoint,
# replacing the in-memory model that was passed to train_model.
model = LSTMNextToken(vocab_size=vocab_size, emb_dim=EMB_DIM, hidden_dim=HIDDEN_DIM, pad_id=pad_id).to(DEVICE)
model.load_state_dict(torch.load("./models/lstm_best.pth", map_location=DEVICE))
model.eval()  # inference mode for generation

# Prompt for the sampling code further down: BOS is requested, EOS is not,
# since the text is meant to be continued.
prompt_text = "watch youtube"
prompt_ids = encode(prompt_text, token2id, add_bos=True, add_eos=False)


In [None]:
import torch
from src.lstm_model import LSTMNextToken
# NOTE(review): this cell has no execution count and repeats the checkpoint
# loading done in the previous cell.
model = LSTMNextToken(vocab_size=vocab_size, emb_dim=EMB_DIM, hidden_dim=HIDDEN_DIM, pad_id=pad_id).to(DEVICE)
model.load_state_dict(torch.load("./models/lstm_best.pth", map_location=DEVICE))
model.eval()
prompt = "watch youtube"
# NOTE(review): model.evaluate is defined in src.lstm_model and is not visible
# here; presumably it generates up to num_tokens tokens after the prompt — confirm.
generated_full = model.evaluate(prompt, token2id, id2token, num_tokens=15)

print("PROMPT:", prompt)
print("GEN   :", generated_full)

In [374]:
def sample_next_id(next_logits, temperature=1.0, top_k=50):
    """Pick the next token id from a tensor of logits.

    The logits are divided by ``temperature`` (floored at ``1e-8``). With
    ``top_k=None`` the arg-max index is returned; otherwise one index is
    sampled from the softmax over the ``top_k`` largest logits.

    Args:
        next_logits: Logits tensor; assumed to be 1-D over the vocabulary
            (the caller in this notebook passes ``logits[0, -1]``).
        temperature: Softmax temperature; values below ``1e-8`` are clamped.
        top_k: Number of highest-scoring candidates to sample from, capped at
            the number of logits; ``None`` selects greedy decoding.

    Returns:
        The chosen token id as a Python ``int``.
    """
    scaled = next_logits / max(temperature, 1e-8)  # floor avoids division by zero
    if top_k is not None:
        n_candidates = min(top_k, scaled.numel())
        top_vals, top_idx = torch.topk(scaled, n_candidates)
        weights = F.softmax(top_vals, dim=-1)
        pick = torch.multinomial(weights, num_samples=1).item()
        # `pick` indexes the top-k list; map it back to a vocabulary id.
        return int(top_idx[pick].item())
    return int(torch.argmax(scaled).item())

def generate_lstm(model, prefix_ids, max_new_tokens=20, temperature=1.0, top_k=50):
    """Autoregressively extend ``prefix_ids`` with tokens sampled from ``model``.

    At every step the whole sequence so far is fed to the model as a batch of
    one on the module-level ``DEVICE``, and the logits at the last position are
    handed to ``sample_next_id``. Generation stops after ``max_new_tokens``
    tokens or as soon as the module-level ``eos_id`` is produced; the EOS id
    itself is kept in the result.

    Args:
        model: Callable taking a ``(1, seq_len)`` long tensor and returning
            logits that can be indexed as ``[0, -1]``.
        prefix_ids: Iterable of token ids to start from; it is copied, not mutated.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Forwarded to ``sample_next_id``.
        top_k: Forwarded to ``sample_next_id``.

    Returns:
        A new list holding the prefix ids followed by the generated ids.
    """
    sequence = [*prefix_ids]
    for _step in range(max_new_tokens):
        with torch.no_grad():
            batch = torch.tensor([sequence], dtype=torch.long, device=DEVICE)
            last_logits = model(batch)[0, -1]
        token = sample_next_id(last_logits, temperature=temperature, top_k=top_k)
        sequence.append(token)
        if token == eos_id:
            return sequence
    return sequence

import random

N_GENERATIONS = 5  # number of independent samples drawn for the prompt

# Draw the torch seeds from a dedicated, fixed-seed RNG: sampling without
# replacement guarantees five different seeds (randint could repeat one), the
# global `random` state is left untouched, and re-running the cell reuses the
# same seeds instead of drawing new ones each time.
generation_seeds = random.Random(1).sample(range(1, 1001), k=N_GENERATIONS)

outs = []
for seed in generation_seeds:
    torch.manual_seed(seed)  # reseed torch's RNG consumed by torch.multinomial
    out_ids = generate_lstm(model, prompt_ids, max_new_tokens=10, temperature=1, top_k=50)
    outs.append(out_ids)

print("promt:", prompt_text)
for i, out_ids in enumerate(outs, 1):
    print(f"generated №{i}:", decode(out_ids, id2token))


promt: watch youtube
generated №1: watch youtube
generated №2: watch youtube videos
generated №3: watch youtube so on the now i am sick and hungry
generated №4: watch youtube and a good with
generated №5: watch youtube from the show the new trailer show so im not


Этап 4. distilgpt2 + ROUGE

In [375]:
from src.eval_transformer_pipeline import evaluate_transformer

# Score the pretrained transformer baseline on the validation split.
# NOTE(review): device=-1 runs it on CPU (the log below prints "Device set to
# use cpu") even though CUDA was detected earlier. max_samples=200 is also a
# much smaller evaluation set than the LSTM's max_batches=100, so the two
# ROUGE figures are computed on different sample sizes.
gpt2_val_r1, gpt2_val_r2, gpt2_val_n = evaluate_transformer(val_path, device=-1, max_samples=200, num_print=5)
print(f"distilgpt2 VAL: n={gpt2_val_n} ROUGE-1={gpt2_val_r1:.4f} ROUGE-2={gpt2_val_r2:.4f}")


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: is picking up her new passport today... watch ou
Target: world - im back
Prediction: , i am going in the middle of the day
ROUGE-1: 0.0000 | ROUGE-2: 0.0000


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: broddddddddddy we love you your unreal lt;3
Target: danni and sash
Prediction: e1e2e3e3e3
ROUGE-1: 0.0000 | ROUGE-2: 0.0000


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: just finished fallout 3 - great game! felt cheated at the end though. i had
Target: enough rad-away to make it
Prediction: tried to save 2 - i think it was a
ROUGE-1: 0.2667 | ROUGE-2: 0.0000


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: wow, just called a bit dissapointed! been a customer for 9 years now.
Target: maybe not for much longer
Prediction: The experience and the quality. The food was good
ROUGE-1: 0.0000 | ROUGE-2: 0.0000


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: i wonder if cubicle 7 is still coming out with the doctor who rpg. its
Target: not mentioned on their forums anymore.
Prediction: the most popular thing today.

The problem with the
ROUGE-1: 0.0000 | ROUGE-2: 0.0000


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

distilgpt2 VAL: n=139 ROUGE-1=0.0501 ROUGE-2=0.0026


Этап 5. Сравнение + выводы

In [377]:
import pandas as pd

# Collect the ROUGE scores of both models into a single comparison table.
_score_rows = [
    ("LSTM", "val", val_r1, val_r2),
    ("LSTM", "test", test_r1, test_r2),
    ("distilgpt2", "val", gpt2_val_r1, gpt2_val_r2),
]
results = pd.DataFrame(
    [(name, split, float(r1), float(r2)) for name, split, r1, r2 in _score_rows],
    columns=["model", "split", "rouge1", "rouge2"],
)

results

Unnamed: 0,model,split,rouge1,rouge2
0,LSTM,val,0.076582,0.011046
1,LSTM,test,0.07729,0.012269
2,distilgpt2,val,0.050075,0.002638


Выводы:

1. Обученная LSTM-модель для автодополнения текста показывает более высокие значения ROUGE-1/ROUGE-2 (F1 score), чем предобученный distilgpt2: на валидации 0.0766/0.0110 против 0.0501/0.0026 (distilgpt2 оценивался только на валидации, n=139); на тесте LSTM даёт 0.0773/0.0123. Таким образом, LSTM чаще воспроизводила те же слова/биграммы, что и в реальном продолжении текста.
2. Более низкий ROUGE у distilgpt2 можно объяснить тем, что модель не дообучалась на выбранном датасете, поэтому её продолжения реже совпадают с реальными продолжениями по словам/биграммам.
3. С учётом требования запускать модель на мобильных устройствах рекомендовано использовать LSTM как более легковесную и быструю модель. distilgpt2 стоит использовать при высоких требованиях к качеству и связности генерируемого текста.