In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchvision import transforms
import torch.optim as optim
import random 
from tqdm import tqdm
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F
from tqdm.auto import tqdm
from datetime import datetime
import wandb
import time
import os
import re
import math
from nltk.tokenize import word_tokenize
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')


[nltk_data] Downloading package punkt to /home/vuda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [24]:
NUM_EPOCHS = 30
BATCH_SIZE = 4
FRAC_SAMPLE = 0.01
MAX_LENGTH_ARTICLE = 256
MIN_LENGTH_ARTICLE = 50
MAX_LENGTH_SUMMARY = 128
MIN_LENGTH_SUMMARY = 20
HIDDEN_DIM = 128
LEARNING_RATE = 0.001
NUM_CYCLES = 3
MAX_PLATEAU_COUNT = 5
WEIGHT_DECAY = 1e-4
CLIP = 1
USE_PRETRAINED_EMB = True
USE_SCHEDULER = True
SCHEDULER_TYPE = "plateau"  # hoặc cosine, linear
TEACHER_FORCING_RATIO = 0.75
NUM_CYCLES = 3
MAX_PLATEAU_COUNT = 5


model_dir = "../Model"
datafilter = "../dataft"
os.makedirs(datafilter, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
train_data = pd.read_csv("../dataset/train.csv")
validation_data = pd.read_csv("../dataset/validation.csv")
test_data = pd.read_csv("../dataset/test.csv")

# add col
train_data.rename(columns={"highlights": "summaries", "article":"articles"}, inplace=True)
validation_data.rename(columns={"highlights": "summaries","article":"articles"}, inplace=True)
test_data.rename(columns={"highlights": "summaries", "article":"articles"}, inplace=True)

train_data["article_word_count"] = train_data["articles"].astype(str).apply(lambda x: len(x.split()))
train_data["summary_word_count"] = train_data["summaries"].astype(str).apply(lambda x: len(x.split()))

validation_data["article_word_count"] = validation_data["articles"].astype(str).apply(lambda x: len(x.split()))
validation_data["summary_word_count"] = validation_data["summaries"].astype(str).apply(lambda x: len(x.split()))

test_data["article_word_count"] = test_data["articles"].astype(str).apply(lambda x: len(x.split()))
test_data["summary_word_count"] = test_data["summaries"].astype(str).apply(lambda x: len(x.split()))

# filter range
train_data = train_data[
    (train_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
    (train_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
    (train_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
    (train_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
]

validation_data = validation_data[
    (validation_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
    (validation_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
    (validation_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
    (validation_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
]
test_data = test_data[
    (test_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
    (test_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
    (test_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
    (test_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
]

train_sample = train_data.sample(frac=FRAC_SAMPLE, random_state=1)
validation_sample = validation_data.sample(frac=FRAC_SAMPLE, random_state=1)
test_sample = test_data.sample(frac=FRAC_SAMPLE, random_state=1)
train_sample.info()
print("\n")
validation_sample.info()
train_sample.to_csv(os.path.join(datafilter,"train_sample.csv"), index=False)
test_sample.to_csv(os.path.join(datafilter,"test_sample.csv"), index=False)
validation_sample.to_csv(os.path.join(datafilter,"validation_sample.csv"), index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 153 entries, 38877 to 238849
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  153 non-null    object
 1   articles            153 non-null    object
 2   summaries           153 non-null    object
 3   article_word_count  153 non-null    int64 
 4   summary_word_count  153 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 7.2+ KB


<class 'pandas.core.frame.DataFrame'>
Index: 8 entries, 7709 to 12342
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  8 non-null      object
 1   articles            8 non-null      object
 2   summaries           8 non-null      object
 3   article_word_count  8 non-null      int64 
 4   summary_word_count  8 non-null      int64 
dtypes: int64(2), object(3)
memory usage: 384.0+ bytes


In [4]:
# print(tokenize("A dog. in a 'tree with 5.3% rate drop"))

In [5]:
train_sample = pd.read_csv("../dataft/train_sample.csv")
validation_sample = pd.read_csv("../dataft/validation_sample.csv")
test_sample = pd.read_csv("../dataft/test_sample.csv")
train_sample.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  153 non-null    object
 1   articles            153 non-null    object
 2   summaries           153 non-null    object
 3   article_word_count  153 non-null    int64 
 4   summary_word_count  153 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 6.1+ KB


In [6]:
TOP_K = 100000
EMBEDDING_FILE = "../Embedding/glove.6B.50d.txt"

vocab, embeddings = [], []

with open(EMBEDDING_FILE, 'rt', encoding='utf-8') as ef:
    for i, line in enumerate(ef):
        if i >= TOP_K:
            break
        split_line = line.strip().split(' ')
        i_word = split_line[0]
        i_embeddings = [float(val) for val in split_line[1:]]
        i_embeddings.extend([0.0, 0.0, 0.0, 0.0])  # để dành cho token đặc biệt
        vocab.append(i_word)
        embeddings.append(i_embeddings)


embs_npa = np.array(embeddings)

unk_embedding = np.mean(embs_npa, axis=0).tolist()

dim = embs_npa.shape[1]
sos_embedding = [0.0] * dim
sos_embedding[-3] = 1.0
eos_embedding = [0.0] * dim
eos_embedding[-2] = 1.0
pad_embedding = [0.0] * dim
pad_embedding[-4] = 1.0
# unk_embedding = [0.0] * dim
# unk_embedding[-1] = 1.0

# Update vocab and embeddings
vocab = ["<PAD>", "<SOS>", "<EOS>", "<UNK>"] + vocab
embeddings = [pad_embedding, sos_embedding,
              eos_embedding, unk_embedding] + embeddings

vocab_npa = np.array(vocab)
embs_npa = np.array(embeddings)


def tokenize(text):
    return word_tokenize(text.lower())

def preclean_text(text):
    text = re.sub(r"\s'([a-zA-Z])", r" '\1", text)

    return word_tokenize(text.lower())


stoi_dict = {word: idx for idx, word in enumerate(vocab_npa)}
_unk_idx = stoi_dict["<UNK>"]
itos_dict = {idx: word for idx, word in enumerate(vocab_npa)}

def stoi(string, stoi_dict=stoi_dict):
    return stoi_dict.get(string, _unk_idx)

def itos(idx, itos_dict=itos_dict):
    return itos_dict.get(idx)

def revert_to_text(lst):
    if hasattr(lst, 'tolist'):  # works for both torch.Tensor and np.ndarray
        lst = lst.tolist()
    return [str(itos(int(token))) for token in lst] 


def numericalize(text):
    tokenized_text = tokenize(text)
    return [
        stoi(token)
        for token in tokenized_text
    ]

print(embs_npa.shape[0])
embedding_layer = torch.nn.Embedding.from_pretrained(torch.FloatTensor(embeddings),
                                                     freeze=False,
                                                     padding_idx=stoi("<PAD>"))
embedding_layer.to(device)



100004


Embedding(100004, 54, padding_idx=0)

In [7]:
vocab_size = len(vocab_npa)
print("Embedding shape:", np.array(embeddings).shape) 
print("<PAD> embedding last 4 dims:", embeddings[stoi("<PAD>")][-4:])
print("<SOS> embedding last 4 dims:", embeddings[stoi("<SOS>")][-4:])
print("Word 'the' embedding last 4 dims:", embeddings[stoi("5.3%")])
print(revert_to_text(torch.tensor([0, 1, 2, 3])))

Embedding shape: (100004, 54)
<PAD> embedding last 4 dims: [1.0, 0.0, 0.0, 0.0]
<SOS> embedding last 4 dims: [0.0, 1.0, 0.0, 0.0]
Word 'the' embedding last 4 dims: [0.03656563577138949, -0.18582784663417096, -0.06781056216307739, -0.040585373685092956, -0.07301668493291918, 0.016044644258671987, 0.1372867005388796, -0.07036971064271462, -0.10438737653424474, 0.11584013095302573, 0.03797717741161136, 0.02782290565024015, 0.042134458545775105, 0.006004885678202956, -0.04811175801439264, 0.01938162005008408, -0.06738358746789967, 0.07091858680646929, 0.0485902582917709, 0.12120493521288776, -0.11500593603289162, 0.003967934351382163, 0.05070292498116329, 0.005906024159269963, 0.06472339142162035, 0.022294008330708867, 0.009735554614555037, 0.08725065600012115, -0.042772991751960865, -0.07584176364629841, 0.03525691425508773, 0.04915543195526088, 0.05126815078068994, 0.1762973806305546, 0.02469765032816606, 0.01257729470118004, -0.015610369137945711, 0.016562756680375525, 0.036024580951118

In [8]:
from collections import defaultdict

def analyze_vocab_coverage(sample_data, stoi_dict):
    # Đếm tần suất từ duy nhất
    word_freq = defaultdict(int)

    for text in sample_data['articles'] + sample_data['summaries']:
        tokens = tokenize(text)
        for token in tokens:
            word_freq[token] += 1

    # Phân loại từ vào known / unknown
    known_words = set()
    unknown_words = set()

    for word in word_freq:
        if word in stoi_dict:
            known_words.add(word)
        else:
            unknown_words.add(word)

    total_unique_words = len(known_words) + len(unknown_words)
    coverage = len(known_words) / total_unique_words * 100 if total_unique_words > 0 else 0.0
    print("A word not in dict: ", random.choice(list(unknown_words)))
    return {
        'total_unique_words': total_unique_words,
        'known_unique_words': len(known_words),
        'unknown_unique_words': len(unknown_words),
        'coverage_percentage': coverage,
    }
def print_vocab_stats(name, stats):
    print(f"\n{name} Vocabulary Coverage:")
    print(f"- Unique words: {stats['total_unique_words']}")
    print(f"- Exist in dict: {stats['known_unique_words']}")
    print(f"- Outside the dict: {stats['unknown_unique_words']}")
    print(f"- Coverage rate: {stats['coverage_percentage']:.2f}%")

print_vocab_stats("Train", analyze_vocab_coverage(train_sample, stoi_dict))
print_vocab_stats("Validation", analyze_vocab_coverage(validation_sample, stoi_dict))
print_vocab_stats("Test", analyze_vocab_coverage(test_sample, stoi_dict))


A word not in dict:  kurdin

Train Vocabulary Coverage:
- Unique words: 6922
- Exist in dict: 6169
- Outside the dict: 753
- Coverage rate: 89.12%
A word not in dict:  markchapman

Validation Vocabulary Coverage:
- Unique words: 838
- Exist in dict: 771
- Outside the dict: 67
- Coverage rate: 92.00%
A word not in dict:  pic.twitter.com/fg0p2yf5uf

Test Vocabulary Coverage:
- Unique words: 695
- Exist in dict: 662
- Outside the dict: 33
- Coverage rate: 95.25%


In [9]:
class Seq2SeqDataset(Dataset):
    def __init__(self, articles, summaries, stoi, max_len_article=MAX_LENGTH_ARTICLE, max_len_summary=MAX_LENGTH_SUMMARY):
        self.articles = articles  # List of articles
        self.summaries = summaries  # List of summaries
        self.stoi = stoi 
        self.pad_idx = stoi("<PAD>")
        self.sos_idx = stoi("<SOS>")
        self.eos_idx = stoi("<EOS>")
        
        self.max_len_article = max_len_article 
        self.max_len_summary = max_len_summary

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, idx):
        def process_text(text, max_len):
            tokens = [self.sos_idx] + [self.stoi(w) for w in text.split()] + [self.eos_idx]  # Tokenize and add SOS/EOS
            tokens = tokens[:max_len] + [self.pad_idx] * (max_len - len(tokens))  # Pad to max length
            return torch.tensor(tokens), len(tokens)

        article_tokens, article_len = process_text(self.articles[idx], self.max_len_article)
        summary_tokens, summary_len = process_text(self.summaries[idx], self.max_len_summary)
        
        return {
            'article': article_tokens,  # Encoded article
            'article_len': torch.tensor(article_len),
            'summary': summary_tokens,  # Encoded summary
            'summary_len': torch.tensor(summary_len)
        }

def collate_fn(batch):
    # Batch is list os the dict {'article': ..., 'summary': ...}
    return {
        'article': torch.stack([item['article'] for item in batch]),
        'article_len': torch.tensor([item['article_len'] for item in batch]),
        'summary': torch.stack([item['summary'] for item in batch]),
        'summary_len': torch.tensor([item['summary_len'] for item in batch])
    }

# DataLoader setup

torch.set_printoptions(profile="default")
train_dataset = Seq2SeqDataset(train_sample['articles'].tolist(), train_sample['summaries'].tolist(), stoi)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

validation_dataset= Seq2SeqDataset(validation_sample['articles'].tolist(), validation_sample['summaries'].tolist(), stoi)
validation_loader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

test_dataset= Seq2SeqDataset(test_sample['articles'].tolist(), test_sample['summaries'].tolist(), stoi)
test_loader = DataLoader(test_sample, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

# torch.set_printoptions(profile="full")
# print(train_dataset[268]["article"])

In [19]:
class SimpleEncoder(nn.Module):
    def __init__(self, embedding_layer, enc_hid_dim, dec_hid_dim, dropout=0.5):
        super().__init__()
        self.embedding = embedding_layer
        self.rnn = nn.GRU(self.embedding.embedding_dim, enc_hid_dim, bidirectional=True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_len):
        embedded = self.dropout(self.embedding(src))
        embedded = embedded.permute(1, 0, 2)

        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, src_len.cpu(), enforce_sorted=False)
        packed_outputs, hidden = self.rnn(packed_embedded)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs)

        hidden = torch.tanh(self.fc(torch.cat((hidden[-2], hidden[-1]), dim=1)))
        return outputs, hidden

class SimpleAttention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear(enc_hid_dim * 2 + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs, mask):
        src_len = encoder_outputs.shape[0]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = self.v(energy).squeeze(2)
        attention = attention.masked_fill(mask == 0, -1e10)

        return F.softmax(attention, dim=1)

class SimpleDecoder(nn.Module):
    def __init__(self, output_dim, embedding_layer, enc_hid_dim, dec_hid_dim, attention, dropout=0.5):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = embedding_layer
        self.rnn = nn.GRU(enc_hid_dim * 2 + self.embedding.embedding_dim, dec_hid_dim)
        self.fc_out = nn.Linear(enc_hid_dim * 2 + dec_hid_dim + self.embedding.embedding_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs, mask):
        if input.dim() == 1:
            input = input.unsqueeze(1)

        embedded = self.dropout(self.embedding(input))
        a = self.attention(hidden, encoder_outputs, mask)
        
        # NEW: Attention score checks
        if torch.isnan(a).any():
            print("WARNING: NaN in attention scores!")
        if (a.sum(dim=1) - 1.0).abs().max() > 1e-3:
            print("WARNING: Attention scores don't sum to 1!")
        
        a = a.unsqueeze(1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        weighted = torch.bmm(a, encoder_outputs)

        rnn_input = torch.cat((embedded, weighted), dim=2)
        
        if hidden.dim() == 2:
            hidden = hidden.unsqueeze(0)
        
        output, hidden = self.rnn(rnn_input, hidden)
        
        embedded = embedded.squeeze(1)
        output = output.squeeze(1)
        weighted = weighted.squeeze(1)

        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        return prediction, hidden.squeeze(0), a.squeeze(1)

class Seq2SeqModel(nn.Module):
    def __init__(self, encoder, decoder, pad_idx, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.pad_idx = pad_idx
        self.device = device

    def create_mask(self, src):
        mask = (src != self.pad_idx)
        return mask

    def forward(self, src, src_len, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        output_dim = self.decoder.output_dim

        outputs = torch.zeros(batch_size, trg_len, output_dim).to(self.device)
        
        # NEW: Input validation
        print(f"Input src min/max: {src.min()}/{src.max()}")  # Should be within vocab range
        print(f"Input trg min/max: {trg.min()}/{trg.max()}")

        encoder_outputs, hidden = self.encoder(src, src_len)
        input = trg[:, 0]
        mask = self.create_mask(src)

        for t in range(1, trg_len):
            output, hidden, attn_weights = self.decoder(input, hidden, encoder_outputs, mask)
            outputs[:, t] = output
            
            # NEW: Generation monitoring
            top1 = output.argmax(1)
            print(f"Step {t}: Generated token IDs: {top1}")
            print(f"Attention weights mean: {attn_weights.mean().item():.4f}")
            
            teacher_force = random.random() < teacher_forcing_ratio
            input = trg[:, t] if teacher_force else top1

        return outputs

In [22]:
print(f"GPU Memory allocated: {torch.cuda.memory_allocated()/1024**2:.2f} MB")
print(f"GPU Memory reserved: {torch.cuda.memory_reserved()/1024**2:.2f} MB")

GPU Memory allocated: 761.93 MB
GPU Memory reserved: 770.00 MB


In [12]:
print("Embedding shape:", torch.FloatTensor(embeddings).shape)
print("Vocab size:", vocab_size)
print("Model architecture:")
print(model)


Embedding shape: torch.Size([100004, 54])
Vocab size: 100004
Model architecture:
Seq2SeqModel(
  (encoder): SimpleEncoder(
    (embedding): Embedding(100004, 54, padding_idx=0)
    (rnn): GRU(54, 128, bidirectional=True)
    (fc): Linear(in_features=256, out_features=128, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): SimpleDecoder(
    (attention): SimpleAttention(
      (attn): Linear(in_features=384, out_features=128, bias=True)
      (v): Linear(in_features=128, out_features=1, bias=False)
    )
    (embedding): Embedding(100004, 54, padding_idx=0)
    (rnn): GRU(310, 128)
    (fc_out): Linear(in_features=438, out_features=100004, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)


In [13]:
wandb.init(
    project="Seq2Seq-Summarization",
    name=f"seq2seq-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
    config={
        "model": "Seq2Seq-LSTM",
        "hidden_dim": HIDDEN_DIM,
        "batch_size": BATCH_SIZE,
        "learning_rate": LEARNING_RATE,
        "teacher_forcing_ratio": 1.0,
        "vocab_size": len(vocab)
    }
)


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mvubkk67[0m ([33mvubkk67-hanoi-university-of-science-and-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [14]:
def linear_warmup_decay(step, warmup_steps, total_steps):
    if step < warmup_steps:
        return (step + 1) / (warmup_steps + 1)
    else:
        return max(1e-7, (total_steps - step) / (total_steps - warmup_steps))


def warmup_cosine_with_restarts(step, warmup_steps, total_steps, num_cycles=1):
    if step < warmup_steps:
        return (step + 1) / (warmup_steps + 1)
    else:
        progress = (step - warmup_steps) / (total_steps - warmup_steps)
        cycle_progress = progress * num_cycles % 1
        return max(1e-7, 0.5 * (1 + math.cos(math.pi * cycle_progress)))



def get_scheduler(
    optimizer, total_steps, warmup_steps, num_cycles=None, types='warmup_cosine_with_restarts'
):
    if types == 'warmup_cosine_with_restarts':
        assert num_cycles != None, 'must specify num_cycles when types="warmup_cosine_with_restarts"'
        return torch.optim.lr_scheduler.LambdaLR(
            optimizer,
            lr_lambda=lambda step: warmup_cosine_with_restarts(
                step, warmup_steps, total_steps, num_cycles=num_cycles)
        )
    elif types == 'linear_warmup_decay':
        return torch.optim.lr_scheduler.LambdaLR(
            optimizer,
            lr_lambda=lambda step: linear_warmup_decay(step, warmup_steps, total_steps)
        )
    else:
        raise Exception('not implemented')

In [15]:
class CustomCrossEntropyLoss(nn.Module):
    def __init__(self, ignore_idxs):
        super().__init__()
        self.ignore_idxs = ignore_idxs
        self.loss_fn = nn.CrossEntropyLoss(reduction='none')  # để tính loss từng phần tử

    def forward(self, output, target):
        # output: [B*T, vocab_size], target: [B*T]
        loss = self.loss_fn(output, target)  # [B*T]
        mask = ~torch.isin(target, torch.tensor(self.ignore_idxs, device=target.device))  # giữ lại nếu không phải ignore
        loss = loss[mask]
        return loss.mean()  # trả về mean loss
PAD_IDX = stoi("<PAD>")
UNK_IDX = stoi("<UNK>")
criterion = CustomCrossEntropyLoss(ignore_idxs=[PAD_IDX, UNK_IDX])

In [25]:
# 3. Khởi tạo model
attn = SimpleAttention(enc_hid_dim=HIDDEN_DIM, dec_hid_dim=HIDDEN_DIM)
encoder = SimpleEncoder(embedding_layer, HIDDEN_DIM, HIDDEN_DIM)
decoder = SimpleDecoder(vocab_size, embedding_layer, HIDDEN_DIM, HIDDEN_DIM, attn)
model = Seq2SeqModel(encoder, decoder, PAD_IDX, device).to(device)

# 4. Khởi tạo loss function và optimizer
criterion = CustomCrossEntropyLoss(ignore_idxs=[PAD_IDX])  # Chỉ ignore PAD tokens
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# 5. Hàm train và evaluate cải tiến
def train(model, dataloader, optimizer, criterion, device, epoch):
    model.train()
    epoch_loss = 0
    total_tokens = 0
    unk_count = 0
    
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch:02d} [Train]", leave=False)
    for batch in progress_bar:
        src, src_len = batch['article'].to(device), batch['article_len'].to(device)
        trg, trg_len = batch['summary'].to(device), batch['summary_len'].to(device)
        
        optimizer.zero_grad()
        output = model(src, src_len, trg, TEACHER_FORCING_RATIO)
        
        # Tính toán loss
        output = output[:, 1:].reshape(-1, output.shape[-1])
        trg = trg[:, 1:].reshape(-1)
        loss = criterion(output, trg)
        
        # Theo dõi UNK tokens
        preds = output.argmax(1)
        unk_count += (preds == UNK_IDX).sum().item()
        total_tokens += preds.size(0)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        optimizer.step()
        
        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item(), unk_ratio=f"{unk_count/total_tokens:.2%}")
    
    return epoch_loss / len(dataloader), unk_count / total_tokens

def evaluate(model, dataloader, criterion, device, epoch):
    model.eval()
    epoch_loss = 0
    unk_count = 0
    total_tokens = 0
    
    with torch.no_grad():
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch:02d} [Eval]", leave=False)
        for batch in progress_bar:
            src, src_len = batch['article'].to(device), batch['article_len'].to(device)
            trg, trg_len = batch['summary'].to(device), batch['summary_len'].to(device)
            
            output = model(src, src_len, trg, 0)  # Không dùng teacher forcing khi eval
            
            output = output[:, 1:].reshape(-1, output.shape[-1])
            trg = trg[:, 1:].reshape(-1)
            loss = criterion(output, trg)
            
            preds = output.argmax(1)
            unk_count += (preds == UNK_IDX).sum().item()
            total_tokens += preds.size(0)
            
            epoch_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item(), unk_ratio=f"{unk_count/total_tokens:.2%}")
    
    return epoch_loss / len(dataloader), unk_count / total_tokens

# 6. Vòng lặp training
best_val_loss = float('inf')
for epoch in range(NUM_EPOCHS):
    train_loss, train_unk = train(model, train_loader, optimizer, criterion, device, epoch)
    val_loss, val_unk = evaluate(model, validation_loader, criterion, device, epoch)
    
    # Lưu model tốt nhất
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': val_loss,
        }, 'best_model.pt')
    
    print(f"Epoch {epoch:02d} | Train Loss: {train_loss:.4f} (UNK: {train_unk:.2%}) | Val Loss: {val_loss:.4f} (UNK: {val_unk:.2%})")

    # Early stopping nếu loss không giảm
    if val_loss > best_val_loss * 1.5:  # Cho phép tăng 50%
        print("Validation loss increased significantly. Stopping training.")
        break

# 7. Test model
test_loss, test_unk = evaluate(model, test_loader, criterion, device, "Test")
print(f"Final Test Loss: {test_loss:.4f} | UNK Ratio: {test_unk:.2%}")

Epoch 00 [Train]:   0%|          | 0/39 [00:00<?, ?it/s]

Input src min/max: 0/65195
Input trg min/max: 0/65195


RuntimeError: Expected hidden size (1, 1, 128), got [1, 4, 128]

In [17]:
# # === INFERENCE FUNCTION ===
# def generate_summary(model, text, stoi, itos, max_len=MAX_LENGTH_SUMMARY):
#     model.eval()
#     tokens = tokenize(text)
#     tokens = ['<SOS>'] + tokens + ['<EOS>']
#     token_ids = [stoi(token) for token in tokens]
#     token_tensor = torch.LongTensor(token_ids).unsqueeze(0).to(device)  # [1, seq_len]
#     token_len = torch.LongTensor([len(token_ids)]).to(device)

#     with torch.no_grad():
#         encoder_outputs, hidden = model.encoder(token_tensor, token_len)
    
#     mask = (token_tensor != stoi('<PAD>')).to(device)

#     generated_ids = [stoi('<SOS>')]
#     attentions = []

#     for _ in range(max_len):
#         last_token = torch.LongTensor([generated_ids[-1]]).unsqueeze(0).to(device)  # [1, 1]

#         with torch.no_grad():
#             output, hidden, attention = model.decoder(last_token, hidden, encoder_outputs, mask)
        
#         pred_id = output.argmax(dim=1).item()
#         generated_ids.append(pred_id)
#         attentions.append(attention.squeeze(0).cpu())

#         if pred_id == stoi('<EOS>'):
#             break

#     summary_tokens = [itos(idx) for idx in generated_ids[1:-1]]  # exclude <SOS> and <EOS>
#     return ' '.join(summary_tokens), attentions


In [18]:
checkpoint = torch.load(f"{model_dir}/best_model.pth", map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)
model.eval()
print("stoi tokens:", [stoi(tok) for tok in tokenize("The quick brown fox")])
test_article = "The quick brown fox jumps over the lazy dog near the river bank in the forest."
summary, attention_weights = generate_summary(
    model=model,
    text=test_article,
    stoi=stoi,
    itos=itos,
    max_len=MAX_LENGTH_SUMMARY
)

print("📰 Article:")
print(test_article)
print("\n📝 Summary:")
print(summary)


  checkpoint = torch.load(f"{model_dir}/best_model.pth", map_location=device)


stoi tokens: [4, 2586, 1046, 2110]


NameError: name 'generate_summary' is not defined