In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchvision import transforms
import torch.optim as optim
import random 
from tqdm import tqdm
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F
from tqdm.auto import tqdm
from datetime import datetime
import wandb
import time
import os
import re
import math
from nltk.tokenize import word_tokenize
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')


[nltk_data] Downloading package punkt to /home/vuda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
NUM_EPOCHS = 30
BATCH_SIZE = 4
FRAC_SAMPLE = 0.2
MAX_LENGTH_ARTICLE = 512
MIN_LENGTH_ARTICLE = 50
MAX_LENGTH_SUMMARY = 128
MIN_LENGTH_SUMMARY = 20
HIDDEN_DIM = 128
LEARNING_RATE = 0.001
NUM_CYCLES = 3
MAX_PLATEAU_COUNT = 5
WEIGHT_DECAY = 1e-4
CLIP = 1
USE_PRETRAINED_EMB = True
USE_SCHEDULER = True
SCHEDULER_TYPE = "plateau"  # hoặc cosine, linear
TEACHER_FORCING_RATIO = 0.75
NUM_CYCLES = 3
MAX_PLATEAU_COUNT = 5


model_dir = "../Model"
datafilter = "../dataft"
os.makedirs(datafilter, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
train_data = pd.read_csv("../dataset/train.csv")
validation_data = pd.read_csv("../dataset/validation.csv")
test_data = pd.read_csv("../dataset/test.csv")

# add col
train_data.rename(columns={"highlights": "summaries", "article":"articles"}, inplace=True)
validation_data.rename(columns={"highlights": "summaries","article":"articles"}, inplace=True)
test_data.rename(columns={"highlights": "summaries", "article":"articles"}, inplace=True)

train_data["article_word_count"] = train_data["articles"].astype(str).apply(lambda x: len(x.split()))
train_data["summary_word_count"] = train_data["summaries"].astype(str).apply(lambda x: len(x.split()))

validation_data["article_word_count"] = validation_data["articles"].astype(str).apply(lambda x: len(x.split()))
validation_data["summary_word_count"] = validation_data["summaries"].astype(str).apply(lambda x: len(x.split()))

test_data["article_word_count"] = test_data["articles"].astype(str).apply(lambda x: len(x.split()))
test_data["summary_word_count"] = test_data["summaries"].astype(str).apply(lambda x: len(x.split()))

# filter range
train_data = train_data[
    (train_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
    (train_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
    (train_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
    (train_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
]

validation_data = validation_data[
    (validation_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
    (validation_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
    (validation_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
    (validation_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
]
test_data = test_data[
    (test_data["article_word_count"] <= MAX_LENGTH_ARTICLE) & 
    (test_data["article_word_count"] >= MIN_LENGTH_ARTICLE) &
    (test_data["summary_word_count"] <= MAX_LENGTH_SUMMARY) &
    (test_data["summary_word_count"] >= MIN_LENGTH_SUMMARY)
]

train_sample = train_data.sample(frac=FRAC_SAMPLE, random_state=1)
validation_sample = validation_data.sample(frac=FRAC_SAMPLE, random_state=1)
test_sample = test_data.sample(frac=1, random_state=1)
train_sample.info()
print("\n")
validation_sample.info()
train_sample.to_csv(os.path.join(datafilter,"train_sample.csv"), index=False)
test_sample.to_csv(os.path.join(datafilter,"test_sample.csv"), index=False)
validation_sample.to_csv(os.path.join(datafilter,"validation_sample.csv"), index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 19198 entries, 144417 to 201560
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  19198 non-null  object
 1   articles            19198 non-null  object
 2   summaries           19198 non-null  object
 3   article_word_count  19198 non-null  int64 
 4   summary_word_count  19198 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 899.9+ KB


<class 'pandas.core.frame.DataFrame'>
Index: 994 entries, 8901 to 8365
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  994 non-null    object
 1   articles            994 non-null    object
 2   summaries           994 non-null    object
 3   article_word_count  994 non-null    int64 
 4   summary_word_count  994 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 46.6+ KB


In [4]:
# print(tokenize("A dog. in a 'tree with 5.3% rate drop"))

In [5]:
train_sample = pd.read_csv("../dataft/train_sample.csv")
validation_sample = pd.read_csv("../dataft/validation_sample.csv")
test_sample = pd.read_csv("../dataft/test_sample.csv")
train_sample.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19198 entries, 0 to 19197
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  19198 non-null  object
 1   articles            19198 non-null  object
 2   summaries           19198 non-null  object
 3   article_word_count  19198 non-null  int64 
 4   summary_word_count  19198 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 750.1+ KB


In [6]:
TOP_K = 400500
EMBEDDING_FILE = "../Embedding/glove.6B.50d.txt"
# EMBEDDING_FILE = "../Embedding/glove-wiki-gigaword-100.txt"

vocab, embeddings = [], []

with open(EMBEDDING_FILE, 'rt', encoding='utf-8') as ef:
    for i, line in enumerate(ef):
        if i >= TOP_K:
            break
        split_line = line.strip().split(' ')
        i_word = split_line[0]
        i_embeddings = [float(val) for val in split_line[1:]]
        i_embeddings.extend([0.0, 0.0, 0.0, 0.0])  # để dành cho token đặc biệt
        vocab.append(i_word)
        embeddings.append(i_embeddings)


embs_npa = np.array(embeddings)

unk_embedding = np.mean(embs_npa, axis=0).tolist()

dim = embs_npa.shape[1]
sos_embedding = [0.0] * dim
sos_embedding[-3] = 1.0
eos_embedding = [0.0] * dim
eos_embedding[-2] = 1.0
pad_embedding = [0.0] * dim
pad_embedding[-4] = 1.0
# unk_embedding = [0.0] * dim
# unk_embedding[-1] = 1.0

# Update vocab and embeddings
vocab = ["<PAD>", "<SOS>", "<EOS>", "<UNK>"] + vocab
embeddings = [pad_embedding, sos_embedding,
              eos_embedding, unk_embedding] + embeddings

vocab_npa = np.array(vocab)
embs_npa = np.array(embeddings)


def tokenize(text):
    return word_tokenize(text)
    return (text.split())

def preclean_text(text):
    text = re.sub(r"\s'([a-zA-Z])", r" '\1", text)

    return word_tokenize(text)
    # return text


stoi_dict = {word: idx for idx, word in enumerate(vocab_npa)}
_unk_idx = stoi_dict["<UNK>"]
itos_dict = {idx: word for idx, word in enumerate(vocab_npa)}

def stoi(string, stoi_dict=stoi_dict):
    return stoi_dict.get(string, _unk_idx)

def itos(idx, itos_dict=itos_dict):
    return itos_dict.get(idx)

def revert_to_text(lst):
    if hasattr(lst, 'tolist'):  # works for both torch.Tensor and np.ndarray
        lst = lst.tolist()
    return [str(itos(int(token))) for token in lst] 


def numericalize(text):
    tokenized_text = tokenize(text)
    return [
        stoi(token)
        for token in tokenized_text
    ]

print(embs_npa.shape[0])
embedding_layer = torch.nn.Embedding.from_pretrained(torch.FloatTensor(embeddings),
                                                     freeze=False,
                                                     padding_idx=stoi("<PAD>"))
embedding_layer.to(device)



400004


Embedding(400004, 54, padding_idx=0)

In [7]:
vocab_size = len(vocab_npa)
print("Embedding shape:", np.array(embeddings).shape) 
print("<PAD> embedding last 4 dims:", embeddings[stoi("<PAD>")][-4:])
print("<SOS> embedding last 4 dims:", embeddings[stoi("<SOS>")][-4:])
print("Word 'the' embedding last 4 dims:", embeddings[stoi("5.3%")])
print(revert_to_text(torch.tensor([0, 1, 2, 3])))

Embedding shape: (400004, 54)
<PAD> embedding last 4 dims: [1.0, 0.0, 0.0, 0.0]
<SOS> embedding last 4 dims: [0.0, 1.0, 0.0, 0.0]
Word 'the' embedding last 4 dims: [-0.12920060864176852, -0.28866239452097603, -0.012248941299726332, -0.056766888363689434, -0.202111085482792, -0.08389026443356357, 0.3335973726965789, 0.1604514588494143, 0.03867495354970917, 0.17833092082704793, 0.0469662038533105, -0.0028577895152307304, 0.29099850796744287, 0.046137231761455566, -0.20923841559858444, -0.06613100298669164, -0.06822448421043388, 0.07665884568148376, 0.31339918388268906, 0.17848512473276362, -0.12257719082558292, -0.09916927562478682, -0.07495972834085389, 0.06413205706058327, 0.1444125551281154, 0.6089459982604638, 0.17463101054296204, 0.05335403311654184, -0.012738255533159106, 0.034741076886942744, -0.8123956655755472, -0.04688727359339901, 0.2019353311723676, 0.20311115159355098, -0.03935654449686459, 0.06967517803561558, -0.015536553796198381, -0.034052746766077585, -0.065280247534967

In [8]:
from collections import defaultdict

def analyze_vocab_coverage(sample_data, stoi_dict):
    # Đếm tần suất từ duy nhất
    word_freq = defaultdict(int)

    for text in sample_data['articles'] + sample_data['summaries']:
        tokens = tokenize(text)
        for token in tokens:
            word_freq[token] += 1

    # Phân loại từ vào known / unknown
    known_words = set()
    unknown_words = set()

    for word in word_freq:
        if word in stoi_dict:
            known_words.add(word)
        else:
            unknown_words.add(word)

    total_unique_words = len(known_words) + len(unknown_words)
    coverage = len(known_words) / total_unique_words * 100 if total_unique_words > 0 else 0.0
    print("A word not in dict: ", random.choice(list(unknown_words)))
    return {
        'total_unique_words': total_unique_words,
        'known_unique_words': len(known_words),
        'unknown_unique_words': len(unknown_words),
        'coverage_percentage': coverage,
    }
def print_vocab_stats(name, stats):
    print(f"\n{name} Vocabulary Coverage:")
    print(f"- Unique words: {stats['total_unique_words']}")
    print(f"- Exist in dict: {stats['known_unique_words']}")
    print(f"- Outside the dict: {stats['unknown_unique_words']}")
    print(f"- Coverage rate: {stats['coverage_percentage']:.2f}%")

print_vocab_stats("Train", analyze_vocab_coverage(train_sample, stoi_dict))
print_vocab_stats("Validation", analyze_vocab_coverage(validation_sample, stoi_dict))
print_vocab_stats("Test", analyze_vocab_coverage(test_sample, stoi_dict))


A word not in dict:  people.This

Train Vocabulary Coverage:
- Unique words: 169676
- Exist in dict: 52627
- Outside the dict: 117049
- Coverage rate: 31.02%
A word not in dict:  Edie

Validation Vocabulary Coverage:
- Unique words: 29321
- Exist in dict: 15786
- Outside the dict: 13535
- Coverage rate: 53.84%
A word not in dict:  Rawson.Siem

Test Vocabulary Coverage:
- Unique words: 65420
- Exist in dict: 28522
- Outside the dict: 36898
- Coverage rate: 43.60%
