In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchvision import transforms


In [20]:
# Load the three CSV splits and rename the raw text columns to the plural
# names ("articles", "summaries") that the later cells index by.
_COLUMN_RENAMES = {"highlights": "summaries", "article": "articles"}

train_data = pd.read_csv("../dataset/train.csv").rename(columns=_COLUMN_RENAMES)
validation_data = pd.read_csv("../dataset/validation.csv").rename(columns=_COLUMN_RENAMES)
test_data = pd.read_csv("../dataset/test.csv").rename(columns=_COLUMN_RENAMES)


In [22]:
def _count_words(texts):
    """Return, per entry of a text column, its number of whitespace-separated words."""
    return texts.astype(str).apply(lambda entry: len(entry.split()))


# Attach word-count columns to every split (articles first, then summaries).
for _frame in (train_data, validation_data, test_data):
    _frame["article_word_count"] = _count_words(_frame["articles"])
    _frame["summary_word_count"] = _count_words(_frame["summaries"])

# Draw a reproducible 1% sample of each split (fixed random_state).
train_sample, validation_sample, test_sample = (
    split.sample(frac=0.01, random_state=1)
    for split in (train_data, validation_data, test_data)
)


In [24]:
# Longest article and longest summary, measured in words, within the training sample.
max_len_article, max_len_summary = (
    train_sample[column].max()
    for column in ("article_word_count", "summary_word_count")
)
print(max_len_article)
print(max_len_summary)

1899
704


In [9]:
EMBEDDING_FILE = "../Embedding/glove-wiki-gigaword-100.txt"

# Read the embedding text file: one entry per line, "<word> <v1> <v2> ...".
with open(EMBEDDING_FILE, 'rt', encoding='utf-8') as ef:
    full_content = ef.read().strip().split('\n')

vocab, embeddings = [], []
for line in full_content:
    word, *values = line.split(' ')
    # Four extra zero slots are appended to every vector; the special tokens
    # below use three of them as one-hot markers.
    embeddings.append([float(val) for val in values] + [0.0, 0.0, 0.0, 0.0])
    vocab.append(word)

embs_npa = np.array(embeddings)

# <UNK> is represented by the mean of all loaded vectors (not by a one-hot
# marker in the last extra slot).
unk_embedding = embs_npa.mean(axis=0).tolist()

dim = embs_npa.shape[1]


def _one_hot(size, position):
    """Return a list of `size` zeros with 1.0 at `position`."""
    vector = [0.0] * size
    vector[position] = 1.0
    return vector


pad_embedding = _one_hot(dim, -4)
sos_embedding = _one_hot(dim, -3)
eos_embedding = _one_hot(dim, -2)

# Prepend the special tokens so they occupy indices 0..3 of the vocabulary.
vocab = ["<PAD>", "<SOS>", "<EOS>", "<UNK>"] + vocab
embeddings = [pad_embedding, sos_embedding, eos_embedding, unk_embedding] + embeddings

vocab_npa = np.array(vocab)
embs_npa = np.array(embeddings)


def tokenize(text):
    """Lower-case `text`, trim surrounding whitespace and split it on whitespace."""
    normalized = text.lower().strip()
    return normalized.split()


# Token -> row-index lookup over the final vocabulary (special tokens included).
stoi_dict = {word: idx for idx, word in enumerate(vocab_npa)}
_unk_idx = stoi_dict["<UNK>"]


def stoi(string, stoi_dict=stoi_dict):
    """Map a token to its vocabulary index.

    Args:
        string: Token to look up.
        stoi_dict: Token-to-index mapping. Defaults to the notebook-wide
            ``stoi_dict`` (bound when this cell runs) so that one-argument
            calls such as ``stoi("<SOS>")`` work; two-argument calls behave
            exactly as before.

    Returns:
        The index stored for ``string``, or the notebook-wide ``"<UNK>"``
        index when the token is not in the mapping.
    """
    return stoi_dict.get(string, _unk_idx)


def numericalize(text):
    """Convert raw text into a list of vocabulary indices.

    The text is split with ``tokenize`` and each token is looked up with
    ``stoi``. The mapping is passed explicitly because ``stoi`` takes it as
    its second argument; calling ``stoi(token)`` alone raised ``TypeError``.

    Args:
        text: String to convert.

    Returns:
        A list with one integer index per token, in order.
    """
    return [stoi(token, stoi_dict) for token in tokenize(text)]

# Number of rows in the embedding matrix, i.e. the vocabulary size including
# the four special tokens prepended above.
print(embs_npa.shape[0])

25004


In [None]:
# torchvision ToTensor transform instance.
# NOTE(review): no visible cell applies it — Seq2SeqDataset accepts a
# `transform` argument but does not apply it to any item; confirm it is needed.
transform = transforms.ToTensor()

class Seq2SeqDataset(Dataset):
    """Next-token prediction dataset built from (article, summary) text pairs.

    Every summary is expanded into one sample per predictable position: the
    sample holds the padded article, the summary prefix seen so far (padded to
    ``max_len_summary``) and the index of the token that follows the prefix.

    Args:
        articles_id: Identifiers of the articles; stored as-is.
        articles: Iterable of article strings.
        summaries: Iterable of summary strings, aligned with ``articles``.
        max_len_article: Fixed length articles are truncated/padded to.
        max_len_summary: Fixed length summary prefixes are truncated/padded to.
        transform: Stored on the instance but not applied to any item.
    """

    def __init__(self, articles_id, articles, summaries, max_len_article, max_len_summary, transform=None):
        super().__init__()
        self.articles_id = articles_id
        self.articles = articles
        self.summaries = summaries
        self.max_len_article = max_len_article
        self.max_len_summary = max_len_summary
        self.transform = transform

        self.articles_numericalized = []
        self.summaries_numericalized = []
        # Was never initialised, so the first append raised AttributeError.
        self.next_tokens = []

        # Look the special tokens up once; the mapping is passed explicitly
        # because `stoi` takes it as its second argument.
        pad_idx = stoi("<PAD>", stoi_dict)
        sos_idx = stoi("<SOS>", stoi_dict)
        eos_idx = stoi("<EOS>", stoi_dict)

        for article, summary in zip(articles, summaries):
            # <SOS> tokens <EOS>, truncated then right-padded to the fixed length.
            article_ids = ([sos_idx] + numericalize(article) + [eos_idx])[:max_len_article]
            article_ids = article_ids + [pad_idx] * (max_len_article - len(article_ids))

            # Keep the summary unpadded here so samples are generated only for
            # real tokens; iterating over the padded sequence produced
            # PAD -> PAD training pairs for every padding position.
            summary_ids = ([sos_idx] + numericalize(summary) + [eos_idx])[:max_len_summary]

            # Build the next-token samples: prefix summary_ids[:idx + 1] -> summary_ids[idx + 1].
            for idx in range(len(summary_ids) - 1):
                prefix = summary_ids[:idx + 1]
                self.articles_numericalized.append(article_ids)  # article is shared unchanged
                self.summaries_numericalized.append(
                    prefix + [pad_idx] * (max_len_summary - len(prefix))  # pad to exactly max_len_summary
                )
                self.next_tokens.append(summary_ids[idx + 1])  # token to predict next

    def __len__(self):
        """Return the number of (article, prefix, next-token) samples."""
        return len(self.summaries_numericalized)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as tensors: (article, summary prefix, next token)."""
        article = torch.tensor(self.articles_numericalized[idx])
        summary = torch.tensor(self.summaries_numericalized[idx])
        next_token = torch.tensor(self.next_tokens[idx])  # token to predict next
        return article, summary, next_token

def seq2seq_collate(batch):
    """Collate dataset samples into batched tensors.

    Args:
        batch: Sequence of samples whose first three entries are the article
            tensor, the summary tensor and the next-token value.

    Returns:
        Tuple ``(articles, summaries, next_tokens)``: the first two are
        stacked along a new leading dimension, the third is a 1-D tensor.
    """
    article_rows, summary_rows, targets = [], [], []
    for sample in batch:
        article_rows.append(sample[0])
        summary_rows.append(sample[1])
        targets.append(sample[2])

    return torch.stack(article_rows), torch.stack(summary_rows), torch.tensor(targets)

In [None]:
class LSTM(nn.Module):
    """LSTM language model with self-attention over its own past outputs.

    Args:
        num_emb: Vocabulary size (embedding rows and output logits).
        num_layers: Number of stacked LSTM layers.
        emb_size: Embedding dimension fed to the LSTM.
        hidden_size: LSTM hidden size; also the attention ``embed_dim``, so it
            must be divisible by the 8 attention heads.
    """

    # The constructor was spelled `init` (dunder underscores lost), so
    # nn.Module was never initialised and no submodule was registered.
    def __init__(self, num_emb, num_layers=1, emb_size=128, hidden_size=128):
        super().__init__()
        self.embedding = nn.Embedding(num_emb, emb_size)

        self.mlp_emb = nn.Sequential(nn.Linear(emb_size, emb_size),
                                     nn.LayerNorm(emb_size),
                                     nn.ELU(), nn.Linear(emb_size, emb_size))

        self.lstm = nn.LSTM(input_size=emb_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)

        self.attention = nn.MultiheadAttention(embed_dim=hidden_size,
                                               num_heads=8, batch_first=True,
                                               dropout=0.1)

        self.mlp_out = nn.Sequential(nn.Linear(hidden_size, hidden_size//2),
                                     nn.LayerNorm(hidden_size//2),
                                     nn.ELU(),
                                     nn.Dropout(0.5),
                                     nn.Linear(hidden_size//2, num_emb))

    def forward(self, input_token, hidden_seq, hidden_in, mem_in):
        """Run one decoding step (or chunk) and attend over all outputs so far.

        Args:
            input_token: Integer token indices, batch-first.
            hidden_seq: List of LSTM outputs from earlier calls; this call's
                output is appended to it in place.
            hidden_in: Initial LSTM hidden state.
            mem_in: Initial LSTM cell state.

        Returns:
            Tuple ``(logits, hidden_seq, hidden_out, mem_out)``.
        """
        input_embs = self.embedding(input_token)
        input_embs = self.mlp_emb(input_embs)

        # Pass through the LSTM
        output, (hidden_out, mem_out) = self.lstm(input_embs, (hidden_in, mem_in))
        # Log the output of the final LSTM layer
        hidden_seq += [output]
        hidden_cat = torch.cat(hidden_seq, 1)

        # Attention over the LSTM outputs of all steps so far: the query is the
        # current output, keys and values are the concatenated history.
        attn_output, attn_output_weights = self.attention(output, hidden_cat, hidden_cat)  # Q, K, V
        # Residual connection around the attention block
        attn_output = attn_output + output

        return self.mlp_out(attn_output), hidden_seq, hidden_out, mem_out

In [None]:
from keras import backend as K
from keras.layers import Concatenate, Dense, Embedding, Input, TimeDistributed
# Aliased on purpose: this notebook also defines a PyTorch module named `LSTM`,
# and the bare name would resolve to that class instead of the Keras layer.
from keras.layers import LSTM as KerasLSTM
from keras.models import Model

K.clear_session()

latent_dim = 300
embedding_dim = 100

# NOTE(review): `max_text_len`, `x_voc`, `y_voc` and `AttentionLayer` are not
# defined in any visible cell; they must be provided before this cell can run.

# Encoder
encoder_inputs = Input(shape=(max_text_len,))

# Embedding layer
enc_emb = Embedding(x_voc, embedding_dim, trainable=True)(encoder_inputs)

# Encoder LSTM 1
encoder_lstm1 = KerasLSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

# Encoder LSTM 2
encoder_lstm2 = KerasLSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

# Encoder LSTM 3 — its final states seed the decoder
encoder_lstm3 = KerasLSTM(latent_dim, return_state=True, return_sequences=True, dropout=0.4, recurrent_dropout=0.4)
encoder_outputs, state_h, state_c = encoder_lstm3(encoder_output2)

# Set up the decoder, using the last encoder layer's states as initial state.
decoder_inputs = Input(shape=(None,))

# Embedding layer
dec_emb_layer = Embedding(y_voc, embedding_dim, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = KerasLSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.2)
# With return_state=True the layer returns (outputs, hidden state, cell state);
# the two state names below are kept as in the original cell.
decoder_outputs, decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# Attention layer
attn_layer = AttentionLayer(name='attention_layer')
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs])

# Concatenate attention output and decoder LSTM output
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])

# Dense softmax layer applied at every decoder time step
decoder_dense = TimeDistributed(Dense(y_voc, activation='softmax'))
decoder_outputs = decoder_dense(decoder_concat_input)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()

In [None]:
# Inference-time models. This cell reuses `encoder_inputs`, `encoder_outputs`,
# `state_h`, `state_c`, `decoder_inputs`, `dec_emb_layer`, `decoder_lstm`,
# `attn_layer` and `decoder_dense` from the training-graph cell, and relies on
# `Input`, `Model` and `Concatenate` being in scope.

# Encoder model: maps the input sequence to the encoder outputs and final states
encoder_model = Model(inputs=encoder_inputs,outputs=[encoder_outputs, state_h, state_c])

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
# Placeholder for the encoder outputs that the attention layer attends over
decoder_hidden_state_input = Input(shape=(max_text_len,latent_dim))

# Get the embeddings of the decoder sequence
dec_emb2= dec_emb_layer(decoder_inputs) 
# To predict the next word in the sequence, set the initial states to the states from the previous time step
# NOTE: `state_h2` / `state_c2` rebind the names previously assigned from the second encoder LSTM.
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])

# Attention at inference time
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_inf_concat = Concatenate(axis=-1, name='concat')([decoder_outputs2, attn_out_inf])

# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_inf_concat) 

# Final decoder model: inputs are (token sequence, encoder outputs, h, c);
# outputs are (token distribution, new h, new c)
decoder_model = Model(
    [decoder_inputs] + [decoder_hidden_state_input,decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2] + [state_h2, state_c2])


In [None]:
def decode_sequence(input_seq):
    """Greedily decode a summary string for one encoded input sequence.

    The encoder is run once; the decoder is then called one token at a time,
    starting from 'sostok', always taking the arg-max token, until 'eostok'
    is produced or the sentence holds ``max_summary_len - 1`` words.

    Args:
        input_seq: Input passed unchanged to ``encoder_model.predict``.

    Returns:
        The decoded words joined by single spaces, each preceded by a space.
    """
    enc_out, state_h, state_c = encoder_model.predict(input_seq)

    # Length-1 decoder input holding the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index['sostok']

    decoded_sentence = ''
    while True:
        # The returned states feed the next step directly.
        output_tokens, state_h, state_c = decoder_model.predict(
            [target_seq, enc_out, state_h, state_c])

        # Greedy choice over the last time step's distribution.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = reverse_target_word_index[sampled_token_index]

        reached_end = sampled_token == 'eostok'
        if not reached_end:
            decoded_sentence += ' ' + sampled_token

        # Stop on the end token or once the length cap is reached.
        if reached_end or len(decoded_sentence.split()) >= (max_summary_len - 1):
            break

        # Next decoder input is the token just sampled.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

    return decoded_sentence