In [1]:
import pandas as pd

# Load the article/summary pairs. The CSV must provide a "document" column
# and a "summary" column, both of which are read below.
data = pd.read_csv("xsum_sample.csv")

# Plain Python lists of the two text columns, in row order.
input_texts = data["document"].tolist()
target_summaries = data["summary"].tolist()


In [2]:
import nltk
from collections import Counter


In [3]:
class Tokenizer:
    """Word-level tokenizer with a frequency-capped vocabulary.

    Ids 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``. The remaining ids are handed out to the most
    frequent lower-cased NLTK tokens found in ``texts``, most frequent first,
    until ``vocab_size`` entries exist (or the corpus runs out of words).
    """

    def __init__(self, texts, vocab_size=5000):
        """Build the vocabulary from ``texts`` (an iterable of strings)."""
        self.vocab_size = vocab_size
        specials = ("<PAD>", "<SOS>", "<EOS>", "<UNK>")
        self.word2idx = {token: position for position, token in enumerate(specials)}
        self.idx2word = {position: token for position, token in enumerate(specials)}
        self.build_vocab(texts)

    def tokenize(self, text):
        """Lower-case ``text`` and split it into tokens with NLTK."""
        lowered = text.lower()
        return nltk.word_tokenize(lowered)

    def build_vocab(self, texts):
        """Count tokens over ``texts`` and register the most frequent ones."""
        counts = Counter()
        for document in texts:
            counts.update(self.tokenize(document))

        # New ids continue right after the entries that are already registered.
        next_id = len(self.word2idx)
        remaining_slots = self.vocab_size - next_id
        for word, _count in counts.most_common(remaining_slots):
            self.word2idx[word] = next_id
            self.idx2word[next_id] = word
            next_id += 1

    def encode(self, text):
        """Return the list of token ids for ``text``; unknown words map to ``<UNK>``."""
        unk_id = self.word2idx["<UNK>"]
        ids = []
        for token in self.tokenize(text):
            ids.append(self.word2idx.get(token, unk_id))
        return ids

    def decode(self, indices):
        """Join the words for ``indices`` with spaces; unknown ids become ``<UNK>``."""
        words = (self.idx2word.get(index, "<UNK>") for index in indices)
        return " ".join(words)


In [4]:
# Fit one shared vocabulary on documents and summaries together, reusing the
# text lists extracted from `data` when it was loaded.
vocab_corpus = input_texts + target_summaries
tokenizer = Tokenizer(vocab_corpus, vocab_size=5000)


In [5]:
import torch

def preprocess_pair(doc, summary, tokenizer, max_len=100):
    """Turn one (document, summary) pair into fixed-length id tensors.

    Args:
        doc: Source document text.
        summary: Reference summary text.
        tokenizer: Object exposing ``encode(text) -> list[int]`` and a
            ``word2idx`` mapping containing ``<PAD>``, ``<SOS>`` and ``<EOS>``.
        max_len: Length of each returned tensor.

    Returns:
        A tuple ``(encoder_input, decoder_input, decoder_target)`` of 1-D
        ``torch.long`` tensors, each of length ``max_len``:

        * ``encoder_input``  - document ids, truncated / right-padded.
        * ``decoder_input``  - ``<SOS>`` followed by the summary ids.
        * ``decoder_target`` - the summary ids followed by ``<EOS>``.
    """
    pad_id = tokenizer.word2idx["<PAD>"]
    sos_id = tokenizer.word2idx["<SOS>"]
    eos_id = tokenizer.word2idx["<EOS>"]

    def _fit(ids):
        # Clip to max_len, then right-pad with <PAD> up to max_len.
        clipped = ids[:max_len]
        return clipped + [pad_id] * (max_len - len(clipped))

    doc_ids = tokenizer.encode(doc)

    # Reserve one position for the <SOS>/<EOS> marker *before* adding it.
    # Truncating after appending <EOS> would silently drop the end marker for
    # every summary with max_len or more tokens, so the decoder would never be
    # taught to stop on long examples.
    sum_ids = tokenizer.encode(summary)[:max(max_len - 1, 0)]
    decoder_input = [sos_id] + sum_ids
    decoder_target = sum_ids + [eos_id]

    # An explicit dtype keeps zero-length results integer-typed as well.
    return (
        torch.tensor(_fit(doc_ids), dtype=torch.long),
        torch.tensor(_fit(decoder_input), dtype=torch.long),
        torch.tensor(_fit(decoder_target), dtype=torch.long),
    )


In [6]:
# Sanity check: run the first document/summary pair through preprocessing.
doc = data["document"][0]
summary = data["summary"][0]

enc_in, dec_in, dec_out = preprocess_pair(doc, summary, tokenizer)

# Show only the first 15 ids of each tensor. The decoder target should read as
# the decoder input shifted left by one position (the input starts with <SOS>).
print("Encoder Input:", enc_in[:15])
print("Decoder Input:", dec_in[:15])
print("Decoder Target:", dec_out[:15])


Encoder Input: tensor([   4,  518,  735,    8, 1268,   11,    3, 1489,    6,   59,    8,    4,
         633, 1406,  721])
Decoder Input: tensor([   1,    3, 1185,   34, 1500,  267,    4,  239, 1881,   10, 2127,   10,
        2912,   50, 1960])
Decoder Target: tensor([   3, 1185,   34, 1500,  267,    4,  239, 1881,   10, 2127,   10, 2912,
          50, 1960,  783])


In [7]:
import torch.nn as nn

class Encoder(nn.Module):
    """Embedding layer followed by a single batch-first LSTM.

    Reads a batch of token-id sequences and exposes both the per-step LSTM
    outputs and the final recurrent state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)

    def forward(self, input_seq):
        """Encode ``input_seq`` (integer ids, batch-first).

        Returns:
            ``(outputs, hidden, cell)`` exactly as produced by the LSTM:
            the per-step outputs plus the final hidden and cell states.
        """
        token_vectors = self.embedding(input_seq)
        outputs, final_state = self.lstm(token_vectors)
        hidden, cell = final_state
        return outputs, hidden, cell


In [8]:
class Decoder(nn.Module):
    """Embedding + batch-first LSTM + linear projection onto the vocabulary.

    The LSTM is started from a caller-supplied ``(hidden, cell)`` state, so the
    same module serves both teacher-forced training (whole sequence at once)
    and step-by-step greedy decoding (one token at a time).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, input_seq, hidden, cell):
        """Run the decoder over ``input_seq`` starting from ``(hidden, cell)``.

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` holds one row
            of vocabulary logits per input position and ``hidden`` / ``cell``
            are the updated LSTM states.
        """
        token_vectors = self.embedding(input_seq)
        lstm_out, next_state = self.lstm(token_vectors, (hidden, cell))
        hidden, cell = next_state
        return self.fc(lstm_out), hidden, cell


In [9]:
import torch
import torch.nn as nn
import torch.optim as optim

# Hyperparameters
SEED = 42
embedding_dim = 128
hidden_dim = 256
# Use the size of the vocabulary that was actually built: it can be smaller
# than the requested cap when the corpus has fewer distinct tokens.
vocab_size = len(tokenizer.word2idx)
num_epochs = 10
learning_rate = 0.001

# Seed PyTorch's RNG *before* the models are constructed so that weight
# initialisation (the only source of randomness in this notebook's training
# run) is repeatable under "Restart Kernel -> Run All".
torch.manual_seed(SEED)

# Model Initialization
encoder = Encoder(vocab_size, embedding_dim, hidden_dim)
decoder = Decoder(vocab_size, embedding_dim, hidden_dim)

# Loss and Optimizer
# Padding positions are excluded from the loss via ignore_index.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.word2idx["<PAD>"])
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)


In [10]:
import torch
import torch.nn as nn

def train_step(encoder, decoder, data, tokenizer, encoder_optimizer, decoder_optimizer, criterion, device):
    """Train for one pass over ``data`` with teacher forcing, one example per step.

    Args:
        encoder: Module returning ``(outputs, hidden, cell)`` for a batch of ids.
        decoder: Module returning ``(logits, hidden, cell)`` given ids and a state.
        data: Table-like object whose ``"document"`` and ``"summary"`` columns
            are iterables of strings of equal length (e.g. a DataFrame).
        tokenizer: Tokenizer forwarded to ``preprocess_pair``.
        encoder_optimizer: Optimizer over the encoder parameters.
        decoder_optimizer: Optimizer over the decoder parameters.
        criterion: Loss taking ``(logits, targets)``.
        device: Device the example tensors are moved to.

    Returns:
        Mean loss per example over the pass.

    Raises:
        ValueError: If ``data`` contains no examples.
    """
    # Iterate the columns positionally. Looking rows up with data[col][i] is
    # label-based on a pandas Series and breaks (KeyError / wrong row) as soon
    # as the frame has been filtered, shuffled or otherwise re-indexed.
    documents = list(data["document"])
    summaries = list(data["summary"])
    if not documents:
        raise ValueError("train_step received no training examples")

    encoder.train()
    decoder.train()

    total_loss = 0.0

    for doc, summary in zip(documents, summaries):
        # Preprocess inputs and add a batch dimension of 1
        enc_in, dec_in, dec_out = preprocess_pair(doc, summary, tokenizer)
        enc_in = enc_in.unsqueeze(0).to(device)       # shape: (1, seq_len)
        dec_in = dec_in.unsqueeze(0).to(device)
        dec_out = dec_out.unsqueeze(0).to(device)

        # Reset gradients
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        # The encoder's final state seeds the decoder
        _, hidden, cell = encoder(enc_in)

        # Teacher forcing: the decoder sees the whole shifted summary at once
        output, _, _ = decoder(dec_in, hidden, cell)

        # Flatten so every time step is one classification example
        output = output.view(-1, output.shape[2])     # (batch * seq_len, vocab_size)
        target = dec_out.view(-1)                     # (batch * seq_len)

        loss = criterion(output, target)

        # Backpropagation
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(documents)


In [11]:
def generate_summary(encoder, decoder, input_tensor, tokenizer, max_len=30):
    """Greedy-decode a summary for one already-encoded document.

    Args:
        encoder: Module returning ``(outputs, hidden, cell)``.
        decoder: Module returning ``(logits, hidden, cell)``.
        input_tensor: 1-D tensor of document token ids (no batch dimension).
        tokenizer: Object exposing ``word2idx`` and ``idx2word`` mappings.
        max_len: Maximum number of tokens to generate.

    Returns:
        The generated words joined by single spaces. Generation stops early
        when ``<EOS>`` is predicted; ``<EOS>`` itself is not included.
    """
    encoder.eval()
    decoder.eval()

    # Work on the device the model lives on. Tensors built here would otherwise
    # default to the CPU and fail with a device mismatch once the model has
    # been moved to the GPU.
    first_param = next(encoder.parameters(), None)
    device = first_param.device if first_param is not None else input_tensor.device

    sos_id = tokenizer.word2idx["<SOS>"]
    eos_id = tokenizer.word2idx["<EOS>"]

    output_words = []

    with torch.no_grad():
        # Add the batch dimension: (seq_len,) -> (1, seq_len)
        _, hidden, cell = encoder(input_tensor.unsqueeze(0).to(device))

        # Start token
        decoder_input = torch.tensor([[sos_id]], device=device)

        for _ in range(max_len):
            predictions, hidden, cell = decoder(decoder_input, hidden, cell)
            predicted_id = predictions.argmax(2)[:, -1].item()  # Take top word

            if predicted_id == eos_id:
                break

            output_words.append(tokenizer.idx2word.get(predicted_id, "<UNK>"))

            # Feed the prediction back in as the next input token
            decoder_input = torch.tensor([[predicted_id]], device=device)

    return " ".join(output_words)


In [12]:
import torch
def summarize(encoder, decoder, doc, tokenizer, device, max_len=50):
    """Greedy-decode a summary for the raw text ``doc``.

    Args:
        encoder: Module returning ``(outputs, hidden, cell)``.
        decoder: Module returning ``(logits, hidden, cell)``.
        doc: Document text to summarise.
        tokenizer: Object exposing ``encode``, ``word2idx`` and ``idx2word``.
        device: Device on which the input tensors are created.
        max_len: Maximum number of tokens to generate.

    Returns:
        The generated words joined by single spaces (``<EOS>`` excluded), or
        an empty string when ``doc`` contains no tokens.

    Note:
        The whole un-padded token sequence is fed to the encoder here, whereas
        training feeds documents through ``preprocess_pair``, which truncates
        and pads them to a fixed length.
    """
    encoder.eval()
    decoder.eval()

    token_ids = tokenizer.encode(doc)
    if not token_ids:
        # Nothing to summarise. An empty list would also turn into a float
        # tensor, which an embedding layer rejects as indices.
        return ""

    # Shape: (1, seq_len)
    enc_input = torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(0)

    sos_id = tokenizer.word2idx["<SOS>"]
    eos_id = tokenizer.word2idx["<EOS>"]

    decoded_words = []

    with torch.no_grad():
        # The encoder's final state seeds the decoder
        _, hidden, cell = encoder(enc_input)

        # Start decoding with <SOS>; shape: (1, 1)
        dec_input = torch.tensor([[sos_id]], device=device)

        for _ in range(max_len):
            output, hidden, cell = decoder(dec_input, hidden, cell)

            # Get most probable token
            pred_token = output.argmax(2).item()
            if pred_token == eos_id:
                break

            decoded_words.append(tokenizer.idx2word.get(pred_token, "<UNK>"))
            dec_input = torch.tensor([[pred_token]], device=device)

    return " ".join(decoded_words)


In [14]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Move both networks to the selected device before training.
encoder.to(device)
decoder.to(device)

# Training for multiple epochs
for epoch in range(num_epochs):
    # Each call makes one full pass over `data` and returns the mean
    # per-example loss for that pass.
    loss = train_step(encoder, decoder, data, tokenizer,
                      encoder_optimizer, decoder_optimizer,
                      criterion, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss:.4f}")


Epoch 1/10, Loss: 5.7740
Epoch 2/10, Loss: 4.7009
Epoch 3/10, Loss: 3.9460
Epoch 4/10, Loss: 3.2158
Epoch 5/10, Loss: 2.5714
Epoch 6/10, Loss: 2.0198
Epoch 7/10, Loss: 1.5903
Epoch 8/10, Loss: 1.2504
Epoch 9/10, Loss: 0.9624
Epoch 10/10, Loss: 0.7223


In [15]:
# Hand-written paragraph used to try the trained model on new text.
test_paragraph = (
    "The Prime Minister addressed the nation on Tuesday evening, outlining the government’s plans to combat inflation "
    "and stabilize the economy. In his speech, he announced a new fiscal stimulus package worth 10 billion dollars, "
    "aimed at supporting small businesses and low-income families. The package includes tax cuts, subsidies, and increased "
    "social welfare spending. Economists have welcomed the move but cautioned that its success will depend on timely implementation "
    "and transparency."
)

# NOTE: this rebinds `summary`, which earlier held the first reference summary
# taken from `data`.
summary = summarize(encoder, decoder, test_paragraph, tokenizer, device)
print("📝 Generated Summary:", summary)



📝 Generated Summary: <UNK> director oliver stone says he <UNK> indian cinema for `` being able to switch between <UNK> , <UNK> and tragedy at the same time '' .
