In [28]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/news-summary/news_summary_more.csv
/kaggle/input/news-summary/news_summary.csv


In [29]:
import pandas as pd

# Load only the first 1000 rows
df = pd.read_csv("/kaggle/input/news-summary/news_summary.csv", encoding='latin-1', nrows=1500)


In [30]:
df.shape

(1500, 6)

In [31]:
!pip install nltk



In [32]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [33]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import re
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

# Download necessary NLTK data (if not already downloaded)
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

In [34]:
df.rename(columns={'text': 'summary', 'ctext': 'text'}, inplace=True)

In [35]:
df=df.dropna()

In [36]:
# Preprocessing
def clean_text(text):
      text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
      text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
      text = text.lower()
      return text



In [37]:
df['text'] = df['text'].apply(clean_text)
df['summary'] = df['summary'].apply(clean_text)

# Add start and end tokens to the summary
df['summary'] = df['summary'].apply(lambda x: '<start> ' + x + ' <end>')

# Split into training and validation sets
train_df, val_df = train_test_split(df[['text', 'summary']], test_size=0.2, random_state=42)

train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

print(train_df.head())
print(val_df.head())

                                                text  \
0  mumbai customs arrested one saudi arabian airl...   
1  a wax figure of beyonce at madame tussauds in ...   
2  more than  people were evacuated on saturday w...   
3  the karni sena who were in the limelight for t...   
4  on this day  years ago mahatma gandhi had brea...   

                                             summary  
0  <start> the air intelligence unit of the mumba...  
1  <start> singer beyoncs wax statue at madame tu...  
2  <start> more than  people were evacuated on sa...  
3  <start> rajasthan minister vasudev devnani sai...  
4  <start> mahatma gandhi was assassinated on jan...  
                                                text  \
0  gains vhp new delhi feb  pti the vishwa hindu ...   
1  a yearold jain temple at byculla is the latest...   
2  a special cbi court on wednesday asked banks t...   
3  sheikhs from egypts highest islamic authority ...   
4  maharashtra gujarat and uttar pradesh have eme... 

In [38]:
# Tokenize the text and summary
def tokenize(text):
    return word_tokenize(text)  # Uses nltk.word_tokenize

train_df['text_tokens'] = train_df['text'].apply(tokenize)
train_df['summary_tokens'] = train_df['summary'].apply(tokenize)
val_df['text_tokens'] = val_df['text'].apply(tokenize)
val_df['summary_tokens'] = val_df['summary'].apply(tokenize)

# Create vocabulary
def build_vocabulary(tokens_list, max_size=None):
    counts = Counter()
    for tokens in tokens_list:
        counts.update(tokens)

    if max_size is None:
        vocabulary = [token for token, count in counts.items()]
    else:
        vocabulary = [token for token, count in counts.most_common(max_size)]

    word_to_index = {token: idx for idx, token in enumerate(vocabulary, start=2)}  # 0 and 1 are reserved
    word_to_index['<pad>'] = 0  # Padding token
    word_to_index['<unk>'] = 1  # Unknown token

    #*** ENSURE SPECIAL TOKENS ARE ALWAYS IN THE VOCABULARY ***#
    if '<start>' not in word_to_index:
        word_to_index['<start>'] = len(word_to_index)
    if '<end>' not in word_to_index:
        word_to_index['<end>'] = len(word_to_index)

    index_to_word = {idx: token for token, idx in word_to_index.items()}

    return word_to_index, index_to_word

all_text_tokens = [token for sublist in train_df['text_tokens'] for token in sublist]
all_summary_tokens = [token for sublist in train_df['summary_tokens'] for token in sublist]

text_word_to_index, text_index_to_word = build_vocabulary(train_df['text_tokens'])
summary_word_to_index, summary_index_to_word = build_vocabulary(train_df['summary_tokens'])

TEXT_VOCAB_SIZE = len(text_word_to_index)
SUMMARY_VOCAB_SIZE = len(summary_word_to_index)

print(f"Text Vocabulary Size: {TEXT_VOCAB_SIZE}")
print(f"Summary Vocabulary Size: {SUMMARY_VOCAB_SIZE}")

Text Vocabulary Size: 34985
Summary Vocabulary Size: 10380


In [39]:
# Convert tokens to indices
def tokens_to_indices(tokens, word_to_index):
    return [word_to_index.get(token, word_to_index['<unk>']) for token in tokens]

train_df['text_indices'] = train_df['text_tokens'].apply(lambda x: tokens_to_indices(x, text_word_to_index))
train_df['summary_indices'] = train_df['summary_tokens'].apply(lambda x: tokens_to_indices(x, summary_word_to_index))
val_df['text_indices'] = val_df['text_tokens'].apply(lambda x: tokens_to_indices(x, text_word_to_index))
val_df['summary_indices'] = val_df['summary_tokens'].apply(lambda x: tokens_to_indices(x, summary_word_to_index))

# Dataset class
class SummaryDataset(Dataset):
    def __init__(self, text_indices, summary_indices):
        self.text_indices = text_indices
        self.summary_indices = summary_indices

    def __len__(self):
        return len(self.text_indices)

    def __getitem__(self, idx):
        return torch.tensor(self.text_indices[idx]), torch.tensor(self.summary_indices[idx])

# Collate function for padding
def collate_fn(batch):
    text_inputs, summary_inputs = zip(*batch)
    text_lengths = [len(text) for text in text_inputs]
    summary_lengths = [len(summary) for summary in summary_inputs]

    # Pad sequences
    text_inputs = [torch.cat([text, torch.zeros(max(text_lengths) - len(text))]) for text in text_inputs]
    summary_inputs = [torch.cat([summary, torch.zeros(max(summary_lengths) - len(summary))]) for summary in summary_inputs]

    text_inputs = torch.stack(text_inputs).long()  # Convert to Long tensors
    summary_inputs = torch.stack(summary_inputs).long()

    return text_inputs, summary_inputs, torch.tensor(text_lengths), torch.tensor(summary_lengths)

# Create Datasets
train_dataset = SummaryDataset(train_df['text_indices'].tolist(), train_df['summary_indices'].tolist())
val_dataset = SummaryDataset(val_df['text_indices'].tolist(), val_df['summary_indices'].tolist())

# Create DataLoaders
BATCH_SIZE = 2
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# Example usage
for text_batch, summary_batch, text_lengths, summary_lengths in train_dataloader:
    print("Text Batch Shape:", text_batch.shape)
    print("Summary Batch Shape:", summary_batch.shape)
    print("Text Lengths:", text_lengths)
    print("Summary Lengths:", summary_lengths)
    break

Text Batch Shape: torch.Size([2, 284])
Summary Batch Shape: torch.Size([2, 66])
Text Lengths: tensor([284, 252])
Summary Lengths: tensor([66, 55])


In [40]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout_prob=0.5):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(dropout_prob)  # Add dropout
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=False)
        self.hidden_dim = hidden_dim

    def forward(self, input_seq, input_lengths):
        embedded = self.embedding(input_seq)  # (batch_size, seq_len, embedding_dim)
        embedded = self.dropout(embedded)     # Apply dropout
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths.cpu(), batch_first=True, enforce_sorted=False)
        outputs, (hidden, cell) = self.lstm(packed)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
        return outputs, hidden, cell


class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)  # Corrected input dimension
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs, mask):
        # hidden: (1, batch_size, hidden_dim) - hidden state from the decoder
        # encoder_outputs: (batch_size, seq_len, hidden_dim) - outputs from the encoder
        # mask: (batch_size, seq_len) - mask indicating valid positions in the input sequence

        batch_size = encoder_outputs.size(0)
        seq_len = encoder_outputs.size(1)

        # Repeat decoder hidden state to match sequence length
        hidden = hidden.repeat(seq_len, 1, 1).transpose(0, 1)  # (batch_size, seq_len, hidden_dim)

        # Calculate attention scores
        attn_combined = torch.cat((hidden, encoder_outputs), dim=2) # (batch_size, seq_len, hidden_dim * 2)
        attn_weights = torch.tanh(self.attn(attn_combined)) # (batch_size, seq_len, hidden_dim)
        attn_weights = self.v(attn_weights).squeeze(2) # (batch_size, seq_len)

        # Apply mask to zero out attention weights for padding positions
        attn_weights = attn_weights.masked_fill(mask == 0, -1e9)

        # Normalize attention weights
        attn_weights = torch.softmax(attn_weights, dim=1) # (batch_size, seq_len)

        # Calculate context vector
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1) # (batch_size, hidden_dim)

        return context, attn_weights


class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, attention, dropout_prob=0.5):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(dropout_prob)  # Add dropout
        self.lstm = nn.LSTM(embedding_dim + hidden_dim, hidden_dim, batch_first=True, bidirectional=False)
        self.out = nn.Linear(hidden_dim, vocab_size)
        self.attention = attention

    def forward(self, input_seq, hidden, cell, encoder_outputs, mask):
        embedded = self.embedding(input_seq.unsqueeze(1))  # (batch_size, 1, embedding_dim)
        embedded = self.dropout(embedded)                  # Apply dropout
        context, attn_weights = self.attention(hidden, encoder_outputs, mask)
        embedded_context = torch.cat((embedded.squeeze(1), context), dim=1).unsqueeze(1)
        output, (hidden, cell) = self.lstm(embedded_context, (hidden, cell))
        prediction = self.out(output.squeeze(1))
        return prediction, hidden, cell, attn_weights


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def create_mask(self, text_inputs, text_lengths):
        # Create a mask to ignore padding positions
        mask = torch.arange(text_inputs.size(1)).unsqueeze(0).to(self.device) < text_lengths.unsqueeze(1).to(self.device)
        return mask.float()

    def forward(self, text_inputs, summary_inputs, text_lengths, summary_lengths, teacher_forcing_ratio=0.5):
        # text_inputs: (batch_size, seq_len) - input sequence
        # summary_inputs: (batch_size, seq_len) - target sequence
        # text_lengths: (batch_size) - lengths of input sequences
        # summary_lengths: (batch_size) - lengths of target sequences

        batch_size = text_inputs.size(0)
        summary_len = summary_inputs.size(1)
        summary_vocab_size = self.decoder.embedding.num_embeddings

        # Initialize output tensor to store predictions
        outputs = torch.zeros(batch_size, summary_len, summary_vocab_size).to(self.device)

        # Encoder forward pass
        encoder_outputs, hidden, cell = self.encoder(text_inputs, text_lengths)

        # Create mask for attention mechanism
        mask = self.create_mask(text_inputs, text_lengths)

        # First input to the decoder is the <start> token
        input_seq = summary_inputs[:, 0] # (batch_size)

        # Decoder forward pass
        for t in range(1, summary_len):
            output, hidden, cell, attn_weights = self.decoder(input_seq, hidden, cell, encoder_outputs, mask) # output: (batch_size, vocab_size)
            outputs[:, t] = output

            # Teacher forcing: use the ground truth as the next input
            # Without teacher forcing: use the predicted word as the next input
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1) # (batch_size)
            input_seq = summary_inputs[:, t] if teacher_force else top1

        return outputs

In [41]:
# Training parameters
EMBEDDING_DIM = 128
HIDDEN_DIM = 512
LEARNING_RATE = 0.001
NUM_EPOCHS = 10

# Model instantiation
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
attention = Attention(HIDDEN_DIM).to(device)
encoder = Encoder(TEXT_VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM).to(device)
decoder = Decoder(SUMMARY_VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, attention).to(device)
model = Seq2Seq(encoder, decoder, device).to(device)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# Loss function
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding token in loss calculation

def train_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    epoch_loss = 0
    for i, (text_inputs, summary_inputs, text_lengths, summary_lengths) in enumerate(dataloader):
        print(f"Batch {i+1}/{len(dataloader)}") # Track batch progress
        text_inputs = text_inputs.to(device)
        summary_inputs = summary_inputs.to(device)
        text_lengths = text_lengths.to(device)
        summary_lengths = summary_lengths.to(device)

        optimizer.zero_grad()
        outputs = model(text_inputs, summary_inputs, text_lengths, summary_lengths)

        # Reshape outputs and targets for loss calculation
        outputs = outputs[:, 1:].reshape(-1, decoder.embedding.num_embeddings)
        summary_inputs = summary_inputs[:, 1:].reshape(-1)

        loss = criterion(outputs, summary_inputs)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(dataloader)


def evaluate_epoch(model, dataloader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for text_inputs, summary_inputs, text_lengths, summary_lengths in dataloader:
            text_inputs = text_inputs.to(device)
            summary_inputs = summary_inputs.to(device)
            text_lengths = text_lengths.to(device)
            summary_lengths = summary_lengths.to(device)

            outputs = model(text_inputs, summary_inputs, text_lengths, summary_lengths, teacher_forcing_ratio=0)  # No teacher forcing during evaluation

            # Reshape outputs and targets for loss calculation
            outputs = outputs[:, 1:].reshape(-1, decoder.embedding.num_embeddings) # Remove <start> token prediction
            summary_inputs = summary_inputs[:, 1:].reshape(-1) # Remove <start> token

            loss = criterion(outputs, summary_inputs)
            epoch_loss += loss.item()

    return epoch_loss / len(dataloader)

# Training loop
for epoch in range(NUM_EPOCHS):
    train_loss = train_epoch(model, train_dataloader, optimizer, criterion, device)
    val_loss = evaluate_epoch(model, val_dataloader, criterion, device)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

print("Training complete!")

Batch 1/588
Batch 2/588
Batch 3/588
Batch 4/588
Batch 5/588
Batch 6/588
Batch 7/588
Batch 8/588
Batch 9/588
Batch 10/588
Batch 11/588
Batch 12/588
Batch 13/588
Batch 14/588
Batch 15/588
Batch 16/588
Batch 17/588
Batch 18/588
Batch 19/588
Batch 20/588
Batch 21/588
Batch 22/588
Batch 23/588
Batch 24/588
Batch 25/588
Batch 26/588
Batch 27/588
Batch 28/588
Batch 29/588
Batch 30/588
Batch 31/588
Batch 32/588
Batch 33/588
Batch 34/588
Batch 35/588
Batch 36/588
Batch 37/588
Batch 38/588
Batch 39/588
Batch 40/588
Batch 41/588
Batch 42/588
Batch 43/588
Batch 44/588
Batch 45/588
Batch 46/588
Batch 47/588
Batch 48/588
Batch 49/588
Batch 50/588
Batch 51/588
Batch 52/588
Batch 53/588
Batch 54/588
Batch 55/588
Batch 56/588
Batch 57/588
Batch 58/588
Batch 59/588
Batch 60/588
Batch 61/588
Batch 62/588
Batch 63/588
Batch 64/588
Batch 65/588
Batch 66/588
Batch 67/588
Batch 68/588
Batch 69/588
Batch 70/588
Batch 71/588
Batch 72/588
Batch 73/588
Batch 74/588
Batch 75/588
Batch 76/588
Batch 77/588
Batch 78

In [42]:
def summarize(model, text, text_word_to_index, summary_index_to_word, device, max_summary_length=50):
    model.eval()
    with torch.no_grad():
        # Tokenize and convert to indices
        tokens = tokenize(text)
        indices = tokens_to_indices(tokens, text_word_to_index)
        text_input = torch.tensor(indices).unsqueeze(0).to(device)  # Add batch dimension

        text_length = torch.tensor([len(indices)]).to(device) # Create length tensor

        # Encoder forward pass
        encoder_outputs, hidden, cell = model.encoder(text_input, text_length)

        # Create mask
        mask = model.create_mask(text_input, text_length)

        # Initialize decoder input with <start> token
        input_seq = torch.tensor([summary_word_to_index['<start>']]).to(device)

        summary_words = []
        for _ in range(max_summary_length):
            output, hidden, cell, _ = model.decoder(input_seq, hidden, cell, encoder_outputs, mask)
            predicted_token_index = output.argmax(1).item()

            predicted_word = summary_index_to_word[predicted_token_index]
            summary_words.append(predicted_word)

            if predicted_word == '<end>':
                break

            input_seq = torch.tensor([predicted_token_index]).to(device)

        return ' '.join(summary_words)

In [43]:
# Example usage
text = val_df['text'][0]
summary = summarize(model, text, text_word_to_index, summary_index_to_word, device)
print("Original Text:", text)
print("Generated Summary:", summary)

Original Text: gains vhp new delhi feb  pti the vishwa hindu parishad vhp today said nda constituent shiv sena was never a hindutvavadi outfit and was pretending to be one only for political gains speaking to reporters here vhp joint general secretary surendra jain said that merely chanting king shivajis name does not make shiv sena a hindutvavadi one championing the hindutva cause party i dont think shiv sena was ever a hindutvavadi organisation it is the media which gave this tag to shiv sena after babri mosque was demolished jain said it is pretending to be a hindutuvavadi organisation for political gains he added his remark comes against the backdrop of stinging verbal exchanges between longstanding allies shiv sena and bjp leaders ahead of maharashtra civic polls voting for which was held today seeking to buttress his claim the vhp leader stated late shiv sena founder bal thackeray himself was clueless about his party workers bringing portions of the mosque down in uttar pradeshs 