### **BASIC MODEL USING GRUs**

**Custom dataset definition**

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import pandas as pd
import json
import re
import copy
import string
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from rouge_score import rouge_scorer
from transformers import BertTokenizer
from gensim.models import KeyedVectors
from tqdm import tqdm
import pickle
import os

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

def load_word2vec_embeddings(path='../data/GoogleNews-vectors-negative300.bin'):
    print("Loading Word2Vec embeddings...")
    word2vec = KeyedVectors.load_word2vec_format(path, binary=True)
    embedding_dim = word2vec.vector_size
    return word2vec, embedding_dim

def preprocess_text(text):
    text = text.replace('\n', ' ')
    text = ' '.join(text.split())
    return text

def clean_article_heading(article):
    pattern = r'By\s*\.\s*.*?\s*\.\s*PUBLISHED:\s*\.\s*\d+:\d+\s*EST,\s*\d+\s*[A-Za-z]+\s*\d+\s*\.\s*\|\s*\.\s*UPDATED:\s*\.\s*\d+:\d+\s*EST,\s*\d+\s*[A-Za-z]+\s*\d+\s*\.'
    cleaned_text = re.sub(pattern, '', article)
    return cleaned_text.strip()

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


In [2]:
train_df = pd.read_csv('../data/train.csv', header=None)
test_df = pd.read_csv('../data/test.csv', header=None)
val_df = pd.read_csv('../data/validation.csv', header=None)

columns = ['id', 'article', 'summary']
train_df.columns = columns
test_df.columns = columns
val_df.columns = columns

train_df['article'] = train_df['article'].apply(preprocess_text)
train_df['article'] = train_df['article'].apply(clean_article_heading)
train_df['summary'] = train_df['summary'].apply(preprocess_text)

test_df['article'] = test_df['article'].apply(preprocess_text)
test_df['article'] = test_df['article'].apply(clean_article_heading)
test_df['summary'] = test_df['summary'].apply(preprocess_text)

val_df['article'] = val_df['article'].apply(preprocess_text)
val_df['article'] = val_df['article'].apply(clean_article_heading)
val_df['summary'] = val_df['summary'].apply(preprocess_text)

print(f"Training dataframe shape: {train_df.shape}")
print(f"Test dataframe shape: {test_df.shape}")
print(f"Validation dataframe shape: {val_df.shape}")

train_df = train_df.sample(n=20000, random_state=42)
test_df = test_df.sample(n=2000, random_state=42)
val_df = val_df.sample(n=2000, random_state=42)

Training dataframe shape: (287114, 3)
Test dataframe shape: (11491, 3)
Validation dataframe shape: (13369, 3)


**CUSTOM DATASET**

In [3]:
class SummarizationDataset(Dataset):
    """Dataset class for summarization task with BERT tokenizer"""
    
    def __init__(self, articles, summaries, tokenizer_wrapper, max_article_len=512, max_summary_len=128):
        self.articles = articles
        self.summaries = summaries
        self.tokenizer_wrapper = tokenizer_wrapper
        self.max_article_len = max_article_len
        self.max_summary_len = max_summary_len
    
    def __len__(self):
        return len(self.articles)
    
    def __getitem__(self, idx):
        article = self.articles[idx]
        summary = self.summaries[idx]
        
        # Convert to sequences
        article_seq = self.tokenizer_wrapper.text_to_sequence(article, self.max_article_len)
        summary_seq = self.tokenizer_wrapper.text_to_sequence(summary, self.max_summary_len)
        
        # Add SOS and EOS tokens to summary
        sos_idx = self.tokenizer_wrapper.vocab_to_int['[SOS]']
        eos_idx = self.tokenizer_wrapper.vocab_to_int['[EOS]']
        summary_with_tokens = [sos_idx] + summary_seq + [eos_idx]
        
        return {
            'article_input_ids': torch.tensor(article_seq, dtype=torch.long),
            'article_attention_mask': torch.ones(len(article_seq), dtype=torch.long),
            'summary_input_ids': torch.tensor(summary_with_tokens, dtype=torch.long)
        }

def collate_fn(batch):
    """Custom collate function for DataLoader"""
    # Extract sequences
    article_input_ids = [item['article_input_ids'] for item in batch]
    article_attention_masks = [item['article_attention_mask'] for item in batch]
    summary_input_ids = [item['summary_input_ids'] for item in batch]
    
    # Pad sequences
    article_input_ids_padded = pad_sequence(article_input_ids, batch_first=True, padding_value=0)
    article_attention_masks_padded = pad_sequence(article_attention_masks, batch_first=True, padding_value=0)
    summary_input_ids_padded = pad_sequence(summary_input_ids, batch_first=True, padding_value=0)
    
    return {
        'article_input_ids': article_input_ids_padded,
        'article_attention_mask': article_attention_masks_padded,
        'summary_input_ids': summary_input_ids_padded
    }

**SENTENCE TOKENIZATION AND WORD EMBEDDINGS**

In [4]:
class BertTokenizerWrapper:
    """Wrapper to handle BERT tokenizer with custom vocabulary for Word2Vec"""
    
    def __init__(self, model_name='bert-base-uncased', max_vocab_size=10000):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.max_vocab_size = max_vocab_size
        self.vocab_to_int = {}
        self.int_to_vocab = {}
        self.word_to_subwords = {}
        
    def build_vocabulary_from_word2vec(self, word2vec_model, texts):
        """Build vocabulary from Word2Vec model and training texts"""
        # Start with special tokens
        special_tokens = ['[PAD]', '[UNK]', '[SOS]', '[EOS]']
        self.vocab_to_int = {token: idx for idx, token in enumerate(special_tokens)}
        self.int_to_vocab = {idx: token for idx, token in enumerate(special_tokens)}
        
        # Extract words from texts and filter by Word2Vec vocabulary
        word_freq = Counter()
        for text in texts:
            # Use BERT tokenizer to get subwords, then extract unique words
            tokens = self.tokenizer.tokenize(text.lower())
            # Convert subwords back to words for Word2Vec lookup
            words = []
            current_word = ""
            for token in tokens:
                if token.startswith('##'):
                    current_word += token[2:]
                else:
                    if current_word:
                        words.append(current_word)
                    current_word = token
            if current_word:
                words.append(current_word)
            
            # Only count words that exist in Word2Vec
            valid_words = [word for word in words if word in word2vec_model.key_to_index]
            word_freq.update(valid_words)
        
        # Sort by frequency and add to vocabulary
        sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
        vocab_words = [word for word, freq in sorted_words[:self.max_vocab_size - len(special_tokens)]]
        
        for word in vocab_words:
            idx = len(self.vocab_to_int)
            self.vocab_to_int[word] = idx
            self.int_to_vocab[idx] = word
            # Store subword mapping
            self.word_to_subwords[word] = self.tokenizer.tokenize(word)
        
        print(f"Built vocabulary with {len(self.vocab_to_int)} tokens")
        print(f"Words from Word2Vec: {len(vocab_words)}")
    
    def text_to_sequence(self, text, max_length=512):
        """Convert text to sequence of word indices"""
        # Tokenize with BERT
        subword_tokens = self.tokenizer.tokenize(text.lower())
        
        # Convert subwords back to words
        words = []
        current_word = ""
        for token in subword_tokens:
            if token.startswith('##'):
                current_word += token[2:]
            else:
                if current_word:
                    words.append(current_word)
                current_word = token
        if current_word:
            words.append(current_word)
        
        # Convert to indices
        sequence = []
        for word in words[:max_length]:
            if word in self.vocab_to_int:
                sequence.append(self.vocab_to_int[word])
            else:
                sequence.append(self.vocab_to_int['[UNK]'])
        
        return sequence
    
    def sequence_to_text(self, sequence):
        """Convert sequence of indices back to text"""
        words = []
        for idx in sequence:
            if isinstance(idx, torch.Tensor):
                idx = idx.item()
            if idx in [0, 2, 3]:  # Skip PAD, SOS, EOS
                if idx == 3:  # Stop at EOS
                    break
                continue
            if idx in self.int_to_vocab:
                words.append(self.int_to_vocab[idx])
        return ' '.join(words)

class Word2VecEmbeddings:
    """Word2Vec embeddings handler for custom vocabulary"""
    
    def __init__(self, word2vec_model, embedding_dim):
        self.word2vec_model = word2vec_model
        self.embedding_dim = embedding_dim
    
    def create_embedding_matrix(self, tokenizer_wrapper):
        """Create embedding matrix for the custom vocabulary"""
        vocab_size = len(tokenizer_wrapper.vocab_to_int)
        embedding_matrix = np.random.normal(0, 0.1, (vocab_size, self.embedding_dim))
        
        found_words = 0
        for word, idx in tokenizer_wrapper.vocab_to_int.items():
            if word in ['[PAD]', '[UNK]', '[SOS]', '[EOS]']:
                # Initialize special tokens with small random values
                embedding_matrix[idx] = np.random.normal(0, 0.01, self.embedding_dim)
            elif word in self.word2vec_model.key_to_index:
                embedding_matrix[idx] = self.word2vec_model[word]
                found_words += 1
        
        print(f"Found Word2Vec embeddings for {found_words}/{vocab_size-4} words")
        return torch.FloatTensor(embedding_matrix)

**MODEL DEFINITION**

In [5]:
class Encoder(nn.Module):
    """Encoder with GRU and Word2Vec embeddings"""
    
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=2, dropout=0.3, embedding_matrix=None):
        super(Encoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        # Initialize embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        if embedding_matrix is not None:
            self.embedding.weight.data.copy_(embedding_matrix)
            
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers, 
                         batch_first=True, dropout=dropout, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, attention_mask=None):
        # x shape: (batch_size, seq_len)
        embedded = self.dropout(self.embedding(x))
        # embedded shape: (batch_size, seq_len, embed_dim)
        
        outputs, hidden = self.gru(embedded)
        # outputs shape: (batch_size, seq_len, hidden_dim * 2)
        # hidden shape: (num_layers * 2, batch_size, hidden_dim)
        
        return outputs, hidden

class Attention(nn.Module):
    """Attention mechanism"""
    
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.hidden_dim = hidden_dim
        self.attn = nn.Linear(hidden_dim * 3, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)
        
    def forward(self, decoder_hidden, encoder_outputs, encoder_mask=None):
        # decoder_hidden shape: (batch_size, hidden_dim)
        # encoder_outputs shape: (batch_size, seq_len, hidden_dim * 2)
        
        batch_size = encoder_outputs.size(0)
        seq_len = encoder_outputs.size(1)
        
        # Repeat decoder hidden state
        decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1, seq_len, 1)
        
        # Concatenate and compute attention scores
        energy = torch.cat([decoder_hidden, encoder_outputs], dim=2)
        energy = torch.tanh(self.attn(energy))
        attention_scores = self.v(energy).squeeze(2)
        
        # Apply mask if provided
        if encoder_mask is not None:
            attention_scores = attention_scores.masked_fill(encoder_mask == 0, -1e9)
        
        # Apply softmax
        attention_weights = torch.softmax(attention_scores, dim=1)
        
        # Compute context vector
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs)
        context = context.squeeze(1)
        
        return context, attention_weights

class Decoder(nn.Module):
    """Decoder with attention and Word2Vec embeddings"""
    
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=1, dropout=0.3, embedding_matrix=None):
        super(Decoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        
        # Initialize embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        if embedding_matrix is not None:
            self.embedding.weight.data.copy_(embedding_matrix)
            
        self.attention = Attention(hidden_dim)
        self.gru = nn.GRU(embed_dim + hidden_dim * 2, hidden_dim, num_layers, 
                         batch_first=True, dropout=dropout)
        self.output_projection = nn.Linear(hidden_dim * 3, vocab_size)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input_token, decoder_hidden, encoder_outputs, encoder_mask=None):
        embedded = self.dropout(self.embedding(input_token))
        
        # Get attention context
        context, attention_weights = self.attention(decoder_hidden[-1], encoder_outputs, encoder_mask)
        
        # Concatenate embedding and context
        context = context.unsqueeze(1)
        gru_input = torch.cat([embedded, context], dim=2)
        
        output, decoder_hidden = self.gru(gru_input, decoder_hidden)
        
        # Final prediction
        prediction_input = torch.cat([output.squeeze(1), context.squeeze(1)], dim=1)
        prediction = self.output_projection(prediction_input)
        
        return prediction, decoder_hidden, attention_weights

class SummarizationModel(nn.Module):
    """Complete summarization model"""
    
    def __init__(self, vocab_size, embed_dim=300, hidden_dim=512, num_layers=2, dropout=0.3, embedding_matrix=None):
        super(SummarizationModel, self).__init__()
        self.encoder = Encoder(vocab_size, embed_dim, hidden_dim, num_layers, dropout, embedding_matrix)
        self.decoder = Decoder(vocab_size, embed_dim, hidden_dim, 1, dropout, embedding_matrix)
        self.hidden_dim = hidden_dim
        
    def forward(self, article_input_ids, article_attention_mask, summary_input_ids=None, teacher_forcing_ratio=0.5):
        batch_size = article_input_ids.size(0)
        
        # Encode article
        encoder_outputs, encoder_hidden = self.encoder(article_input_ids, article_attention_mask)
        
        # Initialize decoder hidden state
        decoder_hidden = encoder_hidden[-2:].mean(dim=0, keepdim=True)
        
        if summary_input_ids is not None:
            # Training mode
            max_length = summary_input_ids.size(1) - 1  # Exclude last token
            outputs = []
            
            for t in range(max_length):
                if t == 0 or torch.rand(1).item() < teacher_forcing_ratio:
                    # Use teacher forcing
                    input_token = summary_input_ids[:, t:t+1]
                else:
                    # Use previous prediction
                    input_token = torch.argmax(outputs[-1], dim=1, keepdim=True)
                
                output, decoder_hidden, _ = self.decoder(
                    input_token, decoder_hidden, encoder_outputs, 
                    encoder_mask=article_attention_mask
                )
                outputs.append(output)
            
            return torch.stack(outputs, dim=1)
        else:
            # Inference mode
            max_length = 100
            outputs = []
            input_token = torch.tensor([[2]], device=article_input_ids.device).repeat(batch_size, 1)  # [SOS]
            
            for t in range(max_length):
                output, decoder_hidden, _ = self.decoder(
                    input_token, decoder_hidden, encoder_outputs,
                    encoder_mask=article_attention_mask
                )
                outputs.append(output)
                input_token = torch.argmax(output, dim=1, keepdim=True)
                
                # Stop if all sequences have generated [EOS]
                if (input_token == 3).all():
                    break
            
            return torch.stack(outputs, dim=1)

**MODEL TRAINING PROCESS DEFINITION**

In [6]:
def train_epoch(model, dataloader, criterion, optimizer, device, clip=1.0):
    model.train()
    total_loss = 0
    
    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()
        
        article_input_ids = batch['article_input_ids'].to(device)
        article_attention_mask = batch['article_attention_mask'].to(device)
        summary_input_ids = batch['summary_input_ids'].to(device)
        
        outputs = model(article_input_ids, article_attention_mask, summary_input_ids)
        
        # Get target sequence excluding the last token and SOS token
        target = summary_input_ids[:, 1:-1].contiguous()
        
        # Ensure outputs match target sequence length
        outputs = outputs[:, :target.size(1), :].contiguous()
        
        # Reshape for loss calculation
        outputs = outputs.view(-1, outputs.shape[-1])
        target = target.view(-1)
        
        loss = criterion(outputs, target)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        total_loss += loss.item()
    
    return total_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device, tokenizer_wrapper):
    model.eval()
    total_loss = 0
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            article_input_ids = batch['article_input_ids'].to(device)
            article_attention_mask = batch['article_attention_mask'].to(device)
            summary_input_ids = batch['summary_input_ids'].to(device)
            
            outputs = model(article_input_ids, article_attention_mask, summary_input_ids,
                          teacher_forcing_ratio=0.0)
            
            # Get target sequence excluding the last token and SOS token
            target = summary_input_ids[:, 1:-1].contiguous()
            
            # Ensure outputs match target sequence length
            outputs = outputs[:, :target.size(1), :].contiguous()
            
            # Reshape for loss calculation
            outputs_flat = outputs.view(-1, outputs.shape[-1])
            target_flat = target.view(-1)
            
            loss = criterion(outputs_flat, target_flat)
            total_loss += loss.item()
            
            # Calculate ROUGE scores
            predictions = torch.argmax(outputs, dim=-1)
            for pred, target in zip(predictions, summary_input_ids):
                pred_text = tokenizer_wrapper.sequence_to_text(pred)
                target_text = tokenizer_wrapper.sequence_to_text(target)
                
                if pred_text.strip() and target_text.strip():
                    scores = scorer.score(target_text, pred_text)
                    for metric in rouge_scores:
                        rouge_scores[metric].append(scores[metric].fmeasure)
    
    avg_loss = total_loss / len(dataloader)
    avg_rouge_scores = {k: sum(v)/len(v) if v else 0.0 for k, v in rouge_scores.items()}
    
    return avg_loss, avg_rouge_scores

def train_model(model, train_loader, val_loader, optimizer, criterion, device, tokenizer_wrapper, 
                num_epochs=10, early_stopping_patience=3):
    model.to(device)
    best_val_loss = float('inf')
    early_stop_counter = 0
    training_stats = []
    best_model_state = None
    
    print("Starting training...")
    
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")
        
        train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
        val_loss, rouge_scores = evaluate(model, val_loader, criterion, device, tokenizer_wrapper)
        
        print(f"Train Loss: {train_loss:.4f}")
        print(f"Val Loss: {val_loss:.4f}")
        print("ROUGE Scores:", rouge_scores)
        
        training_stats.append({
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'val_loss': val_loss,
            'rouge_scores': rouge_scores
        })
        
        # Early stopping check
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            early_stop_counter = 0
            best_model_state = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), 'best_summarizer_model.pth')
            print(f"Saved new best model with validation loss: {val_loss:.4f}")
        else:
            early_stop_counter += 1
            print(f"Early stopping counter: {early_stop_counter}/{early_stopping_patience}")
            
        if early_stop_counter >= early_stopping_patience:
            print("Early stopping triggered!")
            break
    
    # Load the best model before returning
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        print("Restored best model state before returning")
    
    return model, training_stats

In [7]:
word2vec_model, embedding_dim = load_word2vec_embeddings('../data/GoogleNews-vectors-negative300.bin')

# Initialize BERT tokenizer wrapper
print("Building vocabulary...")
tokenizer_wrapper = BertTokenizerWrapper(max_vocab_size=10000)
all_texts = train_df['article'].tolist() + train_df['summary'].tolist()
tokenizer_wrapper.build_vocabulary_from_word2vec(word2vec_model, all_texts)

# Create Word2Vec embeddings for the vocabulary
print("Creating embedding matrix...")
word2vec_embeddings = Word2VecEmbeddings(word2vec_model, embedding_dim)
embedding_matrix = word2vec_embeddings.create_embedding_matrix(tokenizer_wrapper)

train_dataset = SummarizationDataset(
        train_df['article'].tolist(), 
        train_df['summary'].tolist(), 
        tokenizer_wrapper
    )
val_dataset = SummarizationDataset(
    val_df['article'].tolist(), 
    val_df['summary'].tolist(), 
    tokenizer_wrapper
    )
test_dataset = SummarizationDataset(
    test_df['article'].tolist(), 
    test_df['summary'].tolist(), 
    tokenizer_wrapper
    )

batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


Loading Word2Vec embeddings...
Building vocabulary...
Built vocabulary with 10000 tokens
Words from Word2Vec: 9996
Creating embedding matrix...
Found Word2Vec embeddings for 9996/9996 words


In [None]:
print("Initializing model...")
vocab_size = len(tokenizer_wrapper.vocab_to_int)
model = SummarizationModel(
    vocab_size=vocab_size,
    embed_dim=embedding_dim,
    hidden_dim=512,
    num_layers=2,
    dropout=0.3,
    embedding_matrix=embedding_matrix
)

# Initialize optimizer and criterion
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding tokens

# Train model
print("Starting training...")
trained_model, training_stats = train_model(
    model, train_loader, val_loader, optimizer, criterion, device, tokenizer_wrapper,
    num_epochs=15, early_stopping_patience=3
)

# Evaluate on test set
print("Evaluating on test set...")
test_loss, test_rouge_scores = evaluate(trained_model, test_loader, criterion, device, tokenizer_wrapper)
print(f"Test Loss: {test_loss:.4f}")
print("Test ROUGE Scores:", test_rouge_scores)

# Save final model and tokenizer
torch.save({
    'model_state_dict': trained_model.state_dict(),
    'vocab_size': vocab_size,
    'training_stats': training_stats,
    'test_scores': {'loss': test_loss, 'rouge': test_rouge_scores}
}, 'final_summarization_model.pth')

with open('tokenizer_wrapper.pkl', 'wb') as f:
    pickle.dump(tokenizer_wrapper, f)

print("Training completed!")

# Example inference
print("\nExample inference:")
trained_model.eval()
with torch.no_grad():
    sample_idx = 0
    sample_article = test_df.iloc[sample_idx]['article']
    actual_summary = test_df.iloc[sample_idx]['summary']
    
    # Preprocess
    article_seq = tokenizer_wrapper.text_to_sequence(sample_article)
    article_tensor = torch.tensor([article_seq], dtype=torch.long).to(device)
    attention_mask = torch.ones_like(article_tensor).to(device)
    
    # Generate summary
    outputs = trained_model(article_tensor, attention_mask)
    predicted_ids = torch.argmax(outputs, dim=-1)[0]
    
    # Decode
    generated_summary = tokenizer_wrapper.sequence_to_text(predicted_ids)
    
    print(f"Article: {sample_article[:200]}...")
    print(f"Actual Summary: {actual_summary}")
    print(f"Generated Summary: {generated_summary}")

Initializing model...




Starting training...
Starting training...

Epoch 1/15


Training:   2%|▏         | 26/1250 [2:58:52<31:08:48, 91.61s/it]  