### DEEP LEARNING

# **TEXT SUMMARIZATION MODEL**

**IMPORTS**

In [7]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import ast
import re
import torch.optim as optim
from tqdm import tqdm
import copy
from rouge_score import rouge_scorer
from tqdm.notebook import tqdm
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from transformers import TrainingArguments, Trainer
from torch.utils.data import Dataset, DataLoader
from transformers import get_linear_schedule_with_warmup
from evaluate import load  # Changed from datasets import load_metric
from torch.optim import AdamW

torch.manual_seed(42)
np.random.seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: cpu


**DATA LOADING AND PREPROCESSING**

In [8]:
train_df = pd.read_csv('../data/train.csv', header=None)
test_df = pd.read_csv('../data/test.csv', header=None)
val_df = pd.read_csv('../data/validation.csv', header=None)

columns = ['id', 'article', 'summary']
train_df.columns = columns
test_df.columns = columns
val_df.columns = columns

def preprocess_text(text):
    text = text.replace('\n', ' ')
    text = ' '.join(text.split())
    return text

def clean_article_heading(article):
    pattern = r'By\s*\.\s*.*?\s*\.\s*PUBLISHED:\s*\.\s*\d+:\d+\s*EST,\s*\d+\s*[A-Za-z]+\s*\d+\s*\.\s*\|\s*\.\s*UPDATED:\s*\.\s*\d+:\d+\s*EST,\s*\d+\s*[A-Za-z]+\s*\d+\s*\.'
    cleaned_text = re.sub(pattern, '', article)
    return cleaned_text.strip()

train_df['article'] = train_df['article'].apply(preprocess_text)
train_df['article'] = train_df['article'].apply(clean_article_heading)
train_df['summary'] = train_df['summary'].apply(preprocess_text)
test_df['article'] = test_df['article'].apply(preprocess_text)
test_df['article'] = test_df['article'].apply(clean_article_heading)
test_df['summary'] = test_df['summary'].apply(preprocess_text)
val_df['article'] = val_df['article'].apply(preprocess_text)
val_df['article'] = val_df['article'].apply(clean_article_heading)
val_df['summary'] = val_df['summary'].apply(preprocess_text)



print(f"Training dataframe shape: {train_df.shape}")
print(f"Test dataframe shape: {test_df.shape}")
print(f"Validation dataframe shape: {val_df.shape}")

Training dataframe shape: (287114, 3)
Test dataframe shape: (11491, 3)
Validation dataframe shape: (13369, 3)


In [9]:
pegasusTokenizer = PegasusTokenizer.from_pretrained("google/pegasus-cnn_dailymail")

tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

**Amount reduction for training time optimization**

In [10]:
train_df = train_df.sample(n=20000, random_state=42)
test_df = test_df.sample(n=2000, random_state=42)
val_df = val_df.sample(n=2000, random_state=42)

**CUSTOM DATASET**

In [11]:
class NewsDataset(Dataset):
    def __init__(self, articles, summaries, tokenizer, max_length=512):
        self.articles = articles
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.articles)
        
    def __getitem__(self, idx):
        article = str(self.articles[idx])
        summary = str(self.summaries[idx])
        
        article_encoding = self.tokenizer(
            article,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        summary_encoding = self.tokenizer(
            summary,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'article_input_ids': article_encoding['input_ids'].flatten(),
            'article_attention_mask': article_encoding['attention_mask'].flatten(),
            'summary_input_ids': summary_encoding['input_ids'].flatten(),
            'summary_attention_mask': summary_encoding['attention_mask'].flatten()
        }

train_dataset = NewsDataset(
    train_df['article'].values,
    train_df['summary'].values,
    pegasusTokenizer
)

val_dataset = NewsDataset(
    val_df['article'].values,
    val_df['summary'].values,
    pegasusTokenizer
)

test_dataset = NewsDataset(
    test_df['article'].values,
    test_df['summary'].values,
    pegasusTokenizer
)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Testing samples: {len(test_dataset)}")

Training samples: 20000
Validation samples: 2000
Testing samples: 2000


In [12]:
class PegasusForSummarization:
    def __init__(self, model_name="google/pegasus-cnn_dailymail", device='cuda'):
        self.tokenizer = PegasusTokenizer.from_pretrained(model_name)
        self.model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
        self.device = device
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        
    def train(self, train_loader, val_loader, epochs=3, learning_rate=5e-5, warmup_steps=500, weight_decay=0.01):
        # Set up optimizer
        optimizer = AdamW(self.model.parameters(), lr=learning_rate, weight_decay=weight_decay)
        
        # Set up scheduler
        total_steps = len(train_loader) * epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer, 
            num_warmup_steps=warmup_steps, 
            num_training_steps=total_steps
        )
        
        best_val_loss = float('inf')
        best_model = None
        
        for epoch in range(epochs):
            # Training
            self.model.train()
            train_loss = 0
            train_progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs} [Training]')
            
            for batch in train_progress_bar:
                # Move batch to device
                input_ids = batch['article_input_ids'].to(self.device)
                attention_mask = batch['article_attention_mask'].to(self.device)
                labels = batch['summary_input_ids'].to(self.device)
                decoder_attention_mask = batch['summary_attention_mask'].to(self.device)
                
                # Clear gradients
                optimizer.zero_grad()
                
                # Forward pass
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels,
                    decoder_attention_mask=decoder_attention_mask
                )
                
                loss = outputs.loss
                train_loss += loss.item()
                
                # Backward pass
                loss.backward()
                
                # Clip gradients
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                
                # Update parameters
                optimizer.step()
                scheduler.step()
                
                # Update progress bar
                train_progress_bar.set_postfix({'loss': loss.item()})
            
            avg_train_loss = train_loss / len(train_loader)
            
            # Validation
            val_loss, rouge_scores = self.evaluate(val_loader)
            
            print(f"Epoch {epoch+1}/{epochs}")
            print(f"  Train Loss: {avg_train_loss:.4f}")
            print(f"  Val Loss: {val_loss:.4f}")
            print(f"  Rouge1: {rouge_scores['rouge1']:.4f}")
            print(f"  Rouge2: {rouge_scores['rouge2']:.4f}")
            print(f"  RougeL: {rouge_scores['rougeL']:.4f}")
            
            # Save best model
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model = copy.deepcopy(self.model.state_dict())
                print(f"  New best model saved with validation loss: {val_loss:.4f}")
        
        # Load best model
        if best_model is not None:
            self.model.load_state_dict(best_model)
            print(f"Loaded best model with validation loss: {best_val_loss:.4f}")
    
    def evaluate(self, data_loader, max_length=128, num_beams=4):
        self.model.eval()
        val_loss = 0
        all_preds = []
        all_targets = []
        
        with torch.no_grad():
            for batch in tqdm(data_loader, desc="Evaluating"):
                # Move batch to device
                input_ids = batch['article_input_ids'].to(self.device)
                attention_mask = batch['article_attention_mask'].to(self.device)
                labels = batch['summary_input_ids'].to(self.device)
                decoder_attention_mask = batch['summary_attention_mask'].to(self.device)
                
                # Forward pass
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels,
                    decoder_attention_mask=decoder_attention_mask
                )
                
                val_loss += outputs.loss.item()
                
                # Generate summaries
                generated_ids = self.model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_length=max_length,
                    num_beams=num_beams,
                    repetition_penalty=2.5,
                    length_penalty=1.0,
                    early_stopping=True
                )
                
                # Decode generated summaries and reference summaries
                preds = [self.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
                targets = [self.tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in labels]
                
                all_preds.extend(preds)
                all_targets.extend(targets)
        
        # Calculate average validation loss
        avg_val_loss = val_loss / len(data_loader)
        
        # Calculate ROUGE scores
        rouge_scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
        for pred, target in zip(all_preds, all_targets):
            scores = self.rouge_scorer.score(target, pred)
            rouge_scores['rouge1'] += scores['rouge1'].fmeasure
            rouge_scores['rouge2'] += scores['rouge2'].fmeasure
            rouge_scores['rougeL'] += scores['rougeL'].fmeasure
        
        # Calculate average ROUGE scores
        for key in rouge_scores:
            rouge_scores[key] /= len(all_preds)
        
        return avg_val_loss, rouge_scores
    
    def predict(self, article, max_length=128, num_beams=4):
        self.model.eval()
        
        # Preprocess article
        article = preprocess_text(article)
        article = clean_article_heading(article)
        
        # Tokenize article
        inputs = self.tokenizer(
            article,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        ).to(self.device)
        
        # Generate summary
        with torch.no_grad():
            generated_ids = self.model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=max_length,
                num_beams=num_beams,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True
            )
        
        # Decode summary
        summary = self.tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
        
        return summary
    
    def save_model(self, path):
        torch.save({
            'model_state_dict': self.model.state_dict(),
        }, path)
        self.tokenizer.save_pretrained(path + "_tokenizer")
        print(f"Model saved to {path}")
    
    def load_model(self, path):
        checkpoint = torch.load(path)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.tokenizer = PegasusTokenizer.from_pretrained(path + "_tokenizer")
        print(f"Model loaded from {path}")

In [13]:
# Cell 6: Initialize the model
pegasus_model = PegasusForSummarization(device=device)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
def train_pegasus_model(model, epochs=3, learning_rate=3e-5):
    # Training parameters
    WARMUP_STEPS = 500
    WEIGHT_DECAY = 0.01
    
    print("Starting training...")
    model.train(
        train_loader=train_loader, 
        val_loader=val_loader, 
        epochs=epochs, 
        learning_rate=learning_rate,
        warmup_steps=WARMUP_STEPS,
        weight_decay=WEIGHT_DECAY
    )
    
    return model

In [15]:
# Cell 8: Evaluation function
def evaluate_pegasus_model(model):
    # Evaluate on test set
    print("\nEvaluating on test set...")
    test_loss, test_rouge_scores = model.evaluate(test_loader)
    print(f"Test Loss: {test_loss:.4f}")
    print(f"Test Rouge1: {test_rouge_scores['rouge1']:.4f}")
    print(f"Test Rouge2: {test_rouge_scores['rouge2']:.4f}")
    print(f"Test RougeL: {test_rouge_scores['rougeL']:.4f}")
    
    return test_loss, test_rouge_scores

In [16]:
def predict_sample_summaries(model, samples=5):
    print("\nGenerating sample summaries:")
    for i in range(samples):
        idx = np.random.randint(0, len(test_df))
        article = test_df.iloc[idx]['article']
        original_summary = test_df.iloc[idx]['summary']
        
        # Generate summary
        generated_summary = model.predict(article)
        
        print(f"\nSample {i+1}:")
        print(f"Original Summary: {original_summary}")
        print(f"Generated Summary: {generated_summary}")
        
        # Calculate ROUGE scores
        scores = model.rouge_scorer.score(original_summary, generated_summary)
        print(f"Rouge1: {scores['rouge1'].fmeasure:.4f}")
        print(f"Rouge2: {scores['rouge2'].fmeasure:.4f}")
        print(f"RougeL: {scores['rougeL'].fmeasure:.4f}")

In [18]:
# Cell 10: Example of how to run the training and evaluation
# Uncomment and run the following lines when ready to train
trained_model = train_pegasus_model(pegasus_model, epochs=3, learning_rate=3e-5)
test_loss, test_rouge_scores = evaluate_pegasus_model(trained_model)
trained_model.save_model("pegasus_summarization_model")

# Cell 11: Example of making predictions with the trained model
# Uncomment and run the following line when ready to make predictions
# predict_sample_summaries(trained_model, samples=3)

Starting training...


Epoch 1/3 [Training]:   0%|          | 0/1250 [00:00<?, ?it/s]

: 