### DEEP LEARNING

## **TEXT SUMMARIZATION MODEL**
## **BART FINE-TUNING MODEL**

In [None]:
!pip install transformers rouge_score
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BartTokenizer, 
    BartForConditionalGeneration,
    Trainer, 
    TrainingArguments
)
from datasets import load_metric
from tqdm.auto import tqdm
import wandb
import evaluate

# Seed NumPy and PyTorch with the same fixed value so runs are repeatable.
np.random.seed(42)
torch.manual_seed(42)

# Use CUDA when PyTorch reports it as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

In [None]:
# The CSVs are read with header=None, so pandas numbers the columns 0..N-1;
# the names below are assigned to each frame right after it is loaded.
# NOTE(review): if the files actually start with a header row, that row is
# read as data here -- confirm against the CSV contents.
columns = ['id', 'article', 'summary']

train_df = pd.read_csv('../data/train.csv', header=None)
train_df.columns = columns

test_df = pd.read_csv('../data/test.csv', header=None)
test_df.columns = columns

val_df = pd.read_csv('../data/validation.csv', header=None)
val_df.columns = columns

def preprocess_text(text):
    """Return *text* with newlines removed and whitespace runs collapsed.

    Newlines are turned into spaces, the string is split on whitespace, and
    the pieces are re-joined with single spaces, which also drops leading
    and trailing whitespace.
    """
    words = text.replace('\n', ' ').split()
    return ' '.join(words)

def clean_article_heading(article):
    """Strip the "By . <author> . PUBLISHED: ... | . UPDATED: ... ." byline.

    Args:
        article: Article text (a ``str``).

    Returns:
        The article with every match of the byline pattern removed and
        leading/trailing whitespace stripped. Text without a byline is
        returned unchanged apart from the strip.
    """
    # Imported locally because the file's import cell does not import `re`;
    # without this the function raises NameError on first use.
    import re

    pattern = r'By\s*\.\s*.*?\s*\.\s*PUBLISHED:\s*\.\s*\d+:\d+\s*EST,\s*\d+\s*[A-Za-z]+\s*\d+\s*\.\s*\|\s*\.\s*UPDATED:\s*\.\s*\d+:\d+\s*EST,\s*\d+\s*[A-Za-z]+\s*\d+\s*\.'
    cleaned_text = re.sub(pattern, '', article)
    return cleaned_text.strip()

# Apply the same cleaning to every split: whitespace normalisation on both
# columns, plus byline removal on the article column only.
for _split_df in (train_df, test_df, val_df):
    _split_df['article'] = (
        _split_df['article']
        .apply(preprocess_text)
        .apply(clean_article_heading)
    )
    _split_df['summary'] = _split_df['summary'].apply(preprocess_text)

# The original line used BertTokenizer, which is never imported (NameError)
# and does not match the BART checkpoints used below. Load the BART
# tokenizer instead, using the class the import cell already provides.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

print(f"Training dataframe shape: {train_df.shape}")
print(f"Test dataframe shape: {test_df.shape}")
print(f"Validation dataframe shape: {val_df.shape}")

In [None]:
# Down-sample each split to a fixed number of rows; random_state=42 makes the
# selection repeatable. DataFrame.sample raises ValueError if a split has
# fewer rows than requested.
train_df, test_df, val_df = (
    train_df.sample(n=20000, random_state=42),
    test_df.sample(n=2000, random_state=42),
    val_df.sample(n=2000, random_state=42),
)

In [None]:
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes (article, summary) pairs on access.

    Args:
        articles: Indexable sequence of article texts.
        summaries: Indexable sequence of reference summaries, aligned with
            ``articles`` by position.
        tokenizer: Callable tokenizer; it is invoked with ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` keywords, and
            with ``text_target=`` for the summaries. Its result must expose
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Padded/truncated token length of the article.
        max_target_length: Padded/truncated token length of the summary
            (previously hard-coded to 128).
    """

    def __init__(self, articles, summaries, tokenizer, max_length=1024,
                 max_target_length=128):
        self.articles = articles
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of articles."""
        return len(self.articles)

    def __getitem__(self, idx):
        """Tokenize item ``idx``.

        Returns:
            dict with 1-D tensors ``input_ids`` and ``attention_mask`` (length
            ``max_length``) and ``labels`` (length ``max_target_length``),
            where padded label positions are set to -100.
        """
        article = str(self.articles[idx])
        summary = str(self.summaries[idx])

        inputs = self.tokenizer(
            article,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # `text_target=` replaces the deprecated `as_target_tokenizer()`
        # context manager for encoding the decoder-side text.
        targets = self.tokenizer(
            text_target=summary,
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Mark padding as -100 so the loss skips it; compute_metrics in this
        # file already maps -100 back to the pad token before decoding.
        labels = targets['input_ids'].flatten().clone()
        labels[targets['attention_mask'].flatten() == 0] = -100

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': labels
        }

### **BART BASE**

In [None]:
# Load the pretrained BART-base tokenizer and model; the model is moved to
# the selected device.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base').to(device)

# Create datasets: one NewsDataset per split, built from the raw column arrays
# and sharing the tokenizer loaded above.
train_dataset, val_dataset, test_dataset = (
    NewsDataset(frame['article'].values, frame['summary'].values, tokenizer)
    for frame in (train_df, val_df, test_df)
)

# Training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm which spelling the installed version accepts.
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./bart-news-summarizer",
    logging_dir='./logs',
    # Schedule and optimisation
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch size 4 per device with 4 accumulation steps
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    # Step-based logging, evaluation and checkpointing
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=1000,
)

# Rouge metric for evaluation (loaded through the `evaluate` package)
rouge = evaluate.load('rouge')

def compute_metrics(pred):
    """Decode predictions and labels and score them with ROUGE.

    Args:
        pred: Evaluation output exposing ``predictions`` and ``label_ids``.
            ``predictions`` may be token ids (2-D), raw logits (3-D), or a
            tuple whose first element is one of those.

    Returns:
        dict with the ``rouge1``, ``rouge2`` and ``rougeL`` scores.
    """
    pred_ids = pred.predictions

    # The base Trainer returns model outputs, not generated ids: take the
    # first element of a tuple and reduce vocabulary logits to token ids.
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]
    pred_ids = np.asarray(pred_ids)
    if pred_ids.ndim == 3:
        pred_ids = pred_ids.argmax(axis=-1)

    # -100 is not a valid token id; map it to the pad token before decoding.
    # np.where builds new arrays, so pred.label_ids is not modified in place.
    pad_id = tokenizer.pad_token_id
    pred_ids = np.where(pred_ids == -100, pad_id, pred_ids)
    labels_ids = np.where(pred.label_ids == -100, pad_id, pred.label_ids)

    # Decode predictions and labels
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # Compute ROUGE scores
    rouge_scores = rouge.compute(
        predictions=pred_str,
        references=label_str,
        use_stemmer=True
    )

    return {
        'rouge1': rouge_scores['rouge1'],
        'rouge2': rouge_scores['rouge2'],
        'rougeL': rouge_scores['rougeL'],
    }

In [None]:
def _logits_to_token_ids(logits, labels):
    """Reduce per-batch model outputs to greedy token ids.

    Trainer calls this on every evaluation batch before caching the result.
    Without it the full vocabulary logits are accumulated and handed to
    compute_metrics, which cannot be decoded as token ids. ``labels`` is part
    of the callback signature and is unused here.
    """
    if isinstance(logits, tuple):
        # The first element holds the LM logits; the rest is dropped.
        logits = logits[0]
    return logits.argmax(dim=-1)


# NOTE: predictions scored during evaluation are therefore teacher-forced
# argmax tokens, not beam-search output as produced by model.generate().
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=_logits_to_token_ids,
)

# Train the model
trainer.train()

In [None]:
# Evaluate on the test split and print every returned metric to 4 decimals.
test_results = trainer.evaluate(test_dataset)
print("\nTest Results:")
for metric in test_results:
    value = test_results[metric]
    print(f"{metric}: {value:.4f}")

# Save the model
trainer.save_model("./bart-news-summarizer-final")


In [None]:
def generate_summary(article, *, max_input_length=1024, num_beams=4,
                     min_length=30, max_length=128):
    """Generate a summary for one article with the module-level model.

    Args:
        article: Article text to summarize.
        max_input_length: Token limit for the article; longer input is
            truncated.
        num_beams: Beam width passed to ``model.generate``.
        min_length: Minimum length passed to ``model.generate``.
        max_length: Maximum length passed to ``model.generate``.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Put the inputs on the model's own device rather than relying on the
    # global `device` still matching where the model lives.
    inputs = tokenizer(
        article,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    ).to(model.device)
    summary_ids = model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly instead of leaving generate() to infer it.
        attention_mask=inputs["attention_mask"],
        num_beams=num_beams,
        min_length=min_length,
        max_length=max_length,
        early_stopping=True
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Test on a sample article: summarize the first test row and print it beside
# the reference summary. Only the first 200 characters of the article are shown.
_first_row = test_df.iloc[0]
sample_article = _first_row['article']
generated_summary = generate_summary(sample_article)
print("\nSample Summary Generation:")
print("Original Article:", sample_article[:200], "...")
print("Generated Summary:", generated_summary)
print("Original Summary:", _first_row['summary'])

### **BART LARGE**