# Paraphrase Generation System - Complete Pipeline

This notebook demonstrates the complete pipeline for the Paraphrase Generation System, including:

1. **Data Cleaning & Preprocessing** - Loading and preparing PAWS dataset
2. **Model Training** - Fine-tuning FLAN-T5 with LoRA
3. **Paraphrase Generation** - Generating paraphrases with both the CPG (Custom Paraphrase Generator, the fine-tuned model) and the LLM baseline
4. **Score Comparison** - Comparing models using BLEU, ROUGE, and semantic similarity metrics

## Setup and Imports

In [None]:
import os
import sys
import time
import warnings
warnings.filterwarnings('ignore')

# Put the parent of the current working directory at the front of the import
# search path (only once) so project packages can be imported from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

# Core imports
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Report the installed PyTorch build and the accelerator that would be picked
# (CUDA first, then Apple MPS, otherwise CPU).
print(f"PyTorch version: {torch.__version__}")
if torch.cuda.is_available():
    detected_device = "cuda"
elif torch.backends.mps.is_available():
    detected_device = "mps"
else:
    detected_device = "cpu"
print(f"Device: {detected_device}")

---
## 1. Data Cleaning & Preprocessing

We use a high-quality paraphrase dataset:
- **PAWS** (Paraphrase Adversaries from Word Scrambling): Challenging paraphrase pairs with high lexical overlap

### 1.1 Text Preprocessor

In [None]:
import re
from typing import List, Optional

class TextPreprocessor:
    """Clean, length-check, and truncate text before paraphrase generation.

    Args:
        min_words: Minimum whitespace-delimited word count accepted by
            ``is_valid_length``.
        max_words: Maximum word count accepted by ``is_valid_length`` and the
            default limit used by ``truncate_text``. ``None`` means "no upper
            limit"; any falsy value makes ``prepare_for_model`` skip truncation.
        remove_special_chars: When true, ``clean_text`` removes every character
            that is not a word character, whitespace, or basic punctuation.
    """

    # Compiled once at class-definition time instead of on every call.
    _WHITESPACE_RUN = re.compile(r'\s+')
    _DISALLOWED_CHARS = re.compile(r'[^\w\s.,!?;:\'"()-]')

    def __init__(self, min_words: int = 10, max_words: Optional[int] = 400, remove_special_chars: bool = False):
        self.min_words = min_words
        self.max_words = max_words
        self.remove_special_chars = remove_special_chars

    def clean_text(self, text: str) -> str:
        """Return ``text`` with special characters optionally removed and whitespace normalized.

        Character removal runs before whitespace collapsing so that deleting a
        symbol surrounded by spaces ("a @ b") does not leave a double space.
        """
        if self.remove_special_chars:
            text = self._DISALLOWED_CHARS.sub('', text)
        return self._WHITESPACE_RUN.sub(' ', text).strip()

    def count_words(self, text: str) -> int:
        """Return the number of whitespace-delimited tokens in ``text``."""
        return len(text.split())

    def is_valid_length(self, text: str) -> bool:
        """Return True when the word count lies within ``[min_words, max_words]``.

        With ``max_words=None`` only the lower bound is enforced.
        """
        word_count = self.count_words(text)
        if self.max_words is None:
            return self.min_words <= word_count
        return self.min_words <= word_count <= self.max_words

    def truncate_text(self, text: str, max_words: Optional[int] = None) -> str:
        """Return at most ``max_words`` words of ``text``, joined by single spaces.

        Args:
            text: Text to shorten.
            max_words: Word limit for this call; ``None`` falls back to
                ``self.max_words``. An explicit ``0`` is honored (empty
                result) and a negative limit is treated as ``0``.

        Returns:
            The (possibly shortened) text. If both limits are ``None`` the
            words are re-joined without truncation.
        """
        # `is None` rather than `or`: an explicit 0 must not fall back to the default.
        limit = self.max_words if max_words is None else max_words
        words = text.split()
        if limit is not None and len(words) > limit:
            words = words[:max(limit, 0)]
        return ' '.join(words)

    def prepare_for_model(self, text: str, prefix: str = "paraphrase: ") -> str:
        """Clean ``text``, truncate it to ``self.max_words`` if set, and prepend ``prefix``."""
        text = self.clean_text(text)
        if self.max_words:
            text = self.truncate_text(text)
        return f"{prefix}{text}"

# Initialize preprocessor
# Shared notebook-level instance; later cells call its clean_text() method.
preprocessor = TextPreprocessor(min_words=10, max_words=400)
print("‚úÖ TextPreprocessor initialized")

### 1.2 Load Datasets

In [None]:
def load_paws_dataset(max_samples: Optional[int] = None):
    """Fetch the PAWS ``labeled_final`` train split and keep only true paraphrases.

    Args:
        max_samples: Upper bound on the number of positive pairs to keep.
            A falsy value (``None`` or ``0``) keeps every positive pair.

    Returns:
        A dict with ``"source"`` (the ``sentence1`` column) and ``"target"``
        (the ``sentence2`` column) of the retained rows.
    """
    print("Loading PAWS dataset...")
    full_split = load_dataset("paws", "labeled_final", split="train", trust_remote_code=True)

    # Rows labelled 1 are the paraphrase pairs; all other rows are dropped.
    dataset = full_split.filter(lambda row: row["label"] == 1)

    if max_samples:
        keep = min(max_samples, len(dataset))
        dataset = dataset.select(range(keep))

    print(f"  ‚úÖ Loaded {len(dataset)} positive paraphrase pairs from PAWS")
    source_column, target_column = dataset["sentence1"], dataset["sentence2"]
    return {"source": source_column, "target": target_column}

# Load dataset (using small sample for demo)
# MAX_SAMPLES caps how many positive pairs load_paws_dataset() keeps.
MAX_SAMPLES = 1000  # Adjust for full training

paws_data = load_paws_dataset(max_samples=MAX_SAMPLES)

### 1.3 Data Cleaning

In [None]:
def clean_and_filter_data(data, preprocessor, min_words: int = 5):
    """Clean every source/target pair and drop pairs that are too short.

    Args:
        data: Mapping with parallel ``"source"`` and ``"target"`` sequences of
            strings. Pairs are matched positionally with ``zip``, so any
            surplus items in the longer sequence are ignored.
        preprocessor: Object providing ``clean_text(str) -> str``.
        min_words: Minimum whitespace-delimited word count that BOTH the
            cleaned source and the cleaned target must reach to be kept.
            Defaults to 5, the threshold this function has always applied.

    Returns:
        A dict with ``"source"`` and ``"target"`` lists of the cleaned texts
        of the retained pairs, in their original order.
    """
    kept_pairs = []
    for src, tgt in zip(data["source"], data["target"]):
        clean_src = preprocessor.clean_text(src)
        clean_tgt = preprocessor.clean_text(tgt)

        # A pair survives only if both sides are long enough.
        if len(clean_src.split()) >= min_words and len(clean_tgt.split()) >= min_words:
            kept_pairs.append((clean_src, clean_tgt))

    return {
        "source": [source_text for source_text, _ in kept_pairs],
        "target": [target_text for _, target_text in kept_pairs],
    }

# Run the cleaning pass over the loaded PAWS pairs and report how many survive.
print("\nüìù Cleaning dataset...")
paws_clean = clean_and_filter_data(paws_data, preprocessor)

print(f"  PAWS: {len(paws_data['source'])} ‚Üí {len(paws_clean['source'])} samples")

# Later cells work with these two parallel sequences.
all_sources, all_targets = paws_clean["source"], paws_clean["target"]

print(f"\n‚úÖ Total samples: {len(all_sources)}")

### 1.4 Explore Cleaned Data

In [None]:
# Show sample pairs
# Preview at most three pairs. Slicing (instead of indexing 0..2) keeps this
# cell from raising IndexError when fewer than three pairs survived cleaning.
print("üìã Sample Paraphrase Pairs:\n")
for i, (sample_source, sample_target) in enumerate(zip(all_sources[:3], all_targets[:3])):
    print(f"Pair {i+1}:")
    # Only the first 100 characters of each side are shown.
    print(f"  Source: {sample_source[:100]}...")
    print(f"  Target: {sample_target[:100]}...")
    print()

### 1.5 Create Train/Validation/Test Splits

In [None]:
from datasets import Dataset, DatasetDict
import random

# Deterministic shuffle: permute the row indices with a fixed seed, then apply
# the same permutation to both columns so source/target stay aligned.
random.seed(42)
indices = list(range(len(all_sources)))
random.shuffle(indices)

shuffled_sources = [all_sources[idx] for idx in indices]
shuffled_targets = [all_targets[idx] for idx in indices]

# Cut points for an 80% / 10% / 10% train / validation / test partition.
n = len(shuffled_sources)
train_end, val_end = int(0.8 * n), int(0.9 * n)
split_bounds = {
    "train": (0, train_end),
    "validation": (train_end, val_end),
    "test": (val_end, n),
}

datasets = DatasetDict({
    split_name: Dataset.from_dict({
        "source": shuffled_sources[start:stop],
        "target": shuffled_targets[start:stop],
    })
    for split_name, (start, stop) in split_bounds.items()
})

# Keep the per-split handles available under their own names as well.
train_data, val_data, test_data = (
    datasets[split_name] for split_name in ("train", "validation", "test")
)

print(f"üìä Dataset Splits:")
print(f"  Train: {len(datasets['train'])} samples")
print(f"  Validation: {len(datasets['validation'])} samples")
print(f"  Test: {len(datasets['test'])} samples")

---
## 2. Model Training

We fine-tune FLAN-T5-base using LoRA (Low-Rank Adaptation) for parameter-efficient training.

### 2.1 Initialize Base Model with LoRA

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType

MODEL_NAME = "google/flan-t5-base"

print(f"Loading {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# LoRA hyper-parameters, collected in one place before building the config.
lora_settings = {
    "r": 16,                       # LoRA rank
    "lora_alpha": 32,              # LoRA alpha
    "lora_dropout": 0.1,           # Dropout
    "target_modules": ["q", "v"],  # Target attention modules
    "task_type": TaskType.SEQ_2_SEQ_LM,
}
lora_config = LoraConfig(**lora_settings)

# Wrap the base model with the adapters and report the trainable-parameter count.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Pick the accelerator: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
model.to(device)
print(f"\n‚úÖ Model loaded on {device}")

### 2.2 Create PyTorch Dataset

In [None]:
from torch.utils.data import Dataset as TorchDataset
from typing import Dict

class ParaphraseDataset(TorchDataset):
    """Map-style PyTorch dataset that tokenizes source/target paraphrase pairs.

    Args:
        dataset: Indexable collection whose items expose ``"source"`` and
            ``"target"`` strings.
        tokenizer: Callable tokenizer that returns ``"input_ids"`` and
            ``"attention_mask"`` tensors and exposes ``pad_token_id``.
        max_input_length: Token length the prefixed source is padded or
            truncated to.
        max_output_length: Token length the target is padded or truncated to.
    """

    def __init__(self, dataset, tokenizer, max_input_length=512, max_output_length=512):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length
        self.preprocessor = TextPreprocessor()

    def __len__(self):
        """Return the number of pairs in the wrapped dataset."""
        return len(self.dataset)

    def _encode(self, text, max_length):
        """Tokenize ``text`` to exactly ``max_length`` tokens (pad or truncate)."""
        return self.tokenizer(
            text, max_length=max_length,
            padding="max_length", truncation=True, return_tensors="pt"
        )

    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for pair ``idx``.

        The source is cleaned and prefixed with ``"paraphrase: "``; the target
        is tokenized as-is. Padding positions in ``labels`` are set to -100.
        """
        item = self.dataset[idx]
        source = self.preprocessor.prepare_for_model(item["source"], "paraphrase: ")
        target = item["target"]

        source_encoding = self._encode(source, self.max_input_length)
        target_encoding = self._encode(target, self.max_output_length)

        # squeeze(0) removes only the leading batch axis. A bare squeeze() would
        # also collapse the sequence axis when a max length of 1 is configured,
        # yielding a 0-d tensor instead of a length-1 sequence.
        labels = target_encoding["input_ids"].squeeze(0)
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": source_encoding["input_ids"].squeeze(0),
            "attention_mask": source_encoding["attention_mask"].squeeze(0),
            "labels": labels
        }

# Create datasets
# Wrap the train and validation splits; both max lengths use the class defaults.
train_dataset = ParaphraseDataset(datasets["train"], tokenizer)
val_dataset = ParaphraseDataset(datasets["validation"], tokenizer)

print(f"‚úÖ Created PyTorch datasets")
print(f"  Train: {len(train_dataset)} samples")
print(f"  Validation: {len(val_dataset)} samples")

### 2.3 Training Configuration

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq, EarlyStoppingCallback

# Training arguments
# Mixed precision: use bfloat16 when the GPU supports it and never float16.
# T5-family checkpoints are a documented case of activations overflowing to
# inf/NaN under fp16 mixed precision; bf16 keeps fp32's exponent range. On
# hardware without bf16 support training falls back to full fp32.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# NOTE(review): evaluation and checkpointing fire every 200 optimizer steps.
# A run shorter than that (e.g. ~800 training pairs / (4 x 4) effective batch
# x 3 epochs is roughly 150 steps on one device) never evaluates or saves, so
# early stopping and load_best_model_at_end have nothing to act on. Confirm
# the schedule before relying on it for small demo runs.
training_args = TrainingArguments(
    output_dir="../outputs/checkpoints",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,   # effective batch of 16 per device
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    bf16=use_bf16,
    fp16=False,
    logging_dir="../outputs/logs",
    logging_steps=50,
    save_steps=200,
    eval_steps=200,
    eval_strategy="steps",
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,         # lower eval_loss is better
    report_to=[],
    remove_unused_columns=False
)

# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    return_tensors="pt"
)

print("‚úÖ Training configuration set")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Learning rate: {training_args.learning_rate}")

### 2.4 Train the Model

In [None]:
# Build the Trainer from the model, arguments, datasets and collator defined
# in the preceding cells; training stops early after 3 evaluations without
# improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    callbacks=[early_stopping],
)

print("üöÄ Starting training...")
print(f"  Training samples: {len(train_dataset)}")
print(f"  Validation samples: {len(val_dataset)}")
print()

# Train (uncomment to run full training)
# train_result = trainer.train()
# print("\n‚úÖ Training completed!")

# The demo leaves the training call disabled and only announces that fact.
print("‚ö†Ô∏è Training skipped for demo. Uncomment above to run full training.")

### 2.5 Save Trained Model

In [None]:
# Save the model (after training)
# Same directory that the model-loading cell looks for via FINETUNED_PATH.
SAVE_PATH = "../outputs/checkpoints/final_model"

# Uncomment after training:
# model.save_pretrained(SAVE_PATH)
# tokenizer.save_pretrained(SAVE_PATH)
# print(f"‚úÖ Model saved to {SAVE_PATH}")

# With the save calls disabled, this cell only reports the intended location.
print(f"Model would be saved to: {SAVE_PATH}")

---
## 3. Paraphrase Generation

Generate paraphrases using both models:
- **CPG (Custom Paraphrase Generator)**: Fine-tuned FLAN-T5-base with LoRA
- **LLM Baseline**: FLAN-T5-large (zero-shot)

### 3.1 Load Models

In [None]:
# For this demo, we'll use the models from the src package
from src.models.cpg_model import CustomParaphraseGenerator
from src.models.llm_baseline import LLMBaseline

# Location where a previously fine-tuned CPG checkpoint is expected.
FINETUNED_PATH = "../outputs/checkpoints/final_model"

if not os.path.exists(FINETUNED_PATH):
    # No checkpoint on disk: construct the generator from the base model name.
    print("Fine-tuned model not found. Using base model with LoRA...")
    cpg_model = CustomParaphraseGenerator(
        model_name="google/flan-t5-base",
        use_lora=True,
    )
else:
    print("Loading fine-tuned CPG model...")
    cpg_model = CustomParaphraseGenerator.load(FINETUNED_PATH)

print("\nLoading LLM Baseline (FLAN-T5-large)...")
llm_model = LLMBaseline(model_name="google/flan-t5-large")

print("\n‚úÖ Both models loaded successfully!")

### 3.2 Sample Text for Paraphrasing

In [None]:
# Sample text (cover letter excerpt)
# Multi-line literal; .strip() removes the leading and trailing newlines.
sample_text = """
A cover letter is a formal document that accompanies your resume when you apply for a job. 
It serves as an introduction and provides additional context for your application. 
The primary purpose of a cover letter is to introduce yourself to the hiring manager 
and to provide context for your resume. It allows you to elaborate on your qualifications, 
skills, and experiences in a way that your resume may not fully capture. It's also an 
opportunity to express your enthusiasm for the role and the company, and to explain why 
you would be a good fit.
""".strip()

# Clean the sample
# clean_text collapses the embedded line breaks and repeated spaces into single spaces.
sample_text = preprocessor.clean_text(sample_text)

print(f"üìÑ Sample Text ({len(sample_text.split())} words):")
print("-" * 70)
print(sample_text)

### 3.3 Generate Paraphrases - CPG Model

In [None]:
print("üîÑ Generating paraphrase with CPG (Fine-tuned FLAN-T5)...")
print()

# Measure latency
start_time = time.perf_counter()
cpg_paraphrase = cpg_model.generate(
    sample_text,
    max_length=512,
    min_length_ratio=0.8,
    num_beams=4,
    temperature=0.7,
    length_penalty=2.0
)
cpg_latency = time.perf_counter() - start_time

print(f"üìù CPG Output ({len(cpg_paraphrase.split())} words):")
print("-" * 70)
print(cpg_paraphrase)
print(f"\n‚è±Ô∏è Latency: {cpg_latency:.3f}s")

### 3.4 Generate Paraphrases - LLM Baseline

In [None]:
print("üîÑ Generating paraphrase with LLM Baseline (FLAN-T5-large)...")
print()

# Measure latency
start_time = time.perf_counter()
llm_paraphrase = llm_model.generate(
    sample_text,
    max_length=512,
    num_beams=4,
    temperature=0.7,
    length_penalty=1.0
)
llm_latency = time.perf_counter() - start_time

print(f"üìù LLM Output ({len(llm_paraphrase.split())} words):")
print("-" * 70)
print(llm_paraphrase)
print(f"\n‚è±Ô∏è Latency: {llm_latency:.3f}s")

### 3.5 Side-by-Side Comparison

In [None]:
def _preview(text, limit=300):
    """Return text unchanged, or its first `limit` characters followed by an ellipsis."""
    if len(text) > limit:
        return text[:limit] + "..."
    return text


banner = "=" * 80
divider = "-" * 80

print(banner)
print("PARAPHRASE COMPARISON")
print(banner)

# Each section: a header with word count (and latency for the models),
# a divider, then a preview of the text.
print(f"\nüìÑ ORIGINAL ({len(sample_text.split())} words):")
print(divider)
print(_preview(sample_text))

print(f"\nüîµ CPG Model ({len(cpg_paraphrase.split())} words, {cpg_latency:.3f}s):")
print(divider)
print(_preview(cpg_paraphrase))

print(f"\nüü¢ LLM Baseline ({len(llm_paraphrase.split())} words, {llm_latency:.3f}s):")
print(divider)
print(_preview(llm_paraphrase))

---
## 4. Score Comparison & Evaluation

Compare the models using multiple metrics:
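- **BLEU Score**: Measures n-gram precision of the output against the reference (computed with sacreBLEU, so it is reported on a 0–100 scale)
- **ROUGE Score**: Measures recall-oriented n-gram overlap with the reference (ROUGE-1, ROUGE-2 and ROUGE-L, each on a 0–1 scale)
- **Semantic Similarity**: Cosine similarity between TF-IDF vectors of the output and the reference (a lexical-overlap proxy for meaning, 0–1 scale)
- **Length Ratio**: Output word count divided by reference word count, i.e. how well the output length is preserved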

### 4.1 Initialize Metrics Calculator

In [None]:
import evaluate as hf_evaluate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class MetricsCalculator:
    """Compute BLEU, ROUGE, TF-IDF cosine similarity and length statistics.

    Every ``calculate_*`` method takes two parallel sequences of strings,
    ``predictions`` (model outputs) and ``references`` (texts to compare
    against), and returns a flat dict mapping metric name to a number.
    """

    def __init__(self):
        # Both scorers are loaded through the `evaluate` package.
        self.bleu_scorer = hf_evaluate.load("sacrebleu")
        self.rouge_scorer = hf_evaluate.load("rouge")

    def calculate_bleu(self, predictions, references):
        """Return ``{"bleu": score}`` using the sacreBLEU scorer.

        Each reference is wrapped in its own single-element list before
        being handed to the scorer.
        """
        refs = [[ref] for ref in references]
        result = self.bleu_scorer.compute(predictions=predictions, references=refs)
        return {"bleu": result["score"]}

    def calculate_rouge(self, predictions, references):
        """Return the ``rouge1``, ``rouge2`` and ``rougeL`` values from the ROUGE scorer."""
        result = self.rouge_scorer.compute(predictions=predictions, references=references)
        return {
            "rouge1": result["rouge1"],
            "rouge2": result["rouge2"],
            "rougeL": result["rougeL"]
        }

    def calculate_semantic_similarity(self, predictions, references):
        """Return the mean TF-IDF cosine similarity over all pairs.

        A fresh vectorizer is fitted on each (reference, prediction) pair.
        Pairs for which vectorization raises ``ValueError`` score 0.0, and an
        empty input yields 0.0 instead of NaN.
        """
        similarities = []
        for pred, ref in zip(predictions, references):
            vectorizer = TfidfVectorizer()
            try:
                tfidf_matrix = vectorizer.fit_transform([ref, pred])
                sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
                similarities.append(float(sim))
            except ValueError:
                # Best-effort: treat an un-vectorizable pair as completely dissimilar.
                similarities.append(0.0)
        mean_similarity = float(np.mean(similarities)) if similarities else 0.0
        return {"semantic_similarity": mean_similarity}

    def calculate_length_ratio(self, predictions, references):
        """Return the average prediction/reference word-count ratio and preservation rate.

        Pairs whose reference has no words are skipped. The preservation rate
        is the share of remaining pairs whose ratio is at least 0.8. When no
        pair remains (empty input or only empty references) both values are
        0.0 rather than raising ``ZeroDivisionError``.
        """
        ratios = []
        for pred, ref in zip(predictions, references):
            ref_len = len(ref.split())
            if ref_len > 0:
                ratios.append(len(pred.split()) / ref_len)

        if not ratios:
            return {"avg_length_ratio": 0.0, "length_preservation_rate": 0.0}

        preserved = sum(1 for ratio in ratios if ratio >= 0.8)
        return {
            "avg_length_ratio": float(np.mean(ratios)),
            "length_preservation_rate": preserved / len(ratios)
        }

    def calculate_all(self, predictions, references):
        """Return one dict combining BLEU, ROUGE, similarity and length metrics."""
        results = {}
        results.update(self.calculate_bleu(predictions, references))
        results.update(self.calculate_rouge(predictions, references))
        results.update(self.calculate_semantic_similarity(predictions, references))
        results.update(self.calculate_length_ratio(predictions, references))
        return results

# Instantiating the calculator loads the sacrebleu and rouge scorers.
metrics_calc = MetricsCalculator()
print("‚úÖ Metrics Calculator initialized")

### 4.2 Calculate Metrics for Both Models

In [None]:
# Calculate metrics for single sample
# The original input text is the reference for both models, so every score
# measures closeness to the input.
references = [sample_text]
cpg_predictions = [cpg_paraphrase]
llm_predictions = [llm_paraphrase]

print("üìä Calculating metrics for CPG model...")
cpg_metrics = metrics_calc.calculate_all(cpg_predictions, references)

print("üìä Calculating metrics for LLM model...")
llm_metrics = metrics_calc.calculate_all(llm_predictions, references)

print("\n‚úÖ Metrics calculated!")

### 4.3 Display Comparison Results

In [None]:
# Create comparison DataFrame
# Columns are filled row by row through _add_comparison_row below.
comparison_data = {
    "Metric": [],
    "CPG (Fine-tuned)": [],
    "LLM Baseline": [],
    "Winner": []
}


def _pick_winner(cpg_score, llm_score):
    """Return 'CPG', 'LLM' or 'Tie' for scores where a larger value is better."""
    if cpg_score > llm_score:
        return "CPG"
    if llm_score > cpg_score:
        return "LLM"
    return "Tie"


def _add_comparison_row(metric_name, cpg_cell, llm_cell, winner_label):
    """Append one row to the four parallel column lists of comparison_data."""
    comparison_data["Metric"].append(metric_name)
    comparison_data["CPG (Fine-tuned)"].append(cpg_cell)
    comparison_data["LLM Baseline"].append(llm_cell)
    comparison_data["Winner"].append(winner_label)


for metric in cpg_metrics.keys():
    cpg_val = cpg_metrics[metric]
    llm_val = llm_metrics[metric]

    # Determine winner (higher is better for most metrics)
    if metric == "avg_length_ratio":
        # Closer to 1.0 is better, so compare negated distances from 1.0.
        # Equal distances are reported as a tie instead of defaulting to LLM.
        winner = _pick_winner(-abs(cpg_val - 1.0), -abs(llm_val - 1.0))
    else:
        winner = _pick_winner(cpg_val, llm_val)
    _add_comparison_row(metric, f"{cpg_val:.4f}", f"{llm_val:.4f}", winner)

# Add latency comparison (lower is better, hence the negation).
_add_comparison_row(
    "latency (s)",
    f"{cpg_latency:.4f}",
    f"{llm_latency:.4f}",
    _pick_winner(-cpg_latency, -llm_latency),
)

# Add speedup
# Speedup is derived from the two latencies above, so it gets no winner of its
# own; otherwise the latency result would be counted twice when wins are tallied.
speedup = llm_latency / cpg_latency if cpg_latency > 0 else 0
_add_comparison_row("speedup", f"{speedup:.2f}x", "1.00x", "-")

comparison_df = pd.DataFrame(comparison_data)

print("=" * 80)
print("MODEL COMPARISON RESULTS")
print("=" * 80)
print()
print(comparison_df.to_string(index=False))

### 4.4 Visualize Results

In [None]:
import matplotlib.pyplot as plt

# Extract numeric metrics for plotting
plot_metrics = ["bleu", "rouge1", "rouge2", "rougeL", "semantic_similarity"]


def _plot_value(metrics, name):
    """Return metrics[name] on a 0-1 scale for plotting.

    The "bleu" entry comes from sacreBLEU and is reported on a 0-100 scale,
    while the ROUGE and similarity entries are 0-1 fractions. Plotting them on
    one axis unscaled would flatten the other bars, so BLEU is divided by 100
    for the chart only; the stored metric is left untouched.
    """
    value = metrics[name]
    return value / 100.0 if name == "bleu" else value


cpg_values = [_plot_value(cpg_metrics, m) for m in plot_metrics]
llm_values = [_plot_value(llm_metrics, m) for m in plot_metrics]
tick_labels = ["BLEU / 100" if m == "bleu" else m.upper() for m in plot_metrics]

# Create bar chart
x = np.arange(len(plot_metrics))
width = 0.35

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Quality Metrics: two bars per metric, offset half a bar width either side.
ax1 = axes[0]
ax1.bar(x - width/2, cpg_values, width, label='CPG (Fine-tuned)', color='#2196F3')
ax1.bar(x + width/2, llm_values, width, label='LLM Baseline', color='#4CAF50')

ax1.set_xlabel('Metric')
ax1.set_ylabel('Score')
ax1.set_title('Text Quality Metrics Comparison')
ax1.set_xticks(x)
ax1.set_xticklabels(tick_labels, rotation=45, ha='right')
ax1.legend()
ax1.grid(axis='y', alpha=0.3)

# Latency Comparison
ax2 = axes[1]
latencies = [cpg_latency, llm_latency]
bars = ax2.bar(['CPG (Fine-tuned)', 'LLM Baseline'], latencies,
               color=['#2196F3', '#4CAF50'])
ax2.set_ylabel('Latency (seconds)')
ax2.set_title('Latency Comparison')
ax2.grid(axis='y', alpha=0.3)

# Add value labels
for bar, lat in zip(bars, latencies):
    ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f'{lat:.3f}s', ha='center', va='bottom')

plt.tight_layout()

# The output directory is otherwise only created by training, which this demo
# skips; create it here so savefig does not fail on a fresh checkout.
figure_path = '../outputs/model_comparison.png'
os.makedirs(os.path.dirname(figure_path), exist_ok=True)
plt.savefig(figure_path, dpi=150, bbox_inches='tight')
plt.show()

print("\nüìä Visualization saved to outputs/model_comparison.png")

### 4.5 Summary

In [None]:
# Tally how often each model appears in the Winner column of the comparison table.
winner_counts = comparison_df['Winner'].value_counts()
cpg_wins = winner_counts.get('CPG', 0)
llm_wins = winner_counts.get('LLM', 0)

rule = "=" * 80
original_words = len(sample_text.split())
cpg_words = len(cpg_paraphrase.split())
llm_words = len(llm_paraphrase.split())

print(rule)
print("SUMMARY")
print(rule)
print()
print(f"üìä Overall Results:")
print(f"   CPG (Fine-tuned) wins: {cpg_wins} metrics")
print(f"   LLM Baseline wins: {llm_wins} metrics")
print()
print(f"‚ö° Performance:")
print(f"   CPG Latency: {cpg_latency:.3f}s")
print(f"   LLM Latency: {llm_latency:.3f}s")
print(f"   Speedup: {speedup:.2f}x faster")
print()
print(f"üìù Length Preservation:")
print(f"   Original: {original_words} words")
print(f"   CPG Output: {cpg_words} words ({cpg_metrics['avg_length_ratio']:.1%} ratio)")
print(f"   LLM Output: {llm_words} words ({llm_metrics['avg_length_ratio']:.1%} ratio)")
print()
print(rule)
print("‚úÖ Pipeline Complete!")
print(rule)

---
## 5. Batch Evaluation on Test Set

In [None]:
# Score both models on the first NUM_TEST_SAMPLES rows of the held-out split.
NUM_TEST_SAMPLES = 10  # Adjust for more comprehensive evaluation

test_split = datasets['test']
test_sources = test_split['source'][:NUM_TEST_SAMPLES]
# NOTE(review): test_targets is loaded but not used below; the metrics compare
# model outputs against the source texts. Confirm that this is intended.
test_targets = test_split['target'][:NUM_TEST_SAMPLES]

print(f"üìä Evaluating on {NUM_TEST_SAMPLES} test samples...\n")

# One generate() call per source text, using each model's default options.
print("Generating CPG paraphrases...")
cpg_outputs = list(map(cpg_model.generate, test_sources))

print("Generating LLM paraphrases...")
llm_outputs = list(map(llm_model.generate, test_sources))

# Calculate metrics
print("\nCalculating metrics...")
cpg_batch_metrics = metrics_calc.calculate_all(cpg_outputs, test_sources)
llm_batch_metrics = metrics_calc.calculate_all(llm_outputs, test_sources)

table_rule = "=" * 60
print("\n" + table_rule)
print(f"BATCH EVALUATION RESULTS ({NUM_TEST_SAMPLES} samples)")
print(table_rule)
print(f"{'Metric':<25} {'CPG':<15} {'LLM':<15}")
print("-" * 60)
for metric, cpg_score in cpg_batch_metrics.items():
    print(f"{metric:<25} {cpg_score:<15.4f} {llm_batch_metrics[metric]:<15.4f}")