# Long Document Summarization - Data Exploration

This notebook explores the datasets and analyzes document characteristics.

In [None]:
import json
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

# Make modules in the parent directory importable from this notebook
sys.path.append('..')

# Plot defaults: seaborn "whitegrid" theme and 12x6 figures
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Processed Data

In [None]:
def load_dataset(dataset_name, split='train'):
    """Load one split of a processed dataset from disk.

    Reads ``../data/processed/<dataset_name>/<split>.json``, resolved
    relative to the current working directory, and returns the parsed JSON.

    Args:
        dataset_name: Name of the dataset sub-directory.
        split: Name of the split file to read, without the ``.json`` suffix.

    Returns:
        The decoded JSON content, or ``None`` (after printing a notice) when
        the split file does not exist.
    """
    data_path = Path('../data/processed') / dataset_name / f'{split}.json'

    if not data_path.exists():
        print(f"Dataset {dataset_name} not found")
        return None

    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which would break on non-ASCII document text on some systems.
    with open(data_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Datasets to explore; only those whose train split loads with content are kept
datasets = ['arxiv', 'pubmed', 'multi_news', 'booksum', 'billsum']
loaded_data = {}

for name in datasets:
    train_split = load_dataset(name, 'train')
    if not train_split:
        # Missing file (None) or empty content: leave this dataset out
        continue
    loaded_data[name] = train_split
    print(f"Loaded {name}: {len(train_split)} samples")

## 2. Dataset Statistics

In [None]:
# Build one summary row per loaded dataset
stats = []

for name, samples in loaded_data.items():
    # A sample that lacks a field contributes 0 for that field
    tokens = [doc.get('token_count', 0) for doc in samples]
    paragraphs = [doc.get('num_paragraphs', 0) for doc in samples]
    sentences = [doc.get('num_sentences', 0) for doc in samples]

    row = {'Dataset': name, 'Samples': len(samples)}
    row['Avg Tokens'] = np.mean(tokens)
    row['Std Tokens'] = np.std(tokens)
    row['Min Tokens'] = np.min(tokens)
    row['Max Tokens'] = np.max(tokens)
    row['Avg Paragraphs'] = np.mean(paragraphs)
    row['Avg Sentences'] = np.mean(sentences)
    stats.append(row)

stats_df = pd.DataFrame(stats)
# Bare expression so the notebook renders the table as the cell output
stats_df

## 3. Visualize Length Distributions

In [None]:
# Plot token count distributions, one panel per loaded dataset
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# zip() stops at the shorter input, so datasets beyond the grid size are not drawn
for ax, (dataset_name, data) in zip(axes, loaded_data.items()):
    # A sample without a 'token_count' field is counted as 0 tokens
    token_counts = [sample.get('token_count', 0) for sample in data]

    ax.hist(token_counts, bins=50, edgecolor='black', alpha=0.7)
    ax.set_title(f'{dataset_name.title()} - Token Distribution')
    ax.set_xlabel('Token Count')
    ax.set_ylabel('Frequency')
    # Dashed red line marks the mean token count
    ax.axvline(np.mean(token_counts), color='r', linestyle='--', label='Mean')
    ax.legend()

# The grid has six panels but there may be fewer datasets; hide the unused
# panels so no empty axes frames are rendered.
for ax in axes[len(loaded_data):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 4. Analyze Document Structure

In [None]:
# Print a paragraph-count summary for every loaded dataset
for name, samples in loaded_data.items():
    # A sample without a 'num_paragraphs' field is counted as 0
    counts = [doc.get('num_paragraphs', 0) for doc in samples]
    mean_count = np.mean(counts)
    median_count = np.median(counts)
    lowest, highest = np.min(counts), np.max(counts)

    report = [
        f"\n{name.upper()}:",
        f"  Average paragraphs: {mean_count:.1f}",
        f"  Median paragraphs: {median_count:.1f}",
        f"  Min-Max: {lowest}-{highest}",
    ]
    print("\n".join(report))

## 5. Sample Documents

In [None]:
# Preview the first sample of the first non-empty dataset only (kept brief)
preview = next(
    ((name, docs[0]) for name, docs in loaded_data.items() if len(docs) > 0),
    None,
)

if preview is not None:
    name, sample = preview
    banner = '=' * 80

    print(f"\n{banner}")
    print(f"Sample from {name.upper()}")
    print(banner)
    print(f"Tokens: {sample.get('token_count', 0)}")
    print(f"Paragraphs: {sample.get('num_paragraphs', 0)}")
    print(f"Sentences: {sample.get('num_sentences', 0)}")

    # Show at most the first 500 characters of the first paragraph
    if len(sample.get('paragraphs', ())) > 0:
        print("\nFirst paragraph:")
        print(sample['paragraphs'][0][:500] + '...')

## 6. Save Summary Statistics

In [None]:
# Save statistics to CSV
stats_path = Path('../data/processed/dataset_statistics.csv')
# to_csv does not create missing directories; the folder may be absent when
# no processed dataset was found, so create it first.
stats_path.parent.mkdir(parents=True, exist_ok=True)
stats_df.to_csv(stats_path, index=False)
print("Statistics saved to data/processed/dataset_statistics.csv")