# Summary Efficiency Analysis

This notebook analyzes the efficiency of text summarization by comparing summary lengths to original content lengths across different posts and threads. It generates comprehensive visualizations and statistics to evaluate summarization performance.

## Features
- Calculates and visualizes summary-to-content length ratios
- Generates thread-specific and overall statistics
- Creates detailed plots including:
  - Distribution of summary ratios
  - Thread comparison scatter plots
  - Individual thread analysis plots
- Outputs are organized in timestamped directories (relative to the working directory):
  - `outputs/[source]_[timestamp]/images/` - All visualizations
  - `outputs/[source]_[timestamp]/data/` - CSV data files
  - `outputs/[source]_[timestamp]/reports/` - Markdown analysis reports

## Requirements
- Elasticsearch connection (configured via environment variables)
- Python packages: pandas, matplotlib, seaborn, tabulate (needed by pandas `DataFrame.to_markdown`)

## Setup

In [2]:
from scraper.config import settings
from scraper.outputs import ElasticsearchOutput
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
from datetime import datetime
from pathlib import Path

def setup_output_directories(slug):
    """
    Build the on-disk folder layout for one analysis run.

    A run folder named ``<slug>_<YYYYmmdd_HHMMSS>`` is created under the
    relative ``outputs`` directory, together with ``images``, ``data`` and
    ``reports`` subfolders. Folders that already exist are reused.

    Returns a dict mapping 'base', 'images', 'data' and 'reports' to the
    corresponding ``Path`` objects ('base' is the run folder itself).
    """
    outputs_root = Path('outputs')
    run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    run_root = outputs_root / f"{slug}_{run_stamp}"

    layout = {'base': run_root}
    for kind in ('images', 'data', 'reports'):
        layout[kind] = run_root / kind

    # mkdir is non-recursive, so create parents before their children:
    # outputs/ -> run folder -> the three subfolders.
    for folder in (outputs_root, *layout.values()):
        folder.mkdir(exist_ok=True)

    return layout

async def analyze_post_summaries(domain):
    """Fetch summarized posts for ``domain`` and measure summary vs. body length.

    Searches ``settings.DEFAULT_INDEX`` for documents that have both a
    ``body`` and a ``summary`` field, whose ``domain.keyword`` equals
    ``domain`` and whose ``type.keyword`` is not ``combined-summary``.
    Lengths are counts of whitespace-separated words.

    Args:
        domain: Value matched against the ``domain.keyword`` field.

    Returns:
        A ``pandas.DataFrame`` with one row per document whose body contains
        at least one word, and the columns ``id``, ``body_length``,
        ``summary_length``, ``ratio`` (summary_length / body_length),
        ``type``, ``thread_url`` and ``author``. The columns are present
        even when no document qualifies.
    """
    columns = [
        'id', 'body_length', 'summary_length', 'ratio',
        'type', 'thread_url', 'author',
    ]

    query = {
        "query": {
            "bool": {
                "must": [
                    {"exists": {"field": "body"}},
                    {"exists": {"field": "summary"}},
                    {"term": {"domain.keyword": domain}}
                ],
                "must_not": [
                    {"term": {"type.keyword": "combined-summary"}}
                ]
            }
        },
        # NOTE: a single request for at most 10000 hits; documents beyond
        # that number are not fetched.
        "size": 10000
    }

    es_output = ElasticsearchOutput()
    await es_output._initialize()
    try:
        results = await es_output.es.search(index=settings.DEFAULT_INDEX, body=query)
    finally:
        # Always release the client, even when the search itself fails.
        await es_output._cleanup()

    data = []
    for hit in results['hits']['hits']:
        doc = hit['_source']
        body_length = len(doc['body'].split())
        summary_length = len(doc['summary'].split())

        # A body without words would make the ratio undefined; skip it.
        if body_length == 0:
            continue

        authors = doc.get('authors')
        data.append({
            'id': hit['_id'],
            'body_length': body_length,
            'summary_length': summary_length,
            'ratio': summary_length / body_length,
            'type': doc.get('type', 'unknown'),
            'thread_url': doc.get('thread_url', 'unknown'),
            # First listed author, or 'unknown' when the list is missing/empty.
            'author': authors[0] if authors else 'unknown',
        })

    # Explicit columns keep the schema stable when ``data`` is empty.
    return pd.DataFrame(data, columns=columns)

def analyze_data(df):
    """Compute overall, per-type and per-thread summary/body ratio statistics.

    Args:
        df: DataFrame with at least the columns ``ratio``, ``thread_url``,
            ``type``, ``body_length`` and ``summary_length``. It is not
            modified.

    Returns:
        A tuple ``(overall_stats, type_stats, thread_stats, distribution, df)``:

        * ``overall_stats``: dict with ``avg_ratio``, ``median_ratio``, ``count``.
        * ``type_stats``: per-``type`` mean/median/count of ``ratio`` and
          mean/median of both length columns.
        * ``thread_stats``: the same aggregates per ``thread_url``, ordered
          by descending post count and indexed 1..N ('Thread Number').
        * ``distribution``: number of rows per ratio bucket
          (0-0.5, 0.5-1, 1-1.5, 1.5-2, 2-inf), in bucket order.
        * ``df``: a copy of the input with an added ``ratio_range`` column.
    """
    # Work on a copy: the 'ratio_range' column added below must not leak
    # into the DataFrame owned by the caller.
    df = df.copy()

    overall_stats = {
        'avg_ratio': df['ratio'].mean(),
        'median_ratio': df['ratio'].median(),
        'count': len(df)
    }

    # Same aggregates for the per-thread and the per-type tables.
    aggregations = {
        'ratio': ['mean', 'median', 'count'],
        'body_length': ['mean', 'median'],
        'summary_length': ['mean', 'median']
    }

    # value_counts() orders threads by descending post count; reindexing the
    # grouped table with that index gives thread_stats the same order.
    thread_counts = df['thread_url'].value_counts()
    thread_stats = df.groupby('thread_url').agg(aggregations).loc[thread_counts.index]

    # Replace the thread_url index by a 1-based running thread number.
    thread_stats = thread_stats.reset_index()
    thread_stats.index = range(1, len(thread_stats) + 1)
    thread_stats.index.name = 'Thread Number'

    type_stats = df.groupby('type').agg(aggregations)

    ratio_ranges = [0, 0.5, 1, 1.5, 2, float('inf')]
    labels = [f"{low}-{high}" for low, high in zip(ratio_ranges, ratio_ranges[1:])]
    # include_lowest=True so a ratio of exactly 0 lands in the first bucket.
    df['ratio_range'] = pd.cut(df['ratio'], bins=ratio_ranges, labels=labels, include_lowest=True)
    distribution = df['ratio_range'].value_counts().sort_index()

    return overall_stats, type_stats, thread_stats, distribution, df


def set_plot_limits(df, ax, padding=0.1):
    """Give ``ax`` identical x and y ranges that fit every point of ``df``.

    Both axes run from 0 to the larger of the maximum ``body_length`` and the
    maximum ``summary_length``, enlarged by the ``padding`` fraction.

    Returns the upper limit that was applied to both axes.
    """
    largest_value = max(df['body_length'].max(), df['summary_length'].max())
    upper = largest_value * (1 + padding)

    # Same range on both axes keeps a 1:1 line on the diagonal.
    for apply_limits in (ax.set_xlim, ax.set_ylim):
        apply_limits(0, upper)

    return upper


def generate_thread_comparison_plot(df, output_path):
    """Save a log-log scatter plot of body length vs. summary length.

    Points are colored by ``thread_url`` (without a legend entry per thread).
    Dashed/dotted reference lines mark where the summary is as long as, half
    as long as, and twice as long as the body. The rows with the five largest
    and five smallest body and summary lengths are annotated with their
    ``body_length,summary_length`` values.

    Args:
        df: DataFrame with ``body_length``, ``summary_length`` and
            ``thread_url`` columns.
        output_path: Destination of the image, written at 300 dpi.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        # Use log scale for both axes
        ax.set_xscale('log')
        ax.set_yscale('log')

        # Plot the data points
        sns.scatterplot(data=df, x='body_length', y='summary_length', hue='thread_url',
                        palette='deep', legend=False, alpha=0.6, ax=ax)

        # Limits span half the minimum (but never below 1, which keeps the
        # log axis away from zero) to twice the maximum.
        min_x, max_x = df['body_length'].min(), df['body_length'].max()
        min_y, max_y = df['summary_length'].min(), df['summary_length'].max()
        ax.set_xlim(max(1, min_x/2), max_x*2)
        ax.set_ylim(max(1, min_y/2), max_y*2)

        # Reference lines are drawn over the x range that both axes share.
        x_low, x_high = ax.get_xlim()
        y_low, y_high = ax.get_ylim()
        lims = [max(x_low, y_low), min(x_high, y_high)]

        # Add 1:1 ratio line
        ax.plot(lims, lims, 'r--', alpha=0.5, zorder=0, label='1:1 Ratio')

        # Add other ratio lines
        ax.plot(lims, [value*0.5 for value in lims], 'k:', alpha=0.5, zorder=0, linewidth=2, label='0.5:1 Ratio')
        ax.plot(lims, [value*2 for value in lims], 'g-.', alpha=0.5, zorder=0, linewidth=2, label='2:1 Ratio')

        # Customize the plot
        ax.set_title('Body Length vs Summary Length Across Threads (Log Scale)')
        ax.set_xlabel('Body Length (words)')
        ax.set_ylabel('Summary Length (words)')
        ax.legend(title='Ratio Lines', loc='upper left')

        # Add grid
        ax.grid(True, which="both", ls="-", alpha=0.2)

        # Annotate the extreme points; a row selected by several criteria
        # is kept only once.
        extremes = pd.concat([
            df.nlargest(5, 'body_length'),
            df.nlargest(5, 'summary_length'),
            df.nsmallest(5, 'body_length'),
            df.nsmallest(5, 'summary_length')
        ]).drop_duplicates()

        for _, row in extremes.iterrows():
            ax.annotate(f"{row['body_length']},{row['summary_length']}",
                        (row['body_length'], row['summary_length']),
                        xytext=(5, 5), textcoords='offset points', fontsize=8, alpha=0.7)

        fig.tight_layout()
        fig.savefig(output_path, dpi=300)
    finally:
        # Close this specific figure even if plotting or saving failed, so
        # failed runs do not accumulate open figures.
        plt.close(fig)

def calculate_thread_statistics(df):
    """Summarize each thread's ratio and pick its presumed original author.

    Returns a DataFrame with one row per ``thread_url`` and the columns
    ``thread_url``, ``mean_ratio``, ``median_ratio`` and ``original_author``.
    """
    per_thread = df.groupby('thread_url').agg(
        mean_ratio=('ratio', 'mean'),
        median_ratio=('ratio', 'median'),
        # Assumes the first row of a thread belongs to the thread starter.
        original_author=('author', 'first'),
    )
    return per_thread.reset_index()

def generate_multi_thread_plot(df, output_path, max_threads=16):
    """Save a grid of per-thread body-length vs. summary-length scatter plots.

    Only threads with more than one post are plotted, the ``max_threads``
    largest ones first. In each subplot the first row of the thread (in
    ``df`` order) is drawn green as the original post, other posts by the
    same author red, everything else blue; a 1:1 line and a line for the
    thread's mean ratio are added.

    NOTE(review): treating the first row as the original post assumes ``df``
    preserves posting order; nothing in this function verifies that.

    Args:
        df: DataFrame with ``thread_url``, ``author``, ``ratio``,
            ``body_length`` and ``summary_length`` columns.
        output_path: Destination of the image, written at 300 dpi.
        max_threads: Maximum number of threads (subplots) to draw.
    """
    # Calculate thread statistics
    thread_stats = calculate_thread_statistics(df)

    # Sort threads by post count and filter out single-post threads
    thread_counts = df['thread_url'].value_counts()
    multi_post_threads = thread_counts[thread_counts > 1]
    top_threads = multi_post_threads.nlargest(max_threads).index

    # Filter dataframe to include only top threads
    df_top = df[df['thread_url'].isin(top_threads)]

    # Smallest square grid that holds every thread; at least 1x1 because
    # plt.subplots rejects a grid with zero rows/columns.
    grid_size = max(1, math.ceil(math.sqrt(len(top_threads))))

    # squeeze=False always yields a 2-D array of Axes. Without it a 1x1 grid
    # returns a bare Axes object, which has no .flatten().
    fig, axes = plt.subplots(grid_size, grid_size,
                             figsize=(4*grid_size, 4*grid_size), squeeze=False)
    try:
        fig.suptitle('Body Length vs Summary Length per Thread (Top Threads)', fontsize=16)
        flat_axes = axes.flatten()

        for i, (ax, thread) in enumerate(zip(flat_axes, top_threads)):
            thread_df = df_top[df_top['thread_url'] == thread]
            thread_stat = thread_stats[thread_stats['thread_url'] == thread].iloc[0]

            # Determine original author
            original_author = thread_stat['original_author']

            # Plot points, differentiating original author and original post
            for j, (_, row) in enumerate(thread_df.iterrows()):
                if j == 0:  # Original post
                    color = 'green'
                    label = 'Original Post'
                elif row['author'] == original_author:
                    color = 'red'
                    label = 'Original Author'
                else:
                    color = 'blue'
                    label = None

                ax.scatter(row['body_length'], row['summary_length'],
                           c=color, alpha=0.6, label=label)

            limit = set_plot_limits(thread_df, ax)

            # Add 1:1 ratio line
            ax.plot([0, limit], [0, limit], 'g--', alpha=0.5, label='1:1 Ratio')

            # Add mean ratio line (more subtle)
            mean_ratio = thread_stat['mean_ratio']
            ax.plot([0, limit], [0, limit * mean_ratio], 'r:', alpha=0.3, label=f'Mean Ratio: {mean_ratio:.2f}')

            ax.set_title(f'Thread {i+1}: {thread_counts[thread]} posts\nMean Ratio: {mean_ratio:.2f}', fontsize=10)
            ax.set_xlabel('Body Length', fontsize=8)
            ax.set_ylabel('Summary Length', fontsize=8)
            ax.tick_params(labelsize=6)

            # Every point was scattered separately, so labels repeat; the
            # dict keeps one handle per label for the legend.
            handles, labels = ax.get_legend_handles_labels()
            by_label = dict(zip(labels, handles))
            desired_labels = ['Original Post', 'Original Author', '1:1 Ratio', f'Mean Ratio: {mean_ratio:.2f}']
            shown_labels = [label for label in desired_labels if label in by_label]
            ax.legend([by_label[label] for label in shown_labels], shown_labels,
                      fontsize=6, loc='upper left')

        # Remove any unused subplots (all of them when nothing was plotted)
        for unused_ax in flat_axes[len(top_threads):]:
            fig.delaxes(unused_ax)

        if len(top_threads) == 0:
            # Still write an image so references to output_path stay valid.
            fig.text(0.5, 0.5, 'No multi-post threads to plot', ha='center', va='center')

        fig.tight_layout()
        fig.savefig(output_path, dpi=300)
    finally:
        # Close this specific figure even if plotting or saving failed.
        plt.close(fig)

def generate_markdown_report(slug, overall_stats, type_stats, thread_stats, distribution, df, output_dirs):
    """Render the analysis as a markdown document and write its plot images.

    Besides building the text, this saves ``distribution_plot.png``,
    ``thread_comparison_plot.png`` and ``multi_thread_plot.png`` into
    ``output_dirs['images']``; the report links to them through
    ``../images/``-relative paths.

    Returns the complete markdown report as a single string.
    """
    parts = [
        f"# Post Summary Analysis for {slug}\n\n",
        "## Overall Statistics\n\n",
        f"- Average summary/body length ratio: {overall_stats['avg_ratio']:.2f}\n",
        f"- Median summary/body length ratio: {overall_stats['median_ratio']:.2f}\n",
        f"- Number of posts analyzed: {overall_stats['count']}\n\n",
        "## Per-type Statistics\n\n",
        type_stats.to_markdown() + "\n\n",
        "## Distribution of summary/body length ratios\n\n",
    ]

    # One bullet per ratio bucket with its share of all analyzed posts.
    for bucket, count in distribution.items():
        percentage = (count / overall_stats['count']) * 100
        parts.append(f"- {bucket}: {count} ({percentage:.2f}%)\n")
    parts.append("\n")

    # Bar chart of the bucket counts.
    plt.figure(figsize=(10, 6))
    distribution.plot(kind='bar')
    plt.title('Distribution of Summary/Body Length Ratios')
    plt.xlabel('Ratio Range')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.savefig(output_dirs['images'] / 'distribution_plot.png')
    plt.close()

    # Scatter plots: all threads together, then one panel per thread.
    generate_thread_comparison_plot(df, output_dirs['images'] / 'thread_comparison_plot.png')
    generate_multi_thread_plot(df, output_dirs['images'] / 'multi_thread_plot.png', max_threads=48)

    # Image links are relative to the reports folder.
    parts.extend([
        "## Visualizations\n\n",
        "### Distribution of Summary/Body Length Ratios\n",
        "![Distribution Plot](../images/distribution_plot.png)\n\n",
        "### Body Length vs Summary Length Across Threads\n",
        "![Thread Comparison Plot](../images/thread_comparison_plot.png)\n\n",
        "### Body Length vs Summary Length per Thread (Top Threads)\n",
        "![Multi-Thread Plot](../images/multi_thread_plot.png)\n\n",
        "## Per-thread Statistics\n\n",
    ])
    parts.append(thread_stats.to_markdown() + "\n\n")

    return "".join(parts)

def generate_csv(df):
    """Return ``df`` as CSV text with posts of the same thread kept together.

    Rows are ordered by ``thread_url`` and then by ``id``; the DataFrame
    index is not written. The input DataFrame is left unchanged.
    """
    ordering = ['thread_url', 'id']
    return df.sort_values(by=ordering).to_csv(index=False)

## Usage

In [None]:
# Short source names mapped to the domain URL that is matched against the
# documents' domain field when querying.
domains = {
    "delvingbitcoin": "https://delvingbitcoin.org/",
    "bitcoindev": "https://mailing-list.bitcoindevs.xyz/bitcoindev/",
    "lightning-dev": "https://lists.linuxfoundation.org/pipermail/lightning-dev/",
}
source = "lightning-dev" # Select the domain you want to analyze
domain = domains[source]

# Show which domain and index this run is going to query.
print(f"domain: {domain}")
print(f"index: {settings.DEFAULT_INDEX}")

# Create the timestamped output folders for this run; `source` is the folder-name prefix
output_dirs = setup_output_directories(source)

# Top-level await: this cell relies on an environment that supports it
# (e.g. an IPython/Jupyter kernel); it is not valid in a plain script.
df = await analyze_post_summaries(domain)


In [None]:
# Compute the statistics; full_df is the data with the added ratio_range column.
overall_stats, type_stats, thread_stats, distribution, full_df = analyze_data(df)
# Generate and save markdown report (this call also writes the plot images)
markdown_report = generate_markdown_report(source, overall_stats, type_stats, thread_stats, distribution, full_df, output_dirs)
# Explicit UTF-8: the report may contain non-ASCII text, and the platform's
# default encoding is not guaranteed to be able to represent it.
with open(output_dirs['reports'] / f"{source}_analysis.md", "w", encoding="utf-8") as f:
    f.write(markdown_report)

In [None]:
# Generate and save CSV
csv_output = generate_csv(full_df)
# newline="" writes the CSV text exactly as generated: it already contains its
# own line terminators, which text-mode newline translation would otherwise
# alter on platforms where the line separator is not "\n".
# Explicit UTF-8 keeps the file independent of the platform default encoding.
with open(output_dirs['data'] / f"{source}_thread_analysis.csv", "w", encoding="utf-8", newline="") as f:
    f.write(csv_output)

print(f"Analysis complete. Files saved in {output_dirs['base']}")