# Post Date Distribution Analysis

This notebook analyzes the distribution of posts across dates, identifying dates with and without posts,
and calculating post frequency statistics. It generates visualizations and reports to help understand
posting patterns over time.

## Features
- Analyzes post distribution across dates for individual sources:
  - Identifies gaps in posting (dates without posts)
  - Calculates daily post frequency statistics
  - Analyzes posts with significant discussions (configurable reply threshold)
  - Supports thread identification by URL or title
- Generates cross-platform comparisons:
  - Aggregates post frequencies across all sources
  - Creates unified daily activity heatmaps
- Creates detailed visualizations including:
  - Timeline of posts per source
  - Post frequency heatmaps
  - Gap analysis visualization
  - Combined source activity heatmaps
- Outputs are organized in timestamped directories (relative to the working directory):
  - `outputs/[source]_[timestamp]/images/` - All visualizations
  - `outputs/[source]_[timestamp]/data/` - CSV data files
  - `outputs/[source]_[timestamp]/reports/` - Markdown analysis reports
  - `outputs/source_comparisons/` - Cross-source analysis visualizations

## Requirements
- Elasticsearch connection (configured via environment variables)
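- Python packages: pandas, matplotlib, seaborn, plus the project's own `scraper` package (`calendar` is part of the Python standard library and needs no installation)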

## Setup

In [1]:
from scraper.config import settings
from scraper.outputs import ElasticsearchOutput
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from pathlib import Path

def setup_output_directories(slug):
    """
    Build the on-disk folder layout for one analysis run.

    A run folder named ``<slug>_<YYYYmmdd_HHMMSS>`` is created under
    ``outputs`` (relative to the current working directory), together with
    its ``images``, ``data`` and ``reports`` subfolders.

    Returns a dict with the keys 'base', 'images', 'data' and 'reports',
    each mapping to the corresponding Path.
    """
    outputs_root = Path('outputs')
    run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    run_dir = outputs_root / f"{slug}_{run_stamp}"

    layout = {'base': run_dir}
    for kind in ('images', 'data', 'reports'):
        layout[kind] = run_dir / kind

    # Parents first: the outputs root, then the run folder, then its children.
    for folder in (outputs_root, *layout.values()):
        folder.mkdir(exist_ok=True)

    return layout

async def get_thread_sizes(es_output, domain, thread_identifier='thread_url'):
    """
    Count the replies of every thread in a domain using a terms aggregation.

    Args:
        es_output: ElasticsearchOutput instance whose ``es`` client runs the search
        domain: Value matched against the ``domain.keyword`` field
        thread_identifier: 'thread_url' or 'title' - the field threads are grouped by

    Returns:
        Dictionary mapping each thread identifier value to its document
        count minus one (the original post is not counted as a reply)

    Raises:
        ValueError: If thread_identifier is neither 'thread_url' nor 'title'
    """
    allowed_identifiers = ('thread_url', 'title')
    if thread_identifier not in allowed_identifiers:
        raise ValueError("thread_identifier must be either 'thread_url' or 'title'")

    # Only documents of this domain that actually carry the grouping field.
    filters = [
        {"term": {"domain.keyword": domain}},
        {"exists": {"field": thread_identifier}},
    ]
    bucket_spec = {"field": f"{thread_identifier}.keyword", "size": 10000}
    body = {
        "size": 0,  # aggregation buckets only, no individual hits
        "query": {"bool": {"must": filters}},
        "aggs": {"threads": {"terms": bucket_spec}},
    }

    response = es_output.es.search(index=settings.DEFAULT_INDEX, body=body)

    reply_counts = {}
    for bucket in response['aggregations']['threads']['buckets']:
        # One document per thread is the original post itself.
        reply_counts[bucket['key']] = bucket['doc_count'] - 1
    return reply_counts

async def fetch_post_dates(domain, min_replies=None, thread_identifier='thread_url'):
    """
    Fetch original posts for a domain from Elasticsearch and extract their dates.

    Args:
        domain: The domain to fetch posts from
        min_replies: Optional minimum number of replies; when given, only
            posts whose thread has at least this many replies are returned
        thread_identifier: Either 'thread_url' or 'title' to determine how threads are identified

    Returns:
        DataFrame with one row per post and the columns 'id', 'created_at'
        (a datetime.date), 'url', 'title' and 'author'. The frame has no
        rows and no columns when nothing matches.
    """
    es_output = ElasticsearchOutput()
    await es_output._initialize()

    # try/finally so the connection is released even when a query or the
    # date parsing raises.
    try:
        must_conditions = [
            {"term": {"type.keyword": "original_post"}},
            {"term": {"domain.keyword": domain}},
            {"exists": {"field": "created_at"}},
        ]

        if min_replies is not None:
            thread_sizes = await get_thread_sizes(es_output, domain, thread_identifier)
            active_threads = [
                thread_id
                for thread_id, count in thread_sizes.items()
                if count >= min_replies
            ]
            # Threads keyed by URL are matched against the post's own URL,
            # threads keyed by title against the post's title.
            filter_field = "url.keyword" if thread_identifier == 'thread_url' else "title.keyword"
            must_conditions.append({"terms": {filter_field: active_threads}})

        query = {
            "query": {"bool": {"must": must_conditions}},
            # NOTE: a single request, so at most 10000 posts are returned.
            "size": 10000,
            "sort": [{"created_at": "asc"}]
        }

        results = es_output.es.search(index=settings.DEFAULT_INDEX, body=query)

        data = []
        for hit in results['hits']['hits']:
            doc = hit['_source']
            # fromisoformat() on older Pythons rejects a trailing 'Z', so
            # spell the UTC offset out explicitly.
            created_at = datetime.fromisoformat(doc['created_at'].replace('Z', '+00:00'))
            authors = doc.get('authors')
            data.append({
                'id': hit['_id'],
                'created_at': created_at.date(),
                'url': doc.get('url', 'unknown'),
                'title': doc.get('title', 'unknown'),
                'author': authors[0] if authors else 'unknown'
            })
    finally:
        await es_output._cleanup()

    return pd.DataFrame(data)

def analyze_date_distribution(df, min_replies=None):
    """
    Analyze the distribution of posts across dates.

    Args:
        df: DataFrame with a 'created_at' column holding one calendar date
            per post (datetime.date objects or midnight timestamps)
        min_replies: Accepted for call compatibility; not used here

    Returns:
        Tuple ``(date_range, posts_per_date, date_status, stats)``:
        - date_range: dict with 'start', 'end' and 'total_days' (inclusive)
        - posts_per_date: Series of post counts, indexed by the dates that
          have at least one post
        - date_status: Series of post counts for every day from start to
          end (0 on days without posts), indexed by a daily DatetimeIndex
        - stats: dict of summary figures (totals, maximum and averages)

    Raises:
        ValueError: If df has no rows
    """
    if df.empty:
        raise ValueError("Cannot analyze an empty DataFrame: there are no posts to distribute across dates")

    # Get date range
    created = df['created_at']
    first_day, last_day = created.min(), created.max()
    date_range = {
        'start': first_day,
        'end': last_day,
        'total_days': (last_day - first_day).days + 1
    }

    # Calculate posts per date
    posts_per_date = df.groupby('created_at').size()

    # Generate the complete date range and fill the gaps with zero.
    # The counts are re-keyed to timestamps first so that they line up with
    # the DatetimeIndex explicitly, instead of relying on implicit
    # date -> Timestamp coercion during label-based assignment.
    all_dates = pd.date_range(first_day, last_day, freq='D')
    daily_counts = posts_per_date.copy()
    daily_counts.index = pd.to_datetime(daily_counts.index)
    date_status = daily_counts.reindex(all_dates, fill_value=0)

    # Calculate statistics
    total_posts = len(df)
    active_days = len(posts_per_date)
    stats = {
        'total_posts': total_posts,
        'total_days': date_range['total_days'],
        'days_with_posts': active_days,
        'days_without_posts': date_range['total_days'] - active_days,
        'max_posts_per_day': posts_per_date.max(),
        'avg_posts_per_day': total_posts / date_range['total_days'],
        'avg_posts_on_active_days': total_posts / active_days
    }

    return date_range, posts_per_date, date_status, stats

def generate_timeline_plot(date_status, output_path, source=None, min_replies=None):
    """
    Generate a timeline plot showing posts per day.

    Args:
        date_status: Series of post counts indexed by date
        output_path: Path the PNG image is written to
        source: Source name shown (upper-cased) in the title. When omitted,
            a module-level ``source`` variable is used if one exists.
        min_replies: Reply threshold mentioned in the title. When omitted,
            a module-level ``min_replies`` variable is used if one exists.
    """
    # Callers that pass only (date_status, output_path) rely on the notebook
    # globals of the same names; keep that working, but fall back to a plain
    # title instead of failing with a NameError when they are not defined.
    if source is None:
        source = globals().get('source')
    if min_replies is None:
        min_replies = globals().get('min_replies')

    title = 'Posts per Day Timeline'
    if source is not None:
        title += f' - {source.upper()}'
    if min_replies:
        title += f" (Posts with {min_replies}+ replies)"

    plt.figure(figsize=(15, 6))
    try:
        # Plot the data
        plt.bar(date_status.index, date_status.values, alpha=0.6)

        # Customize the plot
        plt.title(title)
        plt.xlabel('Date')
        plt.ylabel('Number of Posts')
        plt.grid(True, alpha=0.3)

        # Rotate x-axis labels for better readability
        plt.xticks(rotation=45)

        plt.tight_layout()
        plt.savefig(output_path, dpi=300)
    finally:
        # Release the figure even when plotting or saving fails, so figures
        # do not pile up across sources.
        plt.close()

def generate_heatmap(date_status, output_path, source=None, min_replies=None):
    """
    Generate a calendar heatmap of posting activity.

    Rows are (year, month) pairs, columns are days of the month and each
    cell holds the number of posts on that day.

    Args:
        date_status: Series of post counts indexed by date
        output_path: Path the PNG image is written to
        source: Source name shown (upper-cased) in the title. When omitted,
            a module-level ``source`` variable is used if one exists.
        min_replies: Reply threshold mentioned in the title. When omitted,
            a module-level ``min_replies`` variable is used if one exists.

    Raises:
        ValueError: If date_status has no entries
    """
    if len(date_status) == 0:
        raise ValueError("date_status is empty; there is nothing to plot")

    # Callers that pass only (date_status, output_path) rely on the notebook
    # globals of the same names; keep that working, but fall back to a plain
    # title instead of failing with a NameError when they are not defined.
    if source is None:
        source = globals().get('source')
    if min_replies is None:
        min_replies = globals().get('min_replies')

    title = 'Post Frequency Calendar Heatmap'
    if source is not None:
        title += f' - {source.upper()}'
    if min_replies:
        title += f" (Posts with {min_replies}+ replies)"

    # Prepare data for heatmap: one record per day, split into components
    records = [
        {'year': day.year, 'month': day.month, 'day': day.day, 'count': count}
        for day, count in date_status.items()
    ]

    # Create pivot table for heatmap
    pivot_table = pd.DataFrame(records).pivot_table(
        values='count',
        index=['year', 'month'],
        columns='day',
        aggfunc='sum'
    )

    # Show annotations only if the approximate cell size (in inches) is
    # large enough to keep the numbers readable (threshold can be adjusted)
    fig_size_inches = (15, 8)
    n_rows, n_cols = pivot_table.shape
    cell_size = min(fig_size_inches[1] / n_rows, fig_size_inches[0] / n_cols)
    show_annot = cell_size >= 0.3

    # Generate heatmap
    plt.figure(figsize=fig_size_inches)
    try:
        sns.heatmap(
            pivot_table,
            cmap='YlOrRd',
            annot=show_annot,
            fmt='.0f',
            cbar_kws={'label': 'Number of Posts'},
            # Smaller font for better readability when annotations are shown
            annot_kws={'size': 8} if show_annot else {}
        )

        plt.title(title)
        plt.xlabel('Day of Month')
        plt.ylabel('Year-Month')

        plt.tight_layout()
        plt.savefig(output_path, dpi=300)
    finally:
        # Release the figure even when plotting or saving fails
        plt.close()

def generate_gap_analysis_plot(date_status, output_path, source=None, min_replies=None):
    """
    Generate a visualization of posting gaps.

    A gap is a run of consecutive entries in date_status whose count is 0.
    Each gap is drawn as one bar placed at the gap's first date, with the
    gap length (number of entries) as its height.

    Args:
        date_status: Series of post counts indexed by date
        output_path: Path the PNG image is written to
        source: Source name shown (upper-cased) in the title. When omitted,
            a module-level ``source`` variable is used if one exists.
        min_replies: Reply threshold mentioned in the title. When omitted,
            a module-level ``min_replies`` variable is used if one exists.
    """
    # Callers that pass only (date_status, output_path) rely on the notebook
    # globals of the same names; keep that working, but fall back to a plain
    # title instead of failing with a NameError when they are not defined.
    if source is None:
        source = globals().get('source')
    if min_replies is None:
        min_replies = globals().get('min_replies')

    title = 'Posting Gaps Analysis'
    if source is not None:
        title += f' - {source.upper()}'
    if min_replies:
        title += f" (Posts with {min_replies}+ replies)"

    # Find sequences of days without posts
    gaps = []
    gap_start = None
    gap_length = 0
    for day, count in date_status.items():
        if count == 0:
            if gap_start is None:
                gap_start = day
            gap_length += 1
        elif gap_length > 0:
            # A day with posts closes the gap that was open
            gaps.append({'start': gap_start, 'length': gap_length})
            gap_start = None
            gap_length = 0

    # Add the last gap if the series ends inside one
    if gap_length > 0:
        gaps.append({'start': gap_start, 'length': gap_length})

    # Create visualization
    plt.figure(figsize=(15, 6))
    try:
        for gap in gaps:
            plt.bar(gap['start'], gap['length'], width=1, alpha=0.6)

        plt.title(title)
        plt.xlabel('Date')
        plt.ylabel('Gap Length (Days)')
        plt.grid(True, alpha=0.3)

        plt.xticks(rotation=45)

        plt.tight_layout()
        plt.savefig(output_path, dpi=300)
    finally:
        # Release the figure even when plotting or saving fails
        plt.close()

def generate_markdown_report(slug, date_range, stats, df, output_dirs, min_replies=None):
    """
    Render the analysis results as a markdown document.

    The report has three sections: the analysis period, the posting
    statistics, and links to the three plot images (referenced relative to
    the reports folder as ``../images/<name>.png``).

    ``df`` and ``output_dirs`` are accepted but not used when building the
    text. Returns the markdown as a single string.
    """
    heading_note = f" (Posts with {min_replies}+ replies)" if min_replies else ""

    parts = [
        f"# Post Date Analysis for {slug}{heading_note}\n\n",
        # Period covered by the data
        "## Analysis Period\n\n",
        f"- Start Date: {date_range['start']}\n",
        f"- End Date: {date_range['end']}\n",
        f"- Total Days: {date_range['total_days']}\n\n",
        # Summary figures
        "## Posting Statistics\n\n",
        f"- Total Posts: {stats['total_posts']}\n",
        f"- Days with Posts: {stats['days_with_posts']}\n",
        f"- Days without Posts: {stats['days_without_posts']}\n",
        f"- Maximum Posts in a Day: {stats['max_posts_per_day']}\n",
        f"- Average Posts per Day: {stats['avg_posts_per_day']:.2f}\n",
        f"- Average Posts on Active Days: {stats['avg_posts_on_active_days']:.2f}\n\n",
        # Embedded plots
        "## Visualizations\n\n",
        "### Posts Timeline\n",
        "![Timeline Plot](../images/timeline_plot.png)\n\n",
        "### Post Frequency Heatmap\n",
        "![Heatmap Plot](../images/heatmap_plot.png)\n\n",
        "### Posting Gaps Analysis\n",
        "![Gap Analysis Plot](../images/gap_analysis_plot.png)\n\n",
    ]
    return "".join(parts)

def generate_csv(df):
    """
    Render the post table as CSV text.

    Rows are ordered by 'created_at' and then by 'id'; the DataFrame index
    is not written. Returns the CSV content as a string.
    """
    sort_keys = ['created_at', 'id']
    return df.sort_values(by=sort_keys).to_csv(index=False)

## Usage

In [None]:
domains = {
    # "delvingbitcoin": "https://delvingbitcoin.org/",
    "bitcoindev": "https://mailing-list.bitcoindevs.xyz/bitcoindev/",
    # "bitcoindev-legacy": "https://lists.linuxfoundation.org/pipermail/bitcoin-dev/",
    # "lightning-dev": "https://lists.linuxfoundation.org/pipermail/lightning-dev/",
}

# Set minimum replies filter (set to None to see all posts)
min_replies = None  # Change this value to adjust the minimum replies threshold
thread_identifier = 'title'  # Use either 'thread_url' or 'title'

print(f"index: {settings.DEFAULT_INDEX}")
print(f"minimum replies filter: {min_replies}")
print("\nStarting analysis for all sources...")

# Process each source
for source, domain in domains.items():
    try:
        print(f"\nProcessing {source}...")
        print(f"domain: {domain}")
        
        # Setup output directories
        output_dirs = setup_output_directories(source)
        
        # Fetch and analyze data
        df = await fetch_post_dates(domain, min_replies, thread_identifier)
        
        if df.empty:
            print(f"No data found for {source}, skipping...")
            continue
            
        date_range, posts_per_date, date_status, stats = analyze_date_distribution(df, min_replies)

        # Adjust title suffix based on configuration
        title_filter = f"{min_replies}+ replies (by {thread_identifier})" if min_replies else ""
        title_suffix = f" (Posts with {title_filter})" if title_filter else ""
        
        # Update titles with filter information if applied
        title_suffix = f" (Posts with {min_replies}+ replies)" if min_replies else ""
        
        # Generate visualizations
        generate_timeline_plot(date_status, output_dirs['images'] / 'timeline_plot.png')
        generate_heatmap(date_status, output_dirs['images'] / 'heatmap_plot.png')
        generate_gap_analysis_plot(date_status, output_dirs['images'] / 'gap_analysis_plot.png')
        
        # Generate and save markdown report
        markdown_report = generate_markdown_report(source, date_range, stats, df, output_dirs, min_replies)
        with open(output_dirs['reports'] / f"{source}_analysis.md", "w") as f:
            f.write(markdown_report)
        
        # Generate and save CSV
        csv_output = generate_csv(df)
        with open(output_dirs['data'] / f"{source}_date_analysis.csv", "w") as f:
            f.write(csv_output)
        
        print(f"Analysis complete for {source}. Files saved in {output_dirs['base']}")
        
    except Exception as e:
        print(f"Error processing {source}: {str(e)}")
        continue

print("\nAll sources processed.")

## Multi-Source Comparison

In [4]:
from pathlib import Path

def load_source_data(csv_paths):
    """
    Read one CSV per source and stack them into a single table.

    Args:
        csv_paths: Dict mapping source names to CSV file paths

    Returns:
        DataFrame holding the rows of every file, with 'created_at' parsed
        to datetimes, a 'source' column carrying the dict key the row came
        from, and a fresh 0..n-1 index
    """
    def _read_one(label, csv_file):
        # Load a single file and tag its rows with the source label.
        frame = pd.read_csv(csv_file)
        frame['created_at'] = pd.to_datetime(frame['created_at'])
        frame['source'] = label
        return frame

    frames = [_read_one(label, csv_file) for label, csv_file in csv_paths.items()]
    return pd.concat(frames, ignore_index=True)

def generate_aggregate_daily_heatmap(combined_df, output_path):
    """
    Generate a heatmap showing total post frequency across all sources per day.

    Rows are "YYYY-MM" months, columns are days of the month and each cell
    holds the number of rows of combined_df dated on that day.

    Args:
        combined_df: DataFrame with a 'created_at' column (datetimes or
            values pandas can parse as such). It is not modified.
        output_path: Path the PNG image is written to

    Raises:
        ValueError: If combined_df contains no usable 'created_at' value
    """
    # Parse into a separate Series so the caller's DataFrame is left
    # untouched; rows without a date cannot be placed on the calendar.
    created = pd.to_datetime(combined_df['created_at']).dropna()
    if created.empty:
        raise ValueError("combined_df has no valid 'created_at' values; there is nothing to plot")

    # Count posts per calendar day
    daily_counts = (
        pd.DataFrame({
            'year': created.dt.year,
            'month': created.dt.month,
            'day': created.dt.day,
        })
        .groupby(['year', 'month', 'day'])
        .size()
        .reset_index(name='count')
    )

    # Create pivot table for heatmap
    pivot_table = daily_counts.pivot_table(
        values='count',
        index=['year', 'month'],
        columns='day',
        fill_value=0
    )

    # Rename index for better display
    pivot_table.index = [f"{y}-{m:02d}" for y, m in pivot_table.index]

    # Size the figure by the number of months, with a floor so that a short
    # period still leaves room for the title and the axis labels.
    n_rows, n_cols = pivot_table.shape
    fig_width = 20
    fig_height = max(3.0, n_rows * 0.4)

    # Annotate cells only when they are wide enough (in inches) to read
    cell_width = fig_width / n_cols
    show_annot = cell_width >= 0.3

    plt.figure(figsize=(fig_width, fig_height))
    try:
        sns.heatmap(
            pivot_table,
            cmap='YlOrRd',
            annot=show_annot,
            fmt='.0f',
            cbar_kws={'label': 'Total Posts Across All Sources'},
            annot_kws={'size': 6} if show_annot else {}
        )

        plt.title('Daily Post Frequency (Aggregated Across All Sources)')
        plt.xlabel('Day of Month')
        plt.ylabel('Year-Month')

        plt.tight_layout()
        plt.savefig(output_path, dpi=300)
    finally:
        # Release the figure even when plotting or saving fails
        plt.close()

## Example usage for multi-source comparison

In [None]:
import glob

# Locate the most recent CSV of every source.
# Run folders are named "<source>_<YYYYmmdd_HHMMSS>", so sorting the matches
# puts the newest run last. To compare specific runs instead, build
# csv_paths by hand, e.g.
#   csv_paths = {'BITCOIN-DEV': 'outputs/bitcoindev_YYYYMMDD_HHMMSS/data/bitcoindev_date_analysis.csv'}
base_dir = Path('outputs')
source_patterns = {
    'BITCOIN-DEV-LEGACY': 'bitcoindev-legacy_*/data/*_date_analysis.csv',
    'BITCOIN-DEV': 'bitcoindev_*/data/*_date_analysis.csv',
    'LIGHTNING-DEV': 'lightning-dev_*/data/*_date_analysis.csv',
    'DELVING-BITCOIN': 'delvingbitcoin_*/data/*_date_analysis.csv'
}

csv_paths = {}
for label, pattern in source_patterns.items():
    matches = sorted(base_dir.glob(pattern))
    if matches:
        csv_paths[label] = str(matches[-1])
    else:
        # A source that was never analyzed is skipped rather than aborting
        # the whole comparison with an IndexError.
        print(f"No CSV found for {label} (pattern: {base_dir / pattern}), skipping...")

if csv_paths:
    # Create comparison directory
    comparison_dir = Path('outputs/source_comparisons')
    comparison_dir.mkdir(exist_ok=True, parents=True)

    # Load and combine data
    combined_df = load_source_data(csv_paths)

    # Generate comparison heatmap
    generate_aggregate_daily_heatmap(
        combined_df,
        comparison_dir / f'source_comparison_heatmap_{datetime.now().strftime("%Y%m%d_%H%M%S")}.png'
    )

    print(f"Multi-source comparison completed. Files saved in {comparison_dir}")
else:
    print("No source CSV files found under 'outputs'; run the single-source analysis first.")