# Analysis Tasks - January 2025

This notebook addresses the tasks from the December 2025 meeting:

## Carry-Over
- [ ] **Iron/EC ratio analysis** (Priority item)

## Plot Updates
- [ ] Fix axis scaling on Delhi plots
- [ ] Create matched-sample plots (only samples with ALL three measurements)
- [ ] Remove Delhi green wavelength outliers and replot time series

## Summary
- [ ] Create cross-plot collage (all plot types, all sites)
- [ ] Create summary table with slopes and R² values

## New Analyses
- [ ] Raw attenuation correlation matrix

## Maybe
- [ ] Hourly-binned wavelength ratio analysis for Angstrom absorption exponent

## 1. Setup and Imports

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# A notebook kernel has no real __file__, so the literal name '__file__' is
# resolved against the current working directory and its parent directory is
# taken as the notebook location. The sibling 'scripts' folder is then put
# first on sys.path (once) so the project modules below can be imported.
notebook_dir = os.path.split(os.path.abspath('__file__'))[0]
scripts_path = os.path.join(notebook_dir, 'scripts')
if sys.path.count(scripts_path) == 0:
    sys.path.insert(0, scripts_path)

# Core imports
from config import SITES, MAC_VALUE
from data_matching import (
    load_aethalometer_data, 
    load_filter_data,
    add_base_filter_id,
    match_by_filter_id,
    match_aeth_filter_data,
    match_hips_with_smooth_raw,
    match_all_parameters  # For getting data with ALL measurements
)
from flow_periods import add_flow_period, print_flow_period_summary
from outliers import (
    apply_exclusion_flags,
    apply_threshold_flags,
    get_clean_data,
    print_exclusion_summary
)
from plotting import PlotConfig, crossplots, timeseries, distributions, comparisons

print("Imports successful!")

In [None]:
# =============================================================================
# UTILITY FUNCTIONS (from Example notebook)
# =============================================================================

# Fixed plotting colour (hex) per site, looked up by the plotting cells below.
SITE_COLORS = dict(
    Beijing='#1f77b4',
    Delhi='#ff7f0e',
    JPL='#2ca02c',
    Addis_Ababa='#d62728',
)

def apply_all_outlier_flags(data_dict, aeth_col='ir_bcc', filter_col='hips_fabs',
                            convert_to_ng=True, verbose=True):
    """Flag outliers on a copy of every site's frame.

    For each site the frame is copied, two working columns are added
    (``aeth_bc`` from ``aeth_col`` and ``filter_ec`` from ``filter_col``, both
    multiplied by 1000 when ``convert_to_ng`` is true), and the project helpers
    ``apply_exclusion_flags`` and ``apply_threshold_flags`` are applied in that
    order. ``is_any_outlier`` is the element-wise OR of ``is_excluded`` and
    ``is_outlier``, which are presumably added by those two helpers.

    Args:
        data_dict: mapping of site name -> DataFrame; inputs are not modified.
        aeth_col: source column for ``aeth_bc``.
        filter_col: source column for ``filter_ec``.
        convert_to_ng: scale both working columns by 1000 when true.
        verbose: call ``print_exclusion_summary`` for every site when true.

    Returns:
        dict mapping site name -> flagged copy of the input frame.
    """
    scale = 1000 if convert_to_ng else 1
    result = {}
    for site_name in data_dict:
        flagged = data_dict[site_name].copy()
        flagged['aeth_bc'] = flagged[aeth_col] * scale
        flagged['filter_ec'] = flagged[filter_col] * scale
        # Date-based exclusions first, then threshold-based flags.
        flagged = apply_threshold_flags(
            apply_exclusion_flags(flagged, site_name),
            site_name,
        )
        flagged['is_any_outlier'] = flagged['is_excluded'] | flagged['is_outlier']
        result[site_name] = flagged
        if verbose:
            print_exclusion_summary(flagged, site_name)
    return result


def filter_common_samples(data_dict, required_cols, verbose=True):
    """Keep, per site, only the rows where every required column is non-null.

    Args:
        data_dict: mapping of site name -> DataFrame.
        required_cols: column names that must all be present and non-null.
        verbose: print a one-line summary (or skip notice) per site.

    Returns:
        dict mapping site name -> filtered copy. Sites whose frame lacks any
        required column are left out. Sites with no surviving rows (including
        sites whose input frame is empty) are kept with an empty frame.
    """
    required_cols = list(required_cols)  # a tuple would be read as a single key by df[...]
    filtered_data = {}
    for site_name, df in data_dict.items():
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            if verbose:
                print(f"{site_name}: Missing columns {missing_cols}, skipping")
            continue
        mask = df[required_cols].notna().all(axis=1)
        df_common = df[mask].copy()
        filtered_data[site_name] = df_common
        if verbose:
            # An empty input frame would otherwise divide by zero here.
            pct = len(df_common) / len(df) * 100 if len(df) else 0.0
            print(f"{site_name}: {len(df)} total -> {len(df_common)} common ({pct:.1f}%)")
    return filtered_data


def get_data_mode(data_dict, mode='all', required_cols=None, verbose=False):
    """Return the site data either untouched or restricted to common samples.

    Args:
        data_dict: mapping of site name -> DataFrame.
        mode: ``'all'`` returns ``data_dict`` itself; ``'common'`` delegates to
            ``filter_common_samples``.
        required_cols: columns required in ``'common'`` mode; defaults to
            ``['ir_bcc', 'hips_fabs', 'ftir_ec']`` when None.
        verbose: forwarded to ``filter_common_samples``.

    Raises:
        ValueError: if ``mode`` is neither ``'all'`` nor ``'common'``.
    """
    if mode not in ('all', 'common'):
        raise ValueError(f"Unknown mode: {mode}")
    if mode == 'all':
        return data_dict
    cols = ['ir_bcc', 'hips_fabs', 'ftir_ec'] if required_cols is None else required_cols
    return filter_common_samples(data_dict, cols, verbose=verbose)


print("Utility functions defined!")

## 2. Load and Match Data

In [None]:
# Aethalometer records, indexed by site name in the matching cells below
aethalometer_data = load_aethalometer_data()

# Filter measurements, with base_filter_id attached in the same step
filter_data = add_base_filter_id(load_filter_data())

print(f"\nFilter data: {len(filter_data)} rows")

In [None]:
# Pair HIPS filter measurements with aethalometer data, one site at a time.
# Sites without aethalometer data, or for which the matcher returns None,
# are simply absent from hips_aeth_matched.
hips_aeth_matched = {}

for site_name, config in SITES.items():
    if site_name in aethalometer_data:
        matched = match_hips_with_smooth_raw(
            site_name,
            aethalometer_data[site_name],
            filter_data,
            config['code'],
        )
        if matched is None:
            continue
        hips_aeth_matched[site_name] = matched
        print(f"{site_name}: {len(matched)} matched pairs")

# Replace each matched frame with its outlier-flagged copy
print("\n" + "=" * 60, "APPLYING OUTLIER FLAGS", "=" * 60, sep="\n")
hips_aeth_matched = apply_all_outlier_flags(hips_aeth_matched)

In [None]:
# Per-site filter-level table (FTIR EC, HIPS, iron) keyed by FilterId.
# hips_fabs is divided by MAC_VALUE here so it can be compared with the
# EC / BC columns in the later plots.
matched_by_filter = {}

for site_name, config in SITES.items():
    matched = match_by_filter_id(
        filter_data,
        site_code=config['code'],
        params=['EC_ftir', 'HIPS_Fabs', 'ChemSpec_Iron_PM2.5'],
    )
    if matched is None:
        continue
    matched['hips_fabs'] = matched['hips_fabs'] / MAC_VALUE
    matched_by_filter[site_name] = matched
    print(f"{site_name}: {len(matched)} filters, columns: {list(matched.columns)}")

In [None]:
# Build the per-site table that carries every measurement (HIPS, FTIR EC,
# iron, aethalometer). The "common data" cells later keep only its rows where
# all required columns are populated.
all_params_matched = {}

print(
    "\n" + "=" * 60,
    "MATCHING ALL PARAMETERS (HIPS, FTIR EC, Iron, Aethalometer)",
    "=" * 60,
    sep="\n",
)

for site_name, config in SITES.items():
    if site_name not in aethalometer_data:
        print(f"{site_name}: No aethalometer data")
        continue
    matched = match_all_parameters(
        site_name,
        config['code'],
        aethalometer_data[site_name],
        filter_data,
    )
    if matched is None:
        print(f"{site_name}: No matched data")
        continue
    all_params_matched[site_name] = matched
    print(f"{site_name}: {len(matched)} matched days, columns: {list(matched.columns)}")

# Iron/EC Ratio Analysis - SEPARATE PLOTS FOR EACH SITE
# For every site in matched_by_filter: scatter iron against FTIR EC, fit a
# least-squares line, save one PNG per site, and collect summary statistics
# in iron_ec_stats (keyed by site name).
print("=" * 70)
print("IRON/EC RATIO ANALYSIS")
print("=" * 70)

iron_ec_stats = {}

# Debug: show what columns we have
print("\nChecking matched_by_filter data structure:")
for site_name, df in matched_by_filter.items():
    print(f"  {site_name}: columns = {list(df.columns)}")

ec_col = 'ftir_ec'

# Create SEPARATE plots for each site
for site_name, df in matched_by_filter.items():
    color = SITE_COLORS.get(site_name, '#333333')

    # The iron column may appear under either name; use the first one present.
    iron_col = next(
        (col for col in ('iron', 'chemspec_iron_pm2.5') if col in df.columns),
        None,
    )

    if iron_col is None:
        print(f"\n{site_name}: No iron column found, skipping")
        continue
    if ec_col not in df.columns:
        print(f"\n{site_name}: No FTIR EC column found, skipping")
        continue

    # Keep rows where both measurements are present
    valid = df[[iron_col, ec_col]].notna().all(axis=1)
    iron = df.loc[valid, iron_col]
    ec = df.loc[valid, ec_col]

    if len(iron) < 3:
        print(f"\n{site_name}: Insufficient data (n={len(iron)})")
        continue

    # linregress cannot fit a line when every x value is identical
    if ec.nunique() < 2:
        print(f"\n{site_name}: FTIR EC is constant, cannot fit regression (n={len(ec)})")
        continue

    # Per-sample Fe/EC ratio. EC == 0 gives +/-inf (NaN for 0/0), which would
    # turn mean/std into inf/NaN, so those samples are dropped from the ratio
    # statistics only; the scatter and regression still use all valid rows.
    ratio = (iron / ec).replace([np.inf, -np.inf], np.nan).dropna()

    # Create figure for this site
    fig, ax = plt.subplots(figsize=(8, 7))

    # Scatter plot
    ax.scatter(ec, iron, c=color, alpha=0.7, s=60)

    # Regression of iron on EC
    slope, intercept, r_value, _, _ = stats.linregress(ec, iron)
    x_line = np.linspace(0, ec.max() * 1.05, 100)
    ax.plot(x_line, slope * x_line + intercept, 'b-', lw=2,
            label=f'R²={r_value**2:.3f}, slope={slope:.4f}')

    # Axes start at the origin; each axis is scaled to its own data (+5%)
    ax.set_xlim(0, ec.max() * 1.05)
    ax.set_ylim(0, iron.max() * 1.05)

    ax.set_xlabel('FTIR EC (µg/m³)', fontsize=12)
    ax.set_ylabel('Iron (µg/m³)', fontsize=12)
    ax.set_title(f'{site_name} - Iron vs FTIR EC\n(n={len(ec)}, Mean Fe/EC ratio: {ratio.mean():.4f})', fontsize=14)
    ax.legend(loc='upper left')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(f'Iron_EC_{site_name}.png', dpi=150, bbox_inches='tight')
    plt.show()
    plt.close(fig)  # release the figure; one is created per site

    # Store stats
    iron_ec_stats[site_name] = {
        'n': len(ec),
        'r2': r_value**2,
        'slope': slope,
        'mean_ratio': ratio.mean(),
        'std_ratio': ratio.std(),
        'median_ratio': ratio.median()
    }

    print(f"\n{site_name}: n={len(ec)}, R²={r_value**2:.3f}, Mean Fe/EC={ratio.mean():.4f}")

# Print summary table
print("\n" + "=" * 80)
print("IRON/EC RATIO SUMMARY")
print("=" * 80)
print(f"{'Site':<15} {'N':>6} {'R²':>8} {'Slope':>10} {'Mean Ratio':>12} {'Std':>10}")
print("-" * 80)
for site, s in iron_ec_stats.items():
    print(f"{site:<15} {s['n']:>6} {s['r2']:>8.3f} {s['slope']:>10.4f} {s['mean_ratio']:>12.4f} {s['std_ratio']:>10.4f}")

In [None]:
# Iron/EC Ratio Analysis
# Same analysis as the per-site plots, drawn as a single 2x2 grid (at most
# four sites). NOTE: this cell rebuilds iron_ec_stats from scratch.
print("=" * 70)
print("IRON/EC RATIO ANALYSIS")
print("=" * 70)

iron_ec_stats = {}

fig, axes = plt.subplots(2, 2, figsize=(14, 12))
axes = axes.flatten()

for idx, (site_name, df) in enumerate(matched_by_filter.items()):
    if idx >= 4:
        # Only four panels exist; any further sites are not plotted.
        break
    ax = axes[idx]
    color = SITE_COLORS.get(site_name, '#333333')

    # Get iron and EC data
    iron_col = 'iron' if 'iron' in df.columns else 'chemspec_iron_pm2.5'
    ec_col = 'ftir_ec'

    if iron_col not in df.columns or ec_col not in df.columns:
        # transAxes keeps the placeholder centred whatever the data limits are
        ax.text(0.5, 0.5, f'{site_name}\nMissing data', ha='center', va='center',
                transform=ax.transAxes)
        continue

    # Keep rows where both measurements are present
    valid = df[[iron_col, ec_col]].notna().all(axis=1)
    iron = df.loc[valid, iron_col]
    ec = df.loc[valid, ec_col]

    if len(iron) < 3:
        ax.text(0.5, 0.5, f'{site_name}\nInsufficient data', ha='center', va='center',
                transform=ax.transAxes)
        continue

    # linregress cannot fit a line when every x value is identical
    if ec.nunique() < 2:
        ax.text(0.5, 0.5, f'{site_name}\nConstant FTIR EC', ha='center', va='center',
                transform=ax.transAxes)
        continue

    # Per-sample Fe/EC ratio; samples with EC == 0 (inf/NaN ratio) are left
    # out of the ratio statistics so mean/std stay finite.
    ratio = (iron / ec).replace([np.inf, -np.inf], np.nan).dropna()

    # Scatter plot
    ax.scatter(ec, iron, c=color, alpha=0.7, s=60)

    # Regression of iron on EC
    slope, intercept, r_value, _, _ = stats.linregress(ec, iron)
    x_line = np.linspace(0, ec.max() * 1.05, 100)
    ax.plot(x_line, slope * x_line + intercept, 'b-', lw=2,
            label=f'R²={r_value**2:.3f}, slope={slope:.4f}')

    # Axes start at the origin; each axis is scaled to its own data (+5%)
    ax.set_xlim(0, ec.max() * 1.05)
    ax.set_ylim(0, iron.max() * 1.05)

    ax.set_xlabel('FTIR EC (µg/m³)')
    ax.set_ylabel('Iron (µg/m³)')
    ax.set_title(f'{site_name} (n={len(ec)})\nMean Fe/EC ratio: {ratio.mean():.4f}')
    ax.legend(loc='upper left')
    ax.grid(True, alpha=0.3)

    # Store stats
    iron_ec_stats[site_name] = {
        'n': len(ec),
        'r2': r_value**2,
        'slope': slope,
        'mean_ratio': ratio.mean(),
        'std_ratio': ratio.std(),
        'median_ratio': ratio.median()
    }

# Hide panels that have no site assigned (fewer than four sites)
for unused_ax in axes[len(matched_by_filter):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.savefig('Iron_EC_Ratio_Analysis.png', dpi=150, bbox_inches='tight')
plt.show()

# Print summary table
print("\nIron/EC Ratio Summary:")
print("-" * 80)
print(f"{'Site':<15} {'N':>6} {'R²':>8} {'Slope':>10} {'Mean Ratio':>12} {'Std':>10}")
print("-" * 80)
for site, s in iron_ec_stats.items():
    print(f"{site:<15} {s['n']:>6} {s['r2']:>8.3f} {s['slope']:>10.4f} {s['mean_ratio']:>12.4f} {s['std_ratio']:>10.4f}")

---
# TASK 2: Delhi Plot Fixes

- Fix axis scaling (currently extends to 60 when max is ~20)
- Remove green wavelength outliers (4-7 points) and replot

In [None]:
# Delhi-specific analysis with fixed axis scaling
# This cell only REPORTS: it prints value ranges and counts candidate green
# outliers. It does not remove anything from delhi_data.
if 'Delhi' in hips_aeth_matched:
    delhi_data = hips_aeth_matched['Delhi']
    
    print("Delhi Data Summary:")
    print(f"  Total samples: {len(delhi_data)}")
    print(f"  IR BCc range: {delhi_data['ir_bcc'].min():.2f} - {delhi_data['ir_bcc'].max():.2f}")
    print(f"  HIPS range: {delhi_data['hips_fabs'].min():.2f} - {delhi_data['hips_fabs'].max():.2f}")
    
    # Check for green wavelength outliers
    # One-sided test: only values ABOVE mean + 3 standard deviations are counted.
    if 'green_bcc' in delhi_data.columns:
        green_mean = delhi_data['green_bcc'].mean()
        green_std = delhi_data['green_bcc'].std()
        green_outliers = delhi_data['green_bcc'] > (green_mean + 3 * green_std)
        print(f"\n  Green wavelength outliers (>3σ): {green_outliers.sum()}")
        if green_outliers.any():
            print(f"  Outlier values: {delhi_data.loc[green_outliers, 'green_bcc'].values}")
else:
    print("Delhi data not available")

In [None]:
# Delhi crossplot with FIXED axis scaling
# Left panel reproduces the old hard-coded 0-60 axes for comparison; the right
# panel scales both axes to the data and adds the regression line.
if 'Delhi' in hips_aeth_matched:
    delhi_data = hips_aeth_matched['Delhi']
    clean_delhi = get_clean_data(delhi_data)

    # Keep rows where both measurements are present. A single dropna on the
    # two columns is row-based, so it also works with duplicate index labels.
    paired = clean_delhi[['ir_bcc', 'hips_fabs']].dropna()
    x, y = paired['ir_bcc'], paired['hips_fabs']

    if paired.empty:
        # max() of an empty series is NaN, and NaN axis limits are rejected
        print("Delhi: no samples with both IR BCc and HIPS after cleaning, skipping plot")
    else:
        fig, axes = plt.subplots(1, 2, figsize=(14, 6))

        # Calculate proper axis limit (not 60!): largest value on either axis +5%
        max_val = max(x.max(), y.max()) * 1.05
        print(f"Proper max value: {max_val:.2f} (not 60)")

        # Left plot: Before (showing the problem)
        ax = axes[0]
        ax.scatter(x, y, c='#ff7f0e', alpha=0.7, s=60)
        ax.set_xlim(0, 60)  # OLD: bad scaling
        ax.set_ylim(0, 60)
        ax.plot([0, 60], [0, 60], 'k--', alpha=0.5)
        ax.set_xlabel('Aethalometer IR BCc (µg/m³)')
        ax.set_ylabel('HIPS Fabs / MAC (µg/m³)')
        ax.set_title('Delhi - BEFORE (bad axis scaling to 60)')
        ax.grid(True, alpha=0.3)

        # Right plot: After (fixed)
        ax = axes[1]
        ax.scatter(x, y, c='#ff7f0e', alpha=0.7, s=60)

        # Regression line only when there are at least three points
        if len(x) > 2:
            slope, intercept, r_value, _, _ = stats.linregress(x, y)
            x_line = np.linspace(0, max_val, 100)
            ax.plot(x_line, slope * x_line + intercept, 'b-', lw=2,
                    label=f'R²={r_value**2:.3f}, slope={slope:.2f}')

        ax.set_xlim(0, max_val)  # FIXED: proper scaling
        ax.set_ylim(0, max_val)
        ax.plot([0, max_val], [0, max_val], 'k--', alpha=0.5, label='1:1')
        ax.set_xlabel('Aethalometer IR BCc (µg/m³)')
        ax.set_ylabel('HIPS Fabs / MAC (µg/m³)')
        ax.set_title(f'Delhi - AFTER (proper scaling to {max_val:.1f})')
        ax.legend(loc='upper left')
        ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig('Delhi_Axis_Fix.png', dpi=150, bbox_inches='tight')
        plt.show()
else:
    print("Delhi data not available")

# Build data_common: for each site in all_params_matched, the rows where the
# aethalometer, HIPS and FTIR EC columns are all populated. Sites lacking a
# required column, or with no such rows, are left out.
print("=" * 70, "MATCHED-SAMPLE ANALYSIS (Common Data Mode)", "=" * 70, sep="\n")
print("\nUsing all_params_matched data (from match_all_parameters):")

# Inventory: which of the columns of interest each site actually has
for site_name, df in all_params_matched.items():
    has_cols = [col for col in ('ir_bcc', 'hips_fabs', 'ftir_ec', 'iron') if col in df.columns]
    print(f"  {site_name}: {len(df)} samples, columns: {has_cols}")

required_cols = ['ir_bcc', 'hips_fabs', 'ftir_ec']
data_common = {}

print("\nFiltering to samples with ALL three measurements (Aeth, HIPS, FTIR EC):")
for site_name, df in all_params_matched.items():
    missing = [col for col in required_cols if col not in df.columns]
    if missing:
        print(f"  {site_name}: Missing {missing}, skipping")
        continue

    # Row-wise: every required column must be non-null
    df_common = df.loc[df[required_cols].notna().all(axis=1)].copy()

    if len(df_common) == 0:
        print(f"  {site_name}: No common samples")
        continue

    data_common[site_name] = df_common
    print(f"  {site_name}: {len(df)} total -> {len(df_common)} common samples")

print(f"\nSites with common data: {list(data_common.keys())}")

In [None]:
# Create SEPARATE matched-sample crossplots for each site
# One HIPS-vs-aethalometer figure per site in data_common, saved as
# Common_HIPS_vs_Aeth_{site}.png.
if len(data_common) == 0:
    print("No sites have common data!")
else:
    for site_name, df in data_common.items():
        color = SITE_COLORS.get(site_name, '#333333')
        
        # Get data
        # Drop NaNs per column, then keep only index labels present in both.
        x = df['ir_bcc'].dropna()
        y = df['hips_fabs'].dropna()
        common_idx = x.index.intersection(y.index)
        x, y = df.loc[common_idx, 'ir_bcc'], df.loc[common_idx, 'hips_fabs']
        
        # Fewer than three points: skip rather than fit a regression
        if len(x) < 3:
            print(f"{site_name}: Insufficient data (n={len(x)})")
            continue
        
        # Create figure for this site
        fig, ax = plt.subplots(figsize=(8, 7))
        
        ax.scatter(x, y, c=color, alpha=0.7, s=60)
        
        # Regression
        slope, intercept, r_value, _, _ = stats.linregress(x, y)
        # Shared limit for both axes: largest value on either axis plus 5%
        max_val = max(x.max(), y.max()) * 1.05
        x_line = np.linspace(0, max_val, 100)
        ax.plot(x_line, slope * x_line + intercept, 'b-', lw=2,
               label=f'R²={r_value**2:.3f}, slope={slope:.2f}')
        ax.plot([0, max_val], [0, max_val], 'k--', alpha=0.5, label='1:1')
        
        ax.set_xlim(0, max_val)
        ax.set_ylim(0, max_val)
        ax.set_xlabel('Aethalometer IR BCc (µg/m³)', fontsize=12)
        ax.set_ylabel('HIPS Fabs / MAC (µg/m³)', fontsize=12)
        ax.set_title(f'{site_name} - HIPS vs Aethalometer\n(COMMON DATA: n={len(x)}, all 3 measurements)', fontsize=14)
        ax.legend(loc='upper left')
        ax.grid(True, alpha=0.3)
        
        plt.tight_layout()
        plt.savefig(f'Common_HIPS_vs_Aeth_{site_name}.png', dpi=150, bbox_inches='tight')
        plt.show()
        
        print(f"{site_name}: n={len(x)}, R²={r_value**2:.3f}, slope={slope:.2f}")

In [None]:
---
# TASK 4: Cross-Plots (All Sites, All Types) - SEPARATE GRAPHS

Create individual plots for each site showing:
1. HIPS vs Aethalometer
2. FTIR EC vs Aethalometer  
3. HIPS vs FTIR EC

# Create SEPARATE cross-plots for each site and plot type
# Produces up to three figures per site in data_common (one per entry of
# plot_configs) and records the fit statistics of every figure in all_stats.
all_stats = []

# Each entry names the x/y columns plus the axis labels and title to use.
plot_configs = [
    {'x': 'ir_bcc', 'y': 'hips_fabs', 'xlabel': 'Aethalometer IR BCc (µg/m³)', 
     'ylabel': 'HIPS Fabs / MAC (µg/m³)', 'title': 'HIPS vs Aethalometer'},
    {'x': 'ir_bcc', 'y': 'ftir_ec', 'xlabel': 'Aethalometer IR BCc (µg/m³)', 
     'ylabel': 'FTIR EC (µg/m³)', 'title': 'FTIR EC vs Aethalometer'},
    {'x': 'ftir_ec', 'y': 'hips_fabs', 'xlabel': 'FTIR EC (µg/m³)', 
     'ylabel': 'HIPS Fabs / MAC (µg/m³)', 'title': 'HIPS vs FTIR EC'},
]

if len(data_common) == 0:
    print("No common data available!")
else:
    for site_name, df in data_common.items():
        color = SITE_COLORS.get(site_name, '#333333')
        print(f"\n{'='*60}")
        print(f"{site_name}")
        print('='*60)
        
        for config in plot_configs:
            # Check columns exist
            if config['x'] not in df.columns or config['y'] not in df.columns:
                print(f"  {config['title']}: Missing columns, skipping")
                continue
            
            # Get valid data
            # Rows where both the x and the y column are non-null
            valid = df[[config['x'], config['y']]].notna().all(axis=1)
            x = df.loc[valid, config['x']]
            y = df.loc[valid, config['y']]
            
            # Fewer than three points: skip rather than fit a regression
            if len(x) < 3:
                print(f"  {config['title']}: Insufficient data (n={len(x)})")
                continue
            
            # Create figure
            fig, ax = plt.subplots(figsize=(8, 7))
            
            ax.scatter(x, y, c=color, alpha=0.7, s=60)
            
            # Regression
            slope, intercept, r_value, _, _ = stats.linregress(x, y)
            # Shared limit for both axes: largest value on either axis plus 5%
            max_val = max(x.max(), y.max()) * 1.05
            x_line = np.linspace(0, max_val, 100)
            ax.plot(x_line, slope * x_line + intercept, 'b-', lw=2,
                   label=f'R²={r_value**2:.3f}, slope={slope:.2f}')
            ax.plot([0, max_val], [0, max_val], 'k--', alpha=0.5, label='1:1')
            
            ax.set_xlim(0, max_val)
            ax.set_ylim(0, max_val)
            ax.set_xlabel(config['xlabel'], fontsize=12)
            ax.set_ylabel(config['ylabel'], fontsize=12)
            ax.set_title(f"{site_name} - {config['title']}\n(n={len(x)}, R²={r_value**2:.3f})", fontsize=14)
            ax.legend(loc='upper left')
            ax.grid(True, alpha=0.3)
            
            # Save with descriptive filename
            # Spaces and slashes in the title become underscores
            safe_title = config['title'].replace(' ', '_').replace('/', '_')
            plt.tight_layout()
            plt.savefig(f"{safe_title}_{site_name}.png", dpi=150, bbox_inches='tight')
            plt.show()
            
            # Store stats
            all_stats.append({
                'Site': site_name,
                'Plot': config['title'],
                'N': len(x),
                'R²': r_value**2,
                'Slope': slope,
                'Intercept': intercept
            })
            
            print(f"  {config['title']}: n={len(x)}, R²={r_value**2:.3f}, slope={slope:.2f}")

# Create stats DataFrame
# One row per figure produced above; empty when nothing was plotted.
stats_df = pd.DataFrame(all_stats)
print(f"\nTotal plots created: {len(all_stats)}")

In [None]:
# Cross-plot collage: 3 plot types x 4 sites
def create_crossplot_collage(data_dict, save_name='Crossplot_Collage.png'):
    """
    Create a collage with 3 rows (plot types) x N columns (sites).

    Row 1: HIPS vs Aethalometer
    Row 2: FTIR EC vs Aethalometer
    Row 3: HIPS vs FTIR EC

    Rows flagged in an ``is_any_outlier`` column (when present) are excluded.
    A panel whose columns are missing, or that has fewer than three valid
    points, shows a text placeholder and contributes no statistics row.

    Args:
        data_dict: mapping of site name -> DataFrame.
        save_name: path the figure is saved to.

    Returns:
        DataFrame with one row per plotted panel and the columns
        Site, Plot, N, R², Slope, Intercept. It is empty (same columns) when
        ``data_dict`` has no sites; no figure is created or saved in that case.
    """
    stat_columns = ['Site', 'Plot', 'N', 'R²', 'Slope', 'Intercept']
    sites = list(data_dict.keys())
    n_sites = len(sites)

    # plt.subplots cannot build a grid with zero columns
    if n_sites == 0:
        print("No sites to plot, skipping collage")
        return pd.DataFrame(columns=stat_columns)

    plot_configs = [
        {'x': 'ir_bcc', 'y': 'hips_fabs', 'xlabel': 'Aeth IR BCc', 'ylabel': 'HIPS/MAC', 'title': 'HIPS vs Aeth'},
        {'x': 'ir_bcc', 'y': 'ftir_ec', 'xlabel': 'Aeth IR BCc', 'ylabel': 'FTIR EC', 'title': 'FTIR vs Aeth'},
        {'x': 'ftir_ec', 'y': 'hips_fabs', 'xlabel': 'FTIR EC', 'ylabel': 'HIPS/MAC', 'title': 'HIPS vs FTIR'},
    ]

    # squeeze=False always yields a 2-D axes array, also for a single site
    fig, axes = plt.subplots(len(plot_configs), n_sites,
                             figsize=(4 * n_sites, 12), squeeze=False)

    all_stats = []

    for row, config in enumerate(plot_configs):
        for col, site_name in enumerate(sites):
            ax = axes[row, col]
            df = data_dict[site_name]
            color = SITE_COLORS.get(site_name, '#333333')

            # Drop flagged rows when the frame carries outlier flags
            if 'is_any_outlier' in df.columns:
                df_clean = df[~df['is_any_outlier']]
            else:
                df_clean = df

            # Check columns exist
            if config['x'] not in df_clean.columns or config['y'] not in df_clean.columns:
                ax.text(0.5, 0.5, 'Missing data', ha='center', va='center', transform=ax.transAxes)
                continue

            # Rows where both the x and the y column are non-null
            valid = df_clean[[config['x'], config['y']]].notna().all(axis=1)
            x = df_clean.loc[valid, config['x']]
            y = df_clean.loc[valid, config['y']]

            if len(x) < 3:
                ax.text(0.5, 0.5, f'n={len(x)}', ha='center', va='center', transform=ax.transAxes)
                continue

            # Plot
            ax.scatter(x, y, c=color, alpha=0.7, s=40)

            # Regression line and 1:1 reference, both drawn out to max_val
            slope, intercept, r_value, _, _ = stats.linregress(x, y)
            max_val = max(x.max(), y.max()) * 1.05
            x_line = np.linspace(0, max_val, 100)
            ax.plot(x_line, slope * x_line + intercept, 'b-', lw=1.5)
            ax.plot([0, max_val], [0, max_val], 'k--', alpha=0.4, lw=1)

            ax.set_xlim(0, max_val)
            ax.set_ylim(0, max_val)
            ax.set_xlabel(config['xlabel'], fontsize=9)
            ax.set_ylabel(config['ylabel'], fontsize=9)

            # Title with stats; only the top row also names the site
            if row == 0:
                ax.set_title(f'{site_name}\nR²={r_value**2:.2f}, m={slope:.2f}', fontsize=10)
            else:
                ax.set_title(f'R²={r_value**2:.2f}, m={slope:.2f}', fontsize=9)

            ax.grid(True, alpha=0.3)

            # Store stats
            all_stats.append({
                'Site': site_name,
                'Plot': config['title'],
                'N': len(x),
                'R²': r_value**2,
                'Slope': slope,
                'Intercept': intercept
            })

    # Row labels on the left: the first column's y-label also names the plot type
    for row, config in enumerate(plot_configs):
        axes[row, 0].set_ylabel(f"{config['title']}\n{config['ylabel']}", fontsize=10)

    plt.tight_layout()
    plt.savefig(save_name, dpi=150, bbox_inches='tight')
    plt.show()
    plt.close(fig)

    return pd.DataFrame(all_stats, columns=stat_columns)

# Draw the collage for the common-data samples; stats_df now holds the
# collage's fit statistics (one row per plotted panel).
stats_df = create_crossplot_collage(data_common, save_name='Crossplot_Collage_Common.png')

# Summary table, printed and written to CSV when there is anything to show
print("=" * 80, "SUMMARY TABLE: Slopes and R² Values (Common Data Only)", "=" * 80, sep="\n")

if len(stats_df) == 0:
    print("No statistics to display (no common data found)")
else:
    print(stats_df.to_string(index=False))

    # Save to CSV
    stats_df.to_csv('Crossplot_Summary_Stats.csv', index=False)
    print("\nSaved to: Crossplot_Summary_Stats.csv")

In [None]:
# Reshape the summary: one row per site, one column per (statistic, plot type)
if len(stats_df) == 0:
    print("No data for pivot table")
else:
    pivot = stats_df.pivot_table(
        index='Site',
        columns='Plot',
        values=['R²', 'Slope', 'N'],
        aggfunc='first',
    )
    print("\nPivot Table View:")
    print(pivot.round(3))

In [None]:
# Pivot table format for cleaner view
# Guarded so that an empty stats_df (no common data) prints a message instead
# of raising inside pivot_table.
if len(stats_df) > 0:
    pivot = stats_df.pivot_table(
        index='Site', 
        columns='Plot', 
        values=['R²', 'Slope', 'N'],
        aggfunc='first'
    )
    print("\nPivot Table View:")
    print(pivot.round(3))
else:
    print("No data for pivot table")

---
# TASK 6: Raw Attenuation Correlation Matrix

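Analyze correlations in raw attenuation data (before instrument processing) to see if high wavelength correlations are inherent or processing-induced.

**Note:** the cells below currently correlate the `*_bcc` columns (`uv_bcc`, `blue_bcc`, `green_bcc`, `red_bcc`, `ir_bcc`) rather than raw attenuation columns, so they do not yet answer the question above. TODO: repeat the matrices with the raw attenuation columns.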

In [None]:
# Create SEPARATE correlation matrices for each site
# One heatmap per site in aethalometer_data, saved as
# Wavelength_Correlation_{site}.png.
wavelength_cols = ['uv_bcc', 'blue_bcc', 'green_bcc', 'red_bcc', 'ir_bcc']

for site_name, df in aethalometer_data.items():
    # Check which wavelength columns exist
    available_cols = [col for col in wavelength_cols if col in df.columns]
    
    # A correlation matrix needs at least two columns
    if len(available_cols) < 2:
        print(f"\n{site_name}: Insufficient wavelength data ({len(available_cols)} columns)")
        continue
    
    # Calculate correlation matrix
    # DataFrame.corr() with its defaults (Pearson)
    corr_matrix = df[available_cols].corr()
    
    # Create figure for this site
    fig, ax = plt.subplots(figsize=(8, 7))
    
    # Plot heatmap
    # Colour scale spans 0.5-1.0; correlations below 0.5 all get the lowest colour
    im = ax.imshow(corr_matrix, cmap='RdYlBu_r', vmin=0.5, vmax=1.0)
    
    # Labels
    # Tick labels are the column names without '_bcc', upper-cased
    labels = [col.replace('_bcc', '').upper() for col in available_cols]
    ax.set_xticks(range(len(labels)))
    ax.set_yticks(range(len(labels)))
    ax.set_xticklabels(labels, rotation=45, ha='right', fontsize=11)
    ax.set_yticklabels(labels, fontsize=11)
    
    # Add correlation values
    # White text for values below 0.7, black otherwise
    for i in range(len(labels)):
        for j in range(len(labels)):
            val = corr_matrix.iloc[i, j]
            color = 'white' if val < 0.7 else 'black'
            ax.text(j, i, f'{val:.2f}', ha='center', va='center', color=color, fontsize=12)
    
    ax.set_title(f'{site_name}\nBC Wavelength Correlation Matrix', fontsize=14)
    plt.colorbar(im, ax=ax, label='Correlation')
    
    plt.tight_layout()
    plt.savefig(f'Wavelength_Correlation_{site_name}.png', dpi=150, bbox_inches='tight')
    plt.show()
    
    print(f"\n{site_name}: Correlation matrix created ({len(available_cols)} wavelengths)")

In [None]:
# Create correlation matrices for BC wavelengths
# Common BC columns: UV, Blue, Green, Red, IR (or similar)
# Same heatmaps as the per-site version, drawn in one 2x2 grid and saved as
# Wavelength_Correlation_Matrix.png.

wavelength_cols = ['uv_bcc', 'blue_bcc', 'green_bcc', 'red_bcc', 'ir_bcc']

fig, axes = plt.subplots(2, 2, figsize=(14, 12))
axes = axes.flatten()

for idx, (site_name, df) in enumerate(aethalometer_data.items()):
    # Only four panels exist; any further sites are not plotted
    if idx >= 4:
        break
    ax = axes[idx]
    
    # Check which wavelength columns exist
    available_cols = [col for col in wavelength_cols if col in df.columns]
    
    # A correlation matrix needs at least two columns; show a placeholder otherwise
    if len(available_cols) < 2:
        ax.text(0.5, 0.5, f'{site_name}\nInsufficient wavelength data', 
               ha='center', va='center', transform=ax.transAxes)
        continue
    
    # Calculate correlation matrix
    # DataFrame.corr() with its defaults (Pearson)
    corr_matrix = df[available_cols].corr()
    
    # Plot heatmap
    # Colour scale spans 0.5-1.0; correlations below 0.5 all get the lowest colour
    im = ax.imshow(corr_matrix, cmap='RdYlBu_r', vmin=0.5, vmax=1.0)
    
    # Labels
    # Tick labels are the column names without '_bcc', upper-cased
    labels = [col.replace('_bcc', '').upper() for col in available_cols]
    ax.set_xticks(range(len(labels)))
    ax.set_yticks(range(len(labels)))
    ax.set_xticklabels(labels, rotation=45, ha='right')
    ax.set_yticklabels(labels)
    
    # Add correlation values
    # White text for values below 0.7, black otherwise
    for i in range(len(labels)):
        for j in range(len(labels)):
            val = corr_matrix.iloc[i, j]
            color = 'white' if val < 0.7 else 'black'
            ax.text(j, i, f'{val:.2f}', ha='center', va='center', color=color, fontsize=10)
    
    ax.set_title(f'{site_name}\nWavelength Correlation Matrix')
    plt.colorbar(im, ax=ax, label='Correlation')

plt.tight_layout()
plt.savefig('Wavelength_Correlation_Matrix.png', dpi=150, bbox_inches='tight')
plt.show()

---
# OPTIONAL: Hourly-binned Wavelength Ratio Analysis

For Angstrom absorption exponent analysis.

In [None]:
---
# Summary of Outputs

Files generated (separate plots for each site):
- `Iron_EC_{site}.png` - Iron/EC scatter plots
- `Delhi_Axis_Fix.png` - Before/after axis scaling comparison  
- `Common_HIPS_vs_Aeth_{site}.png` - HIPS vs Aeth (common data only)
- `HIPS_vs_Aethalometer_{site}.png` - All cross-plots
- `FTIR_EC_vs_Aethalometer_{site}.png`
- `HIPS_vs_FTIR_EC_{site}.png`
- `Crossplot_Summary_Stats.csv` - Slopes and R² values
- `Wavelength_Correlation_{site}.png` - BC wavelength correlations

---
# Summary of Outputs

Files generated:
1. `Iron_EC_Ratio_Analysis.png` - Iron/EC scatter plots
2. `Delhi_Axis_Fix.png` - Before/after axis scaling comparison
3. `Common_HIPS_vs_Aeth_{site}.png` - Common data only plots (one file per site)
4. `Crossplot_Collage_Common.png` - All 3 plot types for all sites
5. `Crossplot_Summary_Stats.csv` - Slopes and R² values
6. `Wavelength_Correlation_Matrix.png` - BC wavelength correlations

In [None]:
# Final summary: static checklist of the meeting tasks (hard-coded text,
# not derived from the results of the cells above).
print("\n".join([
    "=" * 70,
    "TASK COMPLETION SUMMARY",
    "=" * 70,
    "",
    "[✓] Iron/EC ratio analysis - COMPLETED",
    "[✓] Delhi axis scaling fix - COMPLETED",
    "[✓] Matched-sample plots (common data) - COMPLETED",
    "[✓] Cross-plot collage - COMPLETED",
    "[✓] Summary table with slopes/R² - COMPLETED",
    "[✓] Wavelength correlation matrix - COMPLETED",
    "[ ] Hourly-binned wavelength ratio - OPTIONAL (not implemented)",
    "",
    "Administrative:",
    "[ ] Send when-is-good poll for first two weeks of January",
]))