# Multi-Site Aethalometer Analysis (Modular Version)

This notebook uses the modularized scripts for multi-site aethalometer and filter data analysis.

**Sites:**
- Beijing, China (CHTS)
- Delhi, India (INDH)
- JPL/Pasadena, USA (USPA)
- Addis Ababa, Ethiopia (ETAD)

---

## 1. Setup and Imports

In [None]:
# Add scripts folder to path
import sys
sys.path.insert(0, './scripts')

# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import from our modular scripts
from config import (
    SITES, PROCESSED_SITES_DIR, FILTER_DATA_PATH,
    MAC_VALUE, FLOW_FIX_PERIODS, MIN_EC_THRESHOLD,
    SMOOTH_RAW_THRESHOLDS, CROSS_COMPARISONS
)

from outliers import (
    EXCLUDED_SAMPLES, MANUAL_OUTLIERS,
    apply_exclusion_flags, apply_threshold_flags,
    get_clean_data, print_exclusion_summary,
    identify_outlier_dates
)

from data_matching import (
    load_aethalometer_data, load_filter_data,
    match_aeth_filter_data, match_all_parameters,
    match_with_smooth_raw_info, add_flow_period_column,
    get_site_code, get_site_color, print_data_summary
)

from plotting import (
    calculate_regression_stats,
    plot_crossplot, plot_before_after_comparison,
    create_tiled_threshold_plots, plot_smooth_raw_distribution,
    plot_bc_timeseries, plot_multiwavelength_bc,
    print_comparison_table
)

# Configure matplotlib
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10

print("Modules loaded successfully!")
print(f"\nSites configured: {list(SITES.keys())}")
print(f"MAC value: {MAC_VALUE} m^2/g")

## 2. Load Data

In [None]:
# Load all aethalometer datasets.
# The result is used throughout this notebook as a mapping keyed by site name
# (iterated with .items() and checked against the keys of SITES); each value is
# a table exposing .columns and column indexing.
aethalometer_data = load_aethalometer_data()

In [None]:
# Load filter dataset.
# A single object shared by every site: it is passed unchanged to each match_*
# helper below, together with the site's code.
filter_data = load_filter_data()

In [None]:
# Print data summary for the two inputs loaded above
# (output is produced by print_data_summary from data_matching).
print_data_summary(aethalometer_data, filter_data)

## 3. BC Time Series

In [None]:
# Overlay the IR-channel BC series of every loaded site on one shared axis
fig, ax = plt.subplots(figsize=(14, 7))

for site_name in aethalometer_data:
    config = SITES[site_name]
    df = aethalometer_data[site_name]
    plot_bc_timeseries(ax, site_name, df, config, wavelength='IR')

# Cosmetics applied once, after every site has been drawn
ax.set_title(
    'IR Black Carbon Time Series - All Sites',
    fontsize=14,
    fontweight='bold',
)
ax.grid(True, alpha=0.3)
ax.legend(loc='best')
plt.tight_layout()
plt.show()

In [None]:
# Same IR-channel series as above, but one separate figure per site
for site_name in aethalometer_data:
    config = SITES[site_name]
    df = aethalometer_data[site_name]

    fig, ax = plt.subplots(figsize=(12, 5))
    plot_bc_timeseries(ax, site_name, df, config, wavelength='IR')

    ax.set_title(
        f'IR Black Carbon - {site_name}',
        fontsize=14,
        fontweight='bold',
    )
    ax.grid(True, alpha=0.3)
    ax.legend(loc='best')
    plt.tight_layout()
    plt.show()

## 4. Aethalometer vs FTIR EC Cross-Plots

Compare aethalometer BC with filter-based FTIR EC measurements.

In [None]:
# Pair each site's aethalometer record with the filter measurements.
# Sites that yield no result or fewer than 3 matched pairs are reported and
# left out of all_matched_data.
all_matched_data = {}

for site_name, config in SITES.items():
    # Configured sites without loaded aethalometer data are skipped silently
    if site_name not in aethalometer_data:
        continue

    df_aeth = aethalometer_data[site_name]
    matched = match_aeth_filter_data(
        site_name, df_aeth, filter_data, config['code']
    )

    if matched is None or len(matched) < 3:
        print(f"{site_name}: Insufficient matched data")
        continue

    all_matched_data[site_name] = matched
    print(f"{site_name}: {len(matched)} matched pairs")

In [None]:
# All sites on one BC-vs-EC scatter, coloured per site
fig, ax = plt.subplots(figsize=(10, 10))

# Marker styling shared by every site's point cloud
marker_style = dict(alpha=0.6, s=80, edgecolors='black', linewidth=1)

for site_name, matched_df in all_matched_data.items():
    config = SITES[site_name]
    ax.scatter(
        matched_df['aeth_bc'],
        matched_df['filter_ec'],
        color=config['color'],
        label=f"{site_name} (n={len(matched_df)})",
        **marker_style,
    )

# Anchor both axes at the origin
ax.set_xlim(left=0)
ax.set_ylim(bottom=0)

# 1:1 reference line spanning the larger of the two upper axis limits
_, x_upper = ax.get_xlim()
_, y_upper = ax.get_ylim()
max_val = max(x_upper, y_upper)
ax.plot(
    [0, max_val], [0, max_val],
    'k--', alpha=0.5, linewidth=1.5, label='1:1 line',
)

ax.set_xlabel('Aethalometer IR BC (ng/m³)', fontsize=12)
ax.set_ylabel('FTIR EC (ng/m³)', fontsize=12)
ax.set_title('Aethalometer BC vs FTIR EC - All Sites', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
ax.legend(loc='best')
plt.tight_layout()
plt.show()

In [None]:
# One cross-plot per site; the statistics returned by plot_crossplot are kept
# in site_stats whenever the returned value is truthy.
site_stats = {}

for site_name, matched_df in all_matched_data.items():
    config = SITES[site_name]

    fig, ax = plt.subplots(figsize=(9, 9))

    aeth_bc_values = matched_df['aeth_bc'].values
    filter_ec_values = matched_df['filter_ec'].values
    stats = plot_crossplot(
        ax,
        aeth_bc_values,
        filter_ec_values,
        'Aethalometer IR BC (ng/m³)',
        'FTIR EC (ng/m³)',
        color=config['color'],
        equal_axes=True,
    )

    ax.set_title(f'{site_name}: Aethalometer BC vs FTIR EC', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

    # Nothing to record when plot_crossplot returned an empty/None result
    if not stats:
        continue

    site_stats[site_name] = stats
    print(f"{site_name}: R² = {stats['r_squared']:.3f}, Slope = {stats['slope']:.3f}\n")

## 5. Outlier Detection and Removal

Review the configured outlier exclusions, flag threshold-based outliers, and compare the regression statistics before and after their removal.

In [None]:
# Show current outlier configuration: one line per excluded sample, grouped by
# site. EXCLUDED_SAMPLES is read as {site: [ {'date': ..., 'reason': ...}, ... ]}.
MAX_REASON_CHARS = 50  # longest reason text shown before it is cut off

print("Current EXCLUDED_SAMPLES registry:")
print("="*60)
for site, exclusions in EXCLUDED_SAMPLES.items():
    print(f"\n{site}: {len(exclusions)} exclusion(s)")
    for excl in exclusions:
        reason = excl['reason']
        # Add the ellipsis only when text was actually dropped, so a short
        # reason is not displayed as if it had been truncated.
        if len(reason) > MAX_REASON_CHARS:
            reason = reason[:MAX_REASON_CHARS] + "..."
        print(f"  - {excl['date']}: {reason}")

In [None]:
# Helper cell for maintaining the outlier registry: for every matched site, run
# identify_outlier_dates once per criterion listed under that site's
# 'remove_criteria' entry in MANUAL_OUTLIERS. Use the reported dates when
# adding entries to EXCLUDED_SAMPLES in outliers.py.

print("Finding outlier dates based on MANUAL_OUTLIERS thresholds...")
print("="*60)

for site_name, matched_df in all_matched_data.items():
    site_rules = MANUAL_OUTLIERS.get(site_name, {})
    criteria = site_rules.get('remove_criteria', [])

    if criteria:
        print(f"\n{site_name}:")
        for crit in criteria:
            identify_outlier_dates(site_name, matched_df, crit)
    else:
        # Site missing from MANUAL_OUTLIERS, or present with no criteria
        print(f"\n{site_name}: No thresholds defined")

In [None]:
# Flag threshold-based outliers per site and, where any row is flagged, show
# the before/after regression comparison.

for site_name, matched_df in all_matched_data.items():
    config = SITES[site_name]

    # Flag on a copy so the frames stored in all_matched_data stay untouched
    matched_with_flags = apply_threshold_flags(matched_df.copy(), site_name)

    print_exclusion_summary(matched_with_flags, site_name)

    # No flagged rows: nothing to compare for this site
    if not matched_with_flags['is_outlier'].any():
        print(f"  No outliers flagged for {site_name}")
        print()
        continue

    fig, stats_all, stats_clean = plot_before_after_comparison(
        matched_with_flags,
        site_name,
        config['color'],
        outlier_col='is_outlier',
    )
    plt.show()

    # Report the change only when both fits produced statistics
    if stats_all and stats_clean:
        print(f"\n  Impact of outlier removal:")
        print(f"    R²: {stats_all['r_squared']:.3f} → {stats_clean['r_squared']:.3f} "
              f"(Δ = {stats_clean['r_squared'] - stats_all['r_squared']:+.3f})")
        print(f"    Slope: {stats_all['slope']:.3f} → {stats_clean['slope']:.3f}")

    print()

## 6. Smooth vs Raw BC Analysis

Analyze the impact of smooth/raw BC differences on data quality.

In [None]:
# Match aethalometer and filter data again, this time through the helper that
# also provides the 'smooth_raw_abs_pct' column used in this section.
# Sites with no result or fewer than 3 matched rows are reported and skipped.
smooth_raw_data = {}

for site_name, config in SITES.items():
    if site_name not in aethalometer_data:
        continue

    df_aeth = aethalometer_data[site_name]
    matched = match_with_smooth_raw_info(
        site_name, df_aeth, filter_data, config['code']
    )

    if matched is None or len(matched) < 3:
        print(f"{site_name}: Insufficient data")
        continue

    # Count of rows with a non-missing smooth/raw difference
    n_with_smooth = matched['smooth_raw_abs_pct'].notna().sum()
    smooth_raw_data[site_name] = matched
    print(f"{site_name}: {len(matched)} matched, {n_with_smooth} with smooth data")

In [None]:
# Smooth/raw difference distribution, one figure per site that has at least 3
# non-missing 'smooth_raw_abs_pct' values
for site_name, matched_df in smooth_raw_data.items():
    config = SITES[site_name]

    n_smooth_values = matched_df['smooth_raw_abs_pct'].notna().sum()
    if n_smooth_values >= 3:
        fig = plot_smooth_raw_distribution(
            matched_df, site_name, config['color'],
            thresholds=SMOOTH_RAW_THRESHOLDS,
        )
        # Show only when the helper actually returned a figure
        if fig:
            plt.show()
    else:
        print(f"{site_name}: Insufficient smooth data")

In [None]:
# Tiled plots across SMOOTH_RAW_THRESHOLDS; the per-site results returned by
# create_tiled_threshold_plots are collected for the summary table.
threshold_results = {}

for site_name, matched_df in smooth_raw_data.items():
    config = SITES[site_name]

    # Sites with fewer than 3 non-missing smooth/raw values are skipped silently
    has_enough_smooth = matched_df['smooth_raw_abs_pct'].notna().sum() >= 3
    if has_enough_smooth:
        fig, results = create_tiled_threshold_plots(
            matched_df, site_name, config['color'],
            thresholds=SMOOTH_RAW_THRESHOLDS,
        )
        plt.show()

        threshold_results[site_name] = results

In [None]:
# Summary table of the per-site results collected by the tiled threshold plots.
# threshold_results only contains sites that had at least 3 smooth/raw values.
print_comparison_table(threshold_results, metric_name='Smooth/Raw Threshold Impact')

## 7. Cross-Comparisons: HIPS, FTIR EC, Iron, Aethalometer

Compare different measurement methods across all sites.

In [None]:
# Build one table per site holding every parameter used in the cross-comparisons.
# NOTE(review): match_all_parameters receives the site code as its second
# argument, whereas the other match_* calls in this notebook pass it last -
# confirm the expected order against data_matching.py.
all_params_data = {}
candidate_columns = ('ir_bcc', 'hips_fabs', 'ftir_ec', 'iron')

for site_name, config in SITES.items():
    if site_name not in aethalometer_data:
        continue

    df_aeth = aethalometer_data[site_name]
    matched = match_all_parameters(
        site_name, config['code'], df_aeth, filter_data
    )

    if matched is None or len(matched) < 3:
        print(f"{site_name}: Insufficient data")
        continue

    all_params_data[site_name] = matched

    # List only parameters that exist as a column and hold at least one
    # non-missing value for this site
    available = []
    for col in candidate_columns:
        if col in matched.columns and matched[col].notna().any():
            available.append(col)
    print(f"{site_name}: {len(matched)} days, params: {', '.join(available)}")

In [None]:
# Draw every comparison configured in CROSS_COMPARISONS for every site and keep
# the statistics returned by plot_crossplot, keyed by site and comparison name.
comparison_results = {}

for site_name, matched_df in all_params_data.items():
    config = SITES[site_name]
    site_results = {}

    print(f"\n{'='*60}")
    print(f"{site_name}")
    print(f"{'='*60}")

    for comp in CROSS_COMPARISONS:
        x_col, y_col = comp['x_col'], comp['y_col']

        # Skip comparisons whose columns are not available for this site
        columns_present = x_col in matched_df.columns and y_col in matched_df.columns
        if not columns_present:
            continue

        x_data = matched_df[x_col].values
        y_data = matched_df[y_col].values

        # Require at least 3 rows in which neither value is NaN
        either_missing = np.isnan(x_data) | np.isnan(y_data)
        valid_count = (~either_missing).sum()
        if valid_count < 3:
            continue

        fig, ax = plt.subplots(figsize=(9, 9))

        stats = plot_crossplot(
            ax, x_data, y_data,
            comp['x_label'], comp['y_label'],
            color=config['color'],
            equal_axes=comp['equal_axes'],
            show_mac=comp['show_mac'],
        )

        ax.set_title(f"{site_name}: {comp['name']}", fontsize=14, fontweight='bold')
        plt.tight_layout()
        plt.show()

        if stats:
            site_results[comp['name']] = stats
            print(f"  {comp['name']}: R² = {stats['r_squared']:.3f}, n = {stats['n']}")

    # Stored even when empty, so every plotted site has an entry
    comparison_results[site_name] = site_results

## 8. Summary Statistics

In [None]:
# Text summary: one small table per configured comparison, one row per
# configured site. Sites without a stored result get a row of '--'.
print("\n" + "="*100)
print("SUMMARY: CROSS-COMPARISON RESULTS")
print("="*100)
print(f"\nNote: HIPS Fabs divided by MAC = {MAC_VALUE} m²/g")

for comp in CROSS_COMPARISONS:
    comp_name = comp['name']

    # Table heading for this comparison
    print(f"\n{comp_name}:")
    print(f"{'Site':<15s} {'n':>8s} {'R²':>10s} {'Slope':>10s}")
    print("-" * 45)

    for site_name in SITES:
        results_for_site = comparison_results.get(site_name, {})
        if comp_name not in results_for_site:
            print(f"{site_name:<15s} {'--':>8s} {'--':>10s} {'--':>10s}")
            continue

        stats = results_for_site[comp_name]
        print(f"{site_name:<15s} {stats['n']:>8d} {stats['r_squared']:>10.3f} {stats['slope']:>10.3f}")

In [None]:
# BC statistics by site: descriptive statistics of the 'IR BCc' column after
# dropping missing values. Every loaded site gets an entry, so a site without
# the column or without valid values is reported rather than silently omitted.
BC_COLUMN = 'IR BCc'

print("\n" + "="*80)
print("BC CONCENTRATION STATISTICS BY SITE")
print("="*80)

for site_name, df in aethalometer_data.items():
    print(f"\n{site_name}:")

    if BC_COLUMN not in df.columns:
        print(f"  Column '{BC_COLUMN}' not found - site skipped")
        continue

    bc_data = df[BC_COLUMN].dropna()
    print(f"  n = {len(bc_data)}")

    # With no valid values the statistics below would all print as 'nan'
    if bc_data.empty:
        print("  No valid BC values - statistics not available")
        continue

    print(f"  Mean: {bc_data.mean():.1f} ng/m³")
    print(f"  Median: {bc_data.median():.1f} ng/m³")
    print(f"  Std: {bc_data.std():.1f} ng/m³")
    print(f"  Range: {bc_data.min():.1f} - {bc_data.max():.1f} ng/m³")

---

## Notes

### Updating Outliers

To add new outliers:

1. Run `identify_outlier_dates()` to find exact dates
2. Edit `scripts/outliers.py` and add entries to `EXCLUDED_SAMPLES`
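3. Restart the kernel and re-run all cells (an already-imported module is cached, so edits to `outliers.py` are not picked up otherwise)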

### Configuration

Key settings are in `scripts/config.py`:
- `SITES` - Site definitions (codes, colors, files)
- `MAC_VALUE` - Mass Absorption Cross-section (default: 10 m²/g)
- `MIN_EC_THRESHOLD` - Minimum EC to include (default: 0.5 µg/m³)
- `FLOW_FIX_PERIODS` - Before/after flow fix dates

### Module Structure

```
scripts/
├── config.py        - Central configuration
├── outliers.py      - Outlier registry and functions
├── data_matching.py - Data loading and matching
└── plotting.py      - Reusable plotting functions
```