# Threshold-Filtered Regressions by Source Type

For each dominant source (Charcoal, Wood Burning, Fossil Fuel, Polluted Marine, Sea Salt),
this notebook shows how applying increasing dominance thresholds affects the regression
between BC/EC measurement methods.

**Both dominance methods are compared side-by-side:**
- **GF Method**: Normalized PMF mass fractions (sum to 1.0)
- **K_F Method**: PMF concentrations (µg/m³), normalized per sample to relative fractions before the dominance threshold is applied

**Measurement pairs analyzed:**
1. HIPS Fabs/MAC vs FTIR EC
2. Aeth IR BCc vs FTIR EC
3. Aeth IR BCc vs HIPS Fabs/MAC

---

## Setup and Imports

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Add scripts folder to path.
# A notebook has no real __file__, so the scripts folder is resolved against the
# kernel's working directory. (The previous quoted '__file__' literal resolved to
# this same directory, but read as if it were the module path.)
notebook_dir = os.getcwd()
scripts_path = os.path.join(notebook_dir, 'scripts')
if scripts_path not in sys.path:
    sys.path.insert(0, scripts_path)

from config import SITES, MAC_VALUE
from data_matching import (
    load_aethalometer_data,
    load_filter_data,
    add_base_filter_id,
    match_all_parameters,
    load_etad_factors_with_filter_ids,
)
print("Loaded config and data_matching modules")

# Configure matplotlib: dark-grid theme plus notebook-wide figure and font defaults
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 11,
    'axes.labelsize': 12,
    'axes.titlesize': 13,
})

# Create output directories
def setup_directories():
    """Create the notebook's output folders and return their paths.

    Returns
    -------
    dict
        Keys 'plots' and 'data', each mapped to a relative directory path.
        Every directory is created if missing; existing ones are left as-is.
    """
    plots_dir = 'output/plots/addis_ababa/threshold_by_source'
    data_dir = 'output/data/addis_ababa'
    for target in (plots_dir, data_dir):
        os.makedirs(target, exist_ok=True)
    return {'plots': plots_dir, 'data': data_dir}

dirs = setup_directories()
print("Setup complete!")

## Configuration

In [None]:
# Source apportionment categories: display label plus plot colour/marker per source.
# The insertion order of this dict is the order in which sources are iterated.
SOURCE_CATEGORIES = dict(
    charcoal=dict(label='Charcoal Burning', color='#2C3E50', marker='o'),
    wood=dict(label='Wood Burning', color='#8B4513', marker='s'),
    fossil_fuel=dict(label='Fossil Fuel', color='#7D3C98', marker='^'),
    polluted_marine=dict(label='Polluted Marine', color='#2980B9', marker='D'),
    sea_salt=dict(label='Sea Salt', color='#1ABC9C', marker='v'),
)
SOURCE_ORDER = list(SOURCE_CATEGORIES)

# Column mappings: factor column name in the factors table -> source key
GF_COLUMNS = {
    'GF3 (Charcoal)':               'charcoal',
    'GF2 (Wood Burning)':           'wood',
    'GF5 (Fossil Fuel Combustion)': 'fossil_fuel',
    'GF4 (Polluted Marine)':        'polluted_marine',
    'GF1 (Sea Salt Mixed)':         'sea_salt',
}

KF_COLUMNS = {
    'K_F3 Charcoal (ug/m3)':               'charcoal',
    'K_F2 Wood Burning (ug/m3)':           'wood',
    'K_F5 Fossil Fuel Combustion (ug/m3)': 'fossil_fuel',
    'K_F4 Polluted Marine (ug/m3)':        'polluted_marine',
    'K_F1 Sea Salt Mixed (ug/m3)':         'sea_salt',
}

# Thresholds to test (minimum dominant fraction a sample must reach to be kept)
DOMINANCE_THRESHOLDS = [0.30, 0.40, 0.50, 0.60]

# Measurement pairs to analyze.
# Each tuple mirrors the arguments used by the per-pair cells below:
# (x column, y column, x axis label, y axis label, output file prefix)
_EC_LABEL = 'FTIR EC (\u00b5g/m\u00b3)'
_HIPS_LABEL = 'HIPS Fabs/MAC (\u00b5g/m\u00b3)'
_AETH_LABEL = 'Aeth IR BCc (\u00b5g/m\u00b3)'
MEASUREMENT_PAIRS = [
    ('ftir_ec', 'hips_fabs', _EC_LABEL, _HIPS_LABEL, 'hips_vs_ec'),
    ('ftir_ec', 'ir_bcc', _EC_LABEL, _AETH_LABEL, 'aeth_vs_ec'),
    ('hips_fabs', 'ir_bcc', _HIPS_LABEL, _AETH_LABEL, 'aeth_vs_hips'),
]

print(f"Source categories: {', '.join(SOURCE_CATEGORIES)}")
print(f"Thresholds: {DOMINANCE_THRESHOLDS}")

## Data Loading

Load factor contributions and prepare both GF (normalized fraction) and K_F (concentration) dominance methods.

In [None]:
# =============================================================================
# Load factor contributions
# =============================================================================
def _add_dominance_columns(frame, present_cols, column_map, suffix):
    """Add per-row normalized fractions and the dominant source to `frame` in place.

    For every column in `present_cols` a '<col>_norm' column is written holding
    that column divided by the row-wise sum of `present_cols`. Two further
    columns are written: 'dominant_source_<suffix>' (the source key, taken from
    `column_map`, of the largest fraction) and 'dominant_fraction_<suffix>'
    (that largest fraction).

    Rows whose factor sum is zero, or whose factors are all missing, get NaN
    for both dominance columns instead of reaching `idxmax` with an all-NaN row.

    Returns (row_sum, norm_cols, norm_map).
    Raises ValueError when `present_cols` is empty.
    """
    if not present_cols:
        raise ValueError(
            f"No factor columns available for the '{suffix}' method; "
            f"expected some of: {list(column_map)}"
        )

    # A zero total cannot be normalized; NaN keeps 0/0 and x/0 out of the fractions
    row_sum = frame[present_cols].sum(axis=1).replace(0, np.nan)

    norm_map = {}
    for col in present_cols:
        frame[f'{col}_norm'] = frame[col] / row_sum
        norm_map[f'{col}_norm'] = column_map[col]
    norm_cols = list(norm_map)

    fractions = frame[norm_cols]
    has_data = fractions.notna().any(axis=1)
    # Fill NaN with -inf so idxmax never sees an all-NaN row (deprecated, and an
    # error in newer pandas); rows with no usable fraction are masked afterwards.
    dominant = fractions.fillna(-np.inf).idxmax(axis=1).map(norm_map).where(has_data)

    frame[f'dominant_source_{suffix}'] = dominant
    frame[f'dominant_fraction_{suffix}'] = fractions.max(axis=1)
    return row_sum, norm_cols, norm_map


factors_df = load_etad_factors_with_filter_ids()

gf_cols = [col for col in GF_COLUMNS.keys() if col in factors_df.columns]
kf_cols = [col for col in KF_COLUMNS.keys() if col in factors_df.columns]

print(f"GF columns found: {len(gf_cols)}")
print(f"K_F columns found: {len(kf_cols)}")

# --- GF Method: Normalize fractions to sum to 1.0 ---
gf_sum, gf_norm_cols, gf_norm_map = _add_dominance_columns(factors_df, gf_cols, GF_COLUMNS, 'gf')

print("\n=== GF Method (Normalized Fractions) ===")
print(f"Dominant fraction stats: mean={factors_df['dominant_fraction_gf'].mean():.1%}, "
      f"min={factors_df['dominant_fraction_gf'].min():.1%}, max={factors_df['dominant_fraction_gf'].max():.1%}")
print(factors_df['dominant_source_gf'].value_counts())

# --- K_F Method: Normalize concentrations to relative fractions ---
kf_sum, kf_norm_cols, kf_norm_map = _add_dominance_columns(factors_df, kf_cols, KF_COLUMNS, 'kf')

print("\n=== K_F Method (Normalized Concentrations) ===")
print(f"Dominant fraction stats: mean={factors_df['dominant_fraction_kf'].mean():.1%}, "
      f"min={factors_df['dominant_fraction_kf'].min():.1%}, max={factors_df['dominant_fraction_kf'].max():.1%}")
print(factors_df['dominant_source_kf'].value_counts())

In [None]:
# =============================================================================
# Load BC/EC measurements and merge with factor contributions
# =============================================================================
aethalometer_data = load_aethalometer_data()
filter_data = load_filter_data()
filter_data = add_base_filter_id(filter_data)

df_aeth = aethalometer_data.get('Addis_Ababa')
bc_df = match_all_parameters('Addis_Ababa', 'ETAD', df_aeth, filter_data)

# Get base_filter_id for merge: one (date, filter id) row per ETAD sample
etad_filters = (
    filter_data.loc[filter_data['Site'] == 'ETAD', ['SampleDate', 'FilterId']]
    .drop_duplicates()
    .rename(columns={'SampleDate': 'date', 'FilterId': 'base_filter_id'})
)
# Both sides must share a datetime dtype for the date join to match
bc_df['date'] = pd.to_datetime(bc_df['date'])
etad_filters['date'] = pd.to_datetime(etad_filters['date'])

bc_with_id = pd.merge(bc_df, etad_filters, on='date', how='left')

# Merge with factor contributions (inner join: keep only samples with factors)
merge_cols = ['base_filter_id', 'dominant_source_gf', 'dominant_fraction_gf',
              'dominant_source_kf', 'dominant_fraction_kf']

df = pd.merge(bc_with_id, factors_df[merge_cols].drop_duplicates(),
              on='base_filter_id', how='inner')

print(f"Final dataset: {len(df)} samples")
if df.empty:
    # min()/max() of an empty date column is NaT, and NaT.date() fails with an
    # unhelpful message; report the actual problem instead.
    raise ValueError(
        "No samples matched between the BC/EC measurements and the factor "
        "contributions; check that dates and base_filter_id values line up."
    )
print(f"Date range: {df['date'].min().date()} to {df['date'].max().date()}")
print(f"\nBC/EC availability:")
for col in ['ftir_ec', 'hips_fabs', 'ir_bcc']:
    if col in df.columns:
        n = df[col].notna().sum()
        print(f"  {col}: {n} samples")

## Threshold Sample Count Tables

How many samples remain at each threshold, for each source and method?

In [None]:
# One table per dominance method: rows are thresholds, columns are sources,
# cells are the number of samples whose dominant source matches at that threshold.
count_methods = (
    ('GF (Fraction)', 'dominant_source_gf', 'dominant_fraction_gf'),
    ('K_F (Concentration)', 'dominant_source_kf', 'dominant_fraction_kf'),
)

for method_name, src_col, frac_col in count_methods:
    rule = '=' * 80
    print(f"\n{rule}")
    print(f"Threshold Filtering: {method_name} Method")
    print(rule)

    header_cells = ''.join(
        f" {SOURCE_CATEGORIES[source]['label'][:10]:>10s}" for source in SOURCE_ORDER
    )
    print(f"{'Threshold':<12s}" + header_cells + f" {'Total':>10s}")
    print("-" * 80)

    # First row is the unfiltered dataset, followed by one row per threshold
    table_rows = [(f"{'All':<12s}", df)]
    for thresh in DOMINANCE_THRESHOLDS:
        row_prefix = f"\u2265{thresh*100:.0f}%" + ' ' * 8
        table_rows.append((row_prefix, df[df[frac_col] >= thresh]))

    for row_prefix, subset in table_rows:
        count_cells = ''.join(
            f" {(subset[src_col] == source).sum():>10d}" for source in SOURCE_ORDER
        )
        print(row_prefix + count_cells + f" {len(subset):>10d}")

---

## Core Plotting Function

For each source, plot a 2-row (GF / K_F) grid across thresholds.

In [None]:
def plot_source_threshold_comparison(df, source, x_col, y_col, x_label, y_label):
    """
    Plot threshold-filtered regressions of y_col on x_col for a single source.

    Creates a 2-row x (n_thresholds+1)-col grid.
    Row 0: GF method at [All] + DOMINANCE_THRESHOLDS
    Row 1: K_F method at [All] + DOMINANCE_THRESHOLDS

    Each panel keeps the rows of `df` whose dominant source (for that row's
    method) equals `source` and whose dominant fraction is >= the threshold,
    drops rows with a missing x or y value, and draws a scatter, the
    least-squares line and a dashed 1:1 line. Panels with fewer than 3 valid
    points, or whose x values are all identical (no slope can be fitted), are
    annotated with a message and left out of the results.

    Parameters
    ----------
    df : DataFrame
        Must contain 'dominant_source_gf', 'dominant_fraction_gf',
        'dominant_source_kf', 'dominant_fraction_kf', x_col and y_col.
    source : str
        Key of SOURCE_CATEGORIES (supplies title label, colour and marker).
    x_col, y_col : str
        Columns used as regression x and y.
    x_label, y_label : str
        Axis labels; also used in the figure title.

    Returns
    -------
    (fig, results_dict)
        results_dict[method_name][threshold_label] is a dict with keys
        'r_squared', 'slope', 'intercept', 'p_value' and 'n'. Threshold labels
        are 'All' or '\u2265NN%'.
    """
    thresholds = [None] + DOMINANCE_THRESHOLDS  # None = no threshold
    n_cols = len(thresholds)
    source_info = SOURCE_CATEGORIES[source]

    # squeeze=False keeps `axes` 2-D even when there is a single column
    fig, axes = plt.subplots(2, n_cols, figsize=(4.5 * n_cols, 9), squeeze=False)

    methods = [
        ('GF (Fraction)', 'dominant_source_gf', 'dominant_fraction_gf'),
        ('K_F (Concentration)', 'dominant_source_kf', 'dominant_fraction_kf'),
    ]

    results = {}

    for row_idx, (method_name, src_col, frac_col) in enumerate(methods):
        results[method_name] = {}

        # Rows attributed to this source under this method; same for every threshold
        source_data = df[df[src_col] == source]

        for col_idx, thresh in enumerate(thresholds):
            ax = axes[row_idx, col_idx]

            if thresh is None:
                subset = source_data
            else:
                subset = source_data[source_data[frac_col] >= thresh]

            valid = subset[[x_col, y_col]].dropna()
            n_valid = len(valid)
            thresh_label = 'All' if thresh is None else f'\u2265{thresh*100:.0f}%'

            # Decoration shared by populated and empty panels
            ax.set_title(f'{thresh_label}\n({method_name})', fontsize=9)
            ax.grid(True, alpha=0.3)
            if col_idx == 0:
                ax.set_ylabel(y_label, fontsize=9)
            if row_idx == 1:
                ax.set_xlabel(x_label, fontsize=9)

            if n_valid < 3:
                ax.text(0.5, 0.5, f'n={n_valid}\nInsufficient',
                        transform=ax.transAxes, ha='center', va='center', fontsize=10)
                continue

            x = valid[x_col].values
            y = valid[y_col].values

            # linregress raises ValueError when every x value is identical
            if (x == x[0]).all():
                ax.text(0.5, 0.5, f'n={n_valid}\nConstant x, no fit',
                        transform=ax.transAxes, ha='center', va='center', fontsize=10)
                continue

            ax.scatter(x, y, s=35, alpha=0.6, color=source_info['color'],
                       marker=source_info['marker'], edgecolors='black', linewidth=0.2)

            slope, intercept, r, p, se = stats.linregress(x, y)

            ax_max = max(x.max(), y.max()) * 1.1
            if not (np.isfinite(ax_max) and ax_max > 0):
                # Avoid a zero-width or inverted axis range when the data are all <= 0
                ax_max = 1.0
            x_fit = np.linspace(0, ax_max, 50)
            ax.plot(x_fit, slope * x_fit + intercept, 'k-', linewidth=1.5, alpha=0.7)
            ax.plot([0, ax_max], [0, ax_max], 'k--', linewidth=1, alpha=0.3)  # 1:1 line

            sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else 'ns'
            # '+' format flag prints the intercept's own sign (no more "x+-0.12")
            ax.text(0.03, 0.97,
                    f'y={slope:.2f}x{intercept:+.2f}\nR\u00b2={r**2:.3f} ({sig})\nn={n_valid}',
                    transform=ax.transAxes, fontsize=8, va='top',
                    bbox=dict(boxstyle='round', facecolor='white', alpha=0.9))

            ax.set_xlim(0, ax_max)
            ax.set_ylim(0, ax_max)
            ax.set_aspect('equal')

            results[method_name][thresh_label] = {
                'r_squared': r**2, 'slope': slope, 'intercept': intercept,
                'p_value': p, 'n': n_valid
            }

    fig.suptitle(f'{source_info["label"]} — {y_label} vs {x_label}\nThreshold Comparison: GF vs K_F Method',
                 fontsize=13, fontweight='bold', y=1.03)
    plt.tight_layout()

    return fig, results


def print_source_results_table(source, results):
    """Print a summary table for one source's threshold results.

    Parameters
    ----------
    source : str
        Key of SOURCE_CATEGORIES; its label is used as the table heading.
    results : dict
        Mapping of method name ('GF (Fraction)', 'K_F (Concentration)') to a
        dict keyed by threshold label ('All', '\u2265NN%'), whose values hold at
        least 'n', 'r_squared' and 'slope'. Missing methods or thresholds are
        printed as '-'.
    """
    # Built outside the f-strings: a backslash escape inside an f-string
    # replacement field is a SyntaxError before Python 3.12.
    r2_gf_header = 'R\u00b2 (GF)'
    r2_kf_header = 'R\u00b2 (K_F)'

    def _cells(entry):
        """Return the (n, R², slope) strings for one result entry, or dashes."""
        if not entry:
            return '-', '-', '-'
        return f"{entry['n']}", f"{entry['r_squared']:.3f}", f"{entry['slope']:.3f}"

    print(f"\n--- {SOURCE_CATEGORIES[source]['label']} ---")
    print(f"{'Threshold':<12s} {'n (GF)':>8s} {r2_gf_header:>10s} {'Slope (GF)':>12s} | "
          f"{'n (K_F)':>8s} {r2_kf_header:>10s} {'Slope (K_F)':>12s}")
    print("-" * 80)

    gf = results.get('GF (Fraction)', {})
    kf = results.get('K_F (Concentration)', {})

    # Use the same '.0f' formatting that produces the result keys; int() truncates
    # and disagrees for thresholds such as 0.29 (28.999... -> '28' vs '29').
    threshold_keys = ['All'] + [f'\u2265{t*100:.0f}%' for t in DOMINANCE_THRESHOLDS]

    for key in threshold_keys:
        gf_n, gf_r2, gf_sl = _cells(gf.get(key, {}))
        kf_n, kf_r2, kf_sl = _cells(kf.get(key, {}))

        print(f"{key:<12s} {gf_n:>8s} {gf_r2:>10s} {gf_sl:>12s} | "
              f"{kf_n:>8s} {kf_r2:>10s} {kf_sl:>12s}")


print("Functions defined.")

---

# 1. HIPS Fabs/MAC vs FTIR EC — By Source & Threshold

In [None]:
print("=" * 80)
print("HIPS vs FTIR EC — Per-Source Threshold Comparison")
print("=" * 80)

# source key -> regression results returned by plot_source_threshold_comparison
hips_ec_results = {}

for source in SOURCE_ORDER:
    fig, results = plot_source_threshold_comparison(
        df, source, 'ftir_ec', 'hips_fabs',
        'FTIR EC (\u00b5g/m\u00b3)', 'HIPS Fabs/MAC (\u00b5g/m\u00b3)'
    )
    # Save through the figure handle rather than pyplot's implicit current figure
    fig.savefig(os.path.join(dirs['plots'], f'hips_vs_ec_{source}_thresholds.png'),
                dpi=150, bbox_inches='tight')
    plt.show()
    # Release the figure so figures do not accumulate across loop iterations
    plt.close(fig)

    hips_ec_results[source] = results
    print_source_results_table(source, results)

---

# 2. Aeth IR BCc vs FTIR EC — By Source & Threshold

In [None]:
print("=" * 80)
print("Aeth IR BCc vs FTIR EC — Per-Source Threshold Comparison")
print("=" * 80)

# source key -> regression results returned by plot_source_threshold_comparison
aeth_ec_results = {}

for source in SOURCE_ORDER:
    fig, results = plot_source_threshold_comparison(
        df, source, 'ftir_ec', 'ir_bcc',
        'FTIR EC (\u00b5g/m\u00b3)', 'Aeth IR BCc (\u00b5g/m\u00b3)'
    )
    # Save through the figure handle rather than pyplot's implicit current figure
    fig.savefig(os.path.join(dirs['plots'], f'aeth_vs_ec_{source}_thresholds.png'),
                dpi=150, bbox_inches='tight')
    plt.show()
    # Release the figure so figures do not accumulate across loop iterations
    plt.close(fig)

    aeth_ec_results[source] = results
    print_source_results_table(source, results)

---

# 3. Aeth IR BCc vs HIPS — By Source & Threshold

In [None]:
print("=" * 80)
print("Aeth IR BCc vs HIPS — Per-Source Threshold Comparison")
print("=" * 80)

# source key -> regression results returned by plot_source_threshold_comparison
aeth_hips_results = {}

for source in SOURCE_ORDER:
    fig, results = plot_source_threshold_comparison(
        df, source, 'hips_fabs', 'ir_bcc',
        'HIPS Fabs/MAC (\u00b5g/m\u00b3)', 'Aeth IR BCc (\u00b5g/m\u00b3)'
    )
    # Save through the figure handle rather than pyplot's implicit current figure
    fig.savefig(os.path.join(dirs['plots'], f'aeth_vs_hips_{source}_thresholds.png'),
                dpi=150, bbox_inches='tight')
    plt.show()
    # Release the figure so figures do not accumulate across loop iterations
    plt.close(fig)

    aeth_hips_results[source] = results
    print_source_results_table(source, results)

---

# R² Summary: All Sources, All Pairs, Both Methods

In [None]:
# Pair name -> {source -> {method -> {threshold label -> regression stats}}}
all_results = {
    'HIPS vs EC': hips_ec_results,
    'Aeth vs EC': aeth_ec_results,
    'Aeth vs HIPS': aeth_hips_results,
}

# Build summary DataFrame: one row per (pair, source, method, threshold) that
# produced a regression
summary_rows = []
for pair_name, pair_results in all_results.items():
    for source in SOURCE_ORDER:
        src_results = pair_results.get(source, {})
        for method_name in ['GF (Fraction)', 'K_F (Concentration)']:
            method_results = src_results.get(method_name, {})
            for thresh_key, stats_dict in method_results.items():
                summary_rows.append({
                    'Pair': pair_name,
                    'Source': SOURCE_CATEGORIES[source]['label'],
                    'Method': method_name,
                    'Threshold': thresh_key,
                    'n': stats_dict['n'],
                    'R_squared': stats_dict['r_squared'],
                    'Slope': stats_dict['slope'],
                    'p_value': stats_dict['p_value'],
                })

# Explicit columns keep the frame well-formed (and filterable by 'Pair') even
# when no panel had enough data to produce a row.
summary_df = pd.DataFrame(
    summary_rows,
    columns=['Pair', 'Source', 'Method', 'Threshold', 'n', 'R_squared', 'Slope', 'p_value'],
)

print("\n" + "=" * 100)
print("R\u00b2 SUMMARY TABLE")
print("=" * 100)

# Same '.0f' formatting as the result keys; int() truncates and can drop
# columns for thresholds such as 0.29 (28.999... -> '28' vs '29').
col_order = ['All'] + [f'\u2265{t*100:.0f}%' for t in DOMINANCE_THRESHOLDS]

for pair_name in all_results.keys():
    print(f"\n--- {pair_name} ---")
    pair_data = summary_df[summary_df['Pair'] == pair_name]
    if pair_data.empty:
        print("No regressions available for this pair.")
        continue
    pivot = pair_data.pivot_table(
        index=['Source', 'Method'],
        columns='Threshold',
        values='R_squared'
    )
    # Reorder columns
    pivot = pivot.reindex(columns=[c for c in col_order if c in pivot.columns])
    print(pivot.round(3).to_string())

---

# R² Trend Visualization

How does R² change with increasing threshold, for each source and method?

In [None]:
# One panel per measurement pair; squeeze=False keeps `axes` indexable for any count
n_pairs = len(all_results)
fig, axes = plt.subplots(1, n_pairs, figsize=(6 * n_pairs, 6), squeeze=False)

# Same '.0f' formatting as the result keys; int() truncates and can miss
# thresholds such as 0.29 (28.999... -> '28' vs '29').
thresh_labels = ['All'] + [f'\u2265{t*100:.0f}%' for t in DOMINANCE_THRESHOLDS]
x_pos = range(len(thresh_labels))

# (results key, legend tag, line style) for each dominance method
method_styles = [('GF (Fraction)', 'GF', '-'), ('K_F (Concentration)', 'K_F', '--')]

for ax_idx, (pair_name, pair_results) in enumerate(all_results.items()):
    ax = axes[0, ax_idx]

    for source in SOURCE_ORDER:
        src_results = pair_results.get(source, {})

        for method_name, method_short, linestyle in method_styles:
            method_results = src_results.get(method_name, {})
            # Only thresholds that produced a regression are plotted
            valid_x = [i for i, key in enumerate(thresh_labels) if key in method_results]
            r2_values = [method_results[thresh_labels[i]]['r_squared'] for i in valid_x]

            if r2_values:
                ax.plot(valid_x, r2_values, linestyle=linestyle,
                        color=SOURCE_CATEGORIES[source]['color'],
                        marker=SOURCE_CATEGORIES[source]['marker'],
                        markersize=6, linewidth=1.5, alpha=0.8,
                        label=f"{SOURCE_CATEGORIES[source]['label'][:8]} ({method_short})")

    ax.set_xticks(list(x_pos))
    ax.set_xticklabels(thresh_labels, fontsize=9)
    ax.set_xlabel('Dominance Threshold', fontsize=11)
    ax.set_ylabel('R\u00b2', fontsize=11)
    ax.set_title(pair_name, fontsize=12, fontweight='bold')
    ax.set_ylim(0, 1.05)
    ax.grid(True, alpha=0.3)
    # Skip the legend when nothing was plotted (avoids the "no artists" warning)
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        ax.legend(fontsize=6, loc='lower left', ncol=2)

fig.suptitle('R\u00b2 vs Dominance Threshold — By Source & Method\n(solid = GF, dashed = K_F)',
             fontsize=14, fontweight='bold', y=1.04)
plt.tight_layout()
fig.savefig(os.path.join(dirs['plots'], 'r2_vs_threshold_all.png'), dpi=150, bbox_inches='tight')
plt.show()

---

# Export Results

In [None]:
# Write the regression summary table to the data output folder
output_path = os.path.join(dirs['data'], 'threshold_by_source_summary.csv')
summary_df.to_csv(output_path, index=False)
print(f"Saved summary to: {output_path}")

# Closing banner followed by the output locations
banner = "=" * 80
print()
print(banner, "ANALYSIS COMPLETE", banner, sep="\n")
print()
print(f"Plots saved to: {dirs['plots']}")
print(f"Data saved to: {dirs['data']}")