# Addis Ababa: Dominant Source Determination Comparison

This notebook compares two methods for determining the dominant aerosol source:

1. **Fraction-based (GF columns)**: Uses raw PMF contribution fractions
2. **Concentration-based (K_F columns)**: Uses absolute source concentrations

## Key Difference from Previous Analysis:
The previous notebook normalized GF fractions so they sum to 1.0. This notebook uses
`idxmax()` directly on the raw columns to find whichever source has the highest value.

---

## Setup and Imports

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from matplotlib.dates import MonthLocator, DateFormatter
import warnings
warnings.filterwarnings('ignore')

# Make the project's `scripts/` folder importable.
# The base directory is the kernel's current working directory (not the location
# of the notebook file), so `scripts/` must sit next to wherever the kernel runs.
notebook_dir = os.path.abspath(os.curdir)
scripts_path = os.path.join(notebook_dir, 'scripts')
if sys.path.count(scripts_path) == 0:
    sys.path.insert(0, scripts_path)

from config import SITES, MAC_VALUE
from data_matching import (
    load_aethalometer_data,
    load_filter_data,
    add_base_filter_id,
    match_all_parameters,
    load_etad_factors_with_filter_ids,
)
print("Loaded config and data_matching modules")

# Notebook-wide matplotlib look: seaborn dark-grid theme plus default sizes
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 11,
    'axes.labelsize': 12,
    'axes.titlesize': 13,
})

# Create output directories
def setup_directories():
    """Create the notebook's output folders and return their paths.

    Returns:
        dict: ``'plots'`` and ``'data'`` mapped to relative directory paths.
        Both directories exist when the function returns; already-existing
        directories are left untouched.
    """
    output_dirs = dict(
        plots='output/plots/addis_ababa/dominant_source_comparison',
        data='output/data/addis_ababa',
    )
    for key in output_dirs:
        os.makedirs(output_dirs[key], exist_ok=True)
    return output_dirs

# Create the output folders once; `dirs['plots']` and `dirs['data']` are the
# destinations used by every savefig()/to_csv() call further down.
dirs = setup_directories()
print("Setup complete!")
# MAC_VALUE comes from the project's `config` module imported above.
print(f"MAC value: {MAC_VALUE} m²/g")

## Configuration

In [None]:
# Site configuration
ADDIS_CONFIG = dict(
    name='Addis_Ababa',
    code='ETAD',
    timezone='Africa/Addis_Ababa',
)

# Ethiopian seasons: season name -> calendar month numbers
SEASONS = {
    'Dry Season': [10, 11, 12, 1, 2],
    'Belg Rainy Season': [3, 4, 5],
    'Kiremt Rainy Season': [6, 7, 8, 9],
}
# Display order follows the declaration order of SEASONS
SEASONS_ORDER = list(SEASONS)
SEASON_COLORS = dict(zip(SEASONS_ORDER, ('#E67E22', '#27AE60', '#3498DB')))

# Source apportionment categories: key -> display label, colour and scatter marker
SOURCE_CATEGORIES = {
    'charcoal': dict(label='Charcoal Burning', color='#2C3E50', marker='o'),
    'wood': dict(label='Wood Burning', color='#8B4513', marker='s'),
    'fossil_fuel': dict(label='Fossil Fuel', color='#7D3C98', marker='^'),
    'polluted_marine': dict(label='Polluted Marine', color='#2980B9', marker='D'),
    'sea_salt': dict(label='Sea Salt', color='#1ABC9C', marker='v'),
}
# Plot/table order follows the declaration order of SOURCE_CATEGORIES
SOURCE_ORDER = list(SOURCE_CATEGORIES)

# Column mappings for GF (fraction) and K_F (concentration) columns.
# Keys are column names in the factor table; values are SOURCE_CATEGORIES keys.
GF_COLUMNS = {
    'GF3 (Charcoal)': 'charcoal',
    'GF2 (Wood Burning)': 'wood',
    'GF5 (Fossil Fuel Combustion)': 'fossil_fuel',
    'GF4 (Polluted Marine)': 'polluted_marine',
    'GF1 (Sea Salt Mixed)': 'sea_salt',
}

KF_COLUMNS = {
    'K_F3 Charcoal (ug/m3)': 'charcoal',
    'K_F2 Wood Burning (ug/m3)': 'wood',
    'K_F5 Fossil Fuel Combustion (ug/m3)': 'fossil_fuel',
    'K_F4 Polluted Marine (ug/m3)': 'polluted_marine',
    'K_F1 Sea Salt Mixed (ug/m3)': 'sea_salt',
}

# All method pairs to analyze: (x_col, y_col, x_label, y_label, file_prefix)
METHOD_PAIRS = [
    (
        'ftir_ec', 'hips_fabs',
        'FTIR EC (µg/m³)', 'HIPS Fabs/MAC (µg/m³)',
        'hips_vs_ec',
    ),
    (
        'ftir_ec', 'ir_bcc',
        'FTIR EC (µg/m³)', 'Aeth IR BCc (µg/m³)',
        'aeth_ir_vs_ec',
    ),
    (
        'hips_fabs', 'ir_bcc',
        'HIPS Fabs/MAC (µg/m³)', 'Aeth IR BCc (µg/m³)',
        'aeth_ir_vs_hips',
    ),
]

# Dominant source methods to compare: (result column, display label, short tag)
DOMINANCE_METHODS = [
    ('dominant_source_frac', 'GF (Fraction)', 'GF'),
    ('dominant_source_conc', 'K_F (Concentration)', 'KF'),
]

# Thresholds for filtering
DOMINANCE_THRESHOLDS = [0.30, 0.40, 0.50, 0.60]

# Echo the active configuration so the run log records it
print(f"Site: {ADDIS_CONFIG['name']}")
print(f"Source categories: {', '.join(SOURCE_CATEGORIES.keys())}")
print(f"Method pairs to analyze: {len(METHOD_PAIRS)}")
for x_col, y_col, x_lab, y_lab, prefix in METHOD_PAIRS:
    print(f"  {y_lab} vs {x_lab}")
print(f"Dominance methods: GF (Fraction), K_F (Concentration)")

## Data Loading

Load the factor contributions, keeping both the GF (fraction) and K_F (concentration) columns as raw, unnormalized values.

In [None]:
# =============================================================================
# Load factor contributions with Filter IDs (joined via oldDate)
# =============================================================================
factors_df = load_etad_factors_with_filter_ids()

# Quick inventory of what was loaded: column names and row count
print("Available columns in factors_df:")
print(list(factors_df.columns))
print(f"\nTotal samples: {factors_df.shape[0]}")

In [None]:
# =============================================================================
# Identify GF and K_F columns present in the data
# =============================================================================
# Keep the configured order; only columns that actually exist in factors_df survive
gf_cols_present = [name for name in GF_COLUMNS if name in factors_df.columns]
kf_cols_present = [name for name in KF_COLUMNS if name in factors_df.columns]

print(f"GF columns found: {gf_cols_present}")
print(f"K_F columns found: {kf_cols_present}")

# Show summary of raw values (missing entries are dropped before summarising)
print("\n--- GF Column Statistics (Raw Fractions) ---")
for col in gf_cols_present:
    vals = factors_df[col].dropna()
    col_mean, col_min, col_max = vals.mean(), vals.min(), vals.max()
    print(f"{col}: mean={col_mean:.4f}, min={col_min:.4f}, max={col_max:.4f}")

print("\n--- K_F Column Statistics (Concentrations, µg/m³) ---")
for col in kf_cols_present:
    vals = factors_df[col].dropna()
    col_mean, col_min, col_max = vals.mean(), vals.min(), vals.max()
    print(f"{col}: mean={col_mean:.3f}, min={col_min:.3f}, max={col_max:.3f}")

In [None]:
# =============================================================================
# Method 1: Determine dominant source using GF columns (fraction-based)
# Uses idxmax() on raw GF values
# =============================================================================
if gf_cols_present:
    gf_values = factors_df[gf_cols_present]

    # A sample with no GF value at all has no dominant source. Calling idxmax()
    # on an all-NaN row is deprecated in pandas 2.x (the warning is hidden by the
    # global warnings filter set in the imports cell), so missing cells are
    # filled with -inf for the comparison and the all-missing rows are masked
    # back to NaN afterwards. Rows with at least one value are unaffected.
    has_gf = gf_values.notna().any(axis=1).to_numpy()
    factors_df['dominant_gf'] = gf_values.fillna(-np.inf).idxmax(axis=1).where(has_gf)
    factors_df['dominant_source_frac'] = factors_df['dominant_gf'].map(GF_COLUMNS)
    factors_df['dominant_gf_value'] = gf_values.max(axis=1)

    # Calculate relative fraction (for comparison with normalized approach).
    # A row total of exactly zero is masked so the ratio is NaN rather than +/-inf,
    # which would otherwise distort the min/max printed below.
    gf_sum = gf_values.sum(axis=1)
    factors_df['dominant_gf_relative'] = factors_df['dominant_gf_value'] / gf_sum.where(gf_sum != 0)

    print("\n=== Method 1: Dominant Source by GF (Fraction) ===")
    print("\nDominant source distribution (fraction-based):")
    print(factors_df['dominant_source_frac'].value_counts())
    print(f"\nDominant GF value stats: mean={factors_df['dominant_gf_value'].mean():.4f}, "
          f"min={factors_df['dominant_gf_value'].min():.4f}, max={factors_df['dominant_gf_value'].max():.4f}")
    print(f"Relative fraction stats: mean={factors_df['dominant_gf_relative'].mean():.1%}, "
          f"min={factors_df['dominant_gf_relative'].min():.1%}, max={factors_df['dominant_gf_relative'].max():.1%}")

In [None]:
# =============================================================================
# Method 2: Determine dominant source using K_F columns (concentration-based)
# Uses idxmax() on raw K_F values
# =============================================================================
if kf_cols_present:
    kf_values = factors_df[kf_cols_present]

    # A sample with no K_F value at all has no dominant source. Calling idxmax()
    # on an all-NaN row is deprecated in pandas 2.x (the warning is hidden by the
    # global warnings filter set in the imports cell), so missing cells are
    # filled with -inf for the comparison and the all-missing rows are masked
    # back to NaN afterwards. Rows with at least one value are unaffected.
    has_kf = kf_values.notna().any(axis=1).to_numpy()
    factors_df['dominant_kf'] = kf_values.fillna(-np.inf).idxmax(axis=1).where(has_kf)
    factors_df['dominant_source_conc'] = factors_df['dominant_kf'].map(KF_COLUMNS)
    factors_df['dominant_kf_value'] = kf_values.max(axis=1)

    # Calculate relative concentration.
    # A row total of exactly zero is masked so the ratio is NaN rather than +/-inf,
    # which would otherwise distort the min/max printed below.
    kf_sum = kf_values.sum(axis=1)
    factors_df['dominant_kf_relative'] = factors_df['dominant_kf_value'] / kf_sum.where(kf_sum != 0)

    print("\n=== Method 2: Dominant Source by K_F (Concentration) ===")
    print("\nDominant source distribution (concentration-based):")
    print(factors_df['dominant_source_conc'].value_counts())
    print(f"\nDominant K_F value stats: mean={factors_df['dominant_kf_value'].mean():.3f} µg/m³, "
          f"min={factors_df['dominant_kf_value'].min():.3f}, max={factors_df['dominant_kf_value'].max():.3f}")
    print(f"Relative concentration stats: mean={factors_df['dominant_kf_relative'].mean():.1%}, "
          f"min={factors_df['dominant_kf_relative'].min():.1%}, max={factors_df['dominant_kf_relative'].max():.1%}")

## Compare the Two Methods

In [None]:
# =============================================================================
# Compare GF vs K_F dominant source assignments
# =============================================================================
if 'dominant_source_frac' in factors_df.columns and 'dominant_source_conc' in factors_df.columns:
    frac_source = factors_df['dominant_source_frac']
    conc_source = factors_df['dominant_source_conc']

    # Agreement check.
    # Only samples classified by BOTH methods can agree or disagree. A sample that
    # is unclassified by either method compares as "not equal" (NaN == NaN is
    # False), so it is excluded from the denominator instead of being counted as
    # a disagreement. This also matches pd.crosstab below, which drops NaN rows.
    both_assigned = frac_source.notna() & conc_source.notna()
    agreement_mask = (frac_source == conc_source) & both_assigned
    n_compared = int(both_assigned.sum())
    n_agree = int(agreement_mask.sum())
    agreement_pct = n_agree / n_compared * 100 if n_compared else float('nan')

    print("=" * 80)
    print("COMPARISON: GF (Fraction) vs K_F (Concentration) Dominant Source")
    print("=" * 80)
    print(f"\nAgreement rate: {agreement_pct:.1f}% ({n_agree}/{n_compared} samples)")
    n_unassigned = len(factors_df) - n_compared
    if n_unassigned:
        print(f"Excluded from comparison (no dominant source in one or both methods): {n_unassigned} samples")

    # Cross-tabulation
    print("\nCross-tabulation (rows=GF method, cols=K_F method):")
    crosstab = pd.crosstab(frac_source, conc_source, margins=True)
    print(crosstab)

    # Show disagreements (samples where both methods assigned a source, but a different one).
    # NOTE(review): a 'date' column is assumed to exist in factors_df; it is
    # skipped here if absent rather than raising a KeyError.
    detail_cols = ['date', 'dominant_source_frac', 'dominant_source_conc',
                   'dominant_gf_value', 'dominant_kf_value']
    detail_cols = [c for c in detail_cols if c in factors_df.columns]
    disagreements = factors_df.loc[both_assigned & ~agreement_mask, detail_cols].copy()
    if len(disagreements) > 0:
        print(f"\nDisagreements ({len(disagreements)} samples):")
        print(disagreements.head(20).to_string())

In [None]:
# =============================================================================
# Visualize method comparison
# =============================================================================
def _draw_dominant_counts(ax, column, panel_title):
    """Draw a bar per source with the number of samples it dominates in `column`.

    Reads `factors_df`, `SOURCE_ORDER` and `SOURCE_CATEGORIES` from the notebook
    namespace and returns the per-source counts (in SOURCE_ORDER order).
    """
    counts = factors_df[column].value_counts().reindex(SOURCE_ORDER, fill_value=0)
    positions = range(len(SOURCE_ORDER))
    ax.bar(positions, counts.values,
           color=[SOURCE_CATEGORIES[s]['color'] for s in SOURCE_ORDER],
           edgecolor='black')
    ax.set_xticks(positions)
    ax.set_xticklabels([SOURCE_CATEGORIES[s]['label'] for s in SOURCE_ORDER], rotation=45, ha='right')
    ax.set_ylabel('Sample Count')
    ax.set_title(panel_title, fontweight='bold')
    # Count label just above each bar
    for position, count in zip(positions, counts.values):
        ax.text(position, count + 1, str(count), ha='center', fontsize=10)
    return counts


fig, axes = plt.subplots(1, 3, figsize=(16, 5))
have_frac = 'dominant_source_frac' in factors_df.columns
have_conc = 'dominant_source_conc' in factors_df.columns

# Panel 1: GF-based dominant source distribution
if have_frac:
    counts_gf = _draw_dominant_counts(axes[0], 'dominant_source_frac', 'Method 1: GF (Fraction) Based')

# Panel 2: K_F-based dominant source distribution
if have_conc:
    counts_kf = _draw_dominant_counts(axes[1], 'dominant_source_conc', 'Method 2: K_F (Concentration) Based')

# Panel 3: Agreement heatmap (uses `agreement_pct` from the comparison cell above)
ax = axes[2]
if have_frac and have_conc:
    short_labels = [SOURCE_CATEGORIES[s]['label'][:8] for s in SOURCE_ORDER]
    crosstab_ordered = pd.crosstab(
        factors_df['dominant_source_frac'],
        factors_df['dominant_source_conc'],
    ).reindex(index=SOURCE_ORDER, columns=SOURCE_ORDER, fill_value=0)
    sns.heatmap(crosstab_ordered, annot=True, fmt='d', cmap='YlOrRd', ax=ax,
                xticklabels=short_labels, yticklabels=short_labels)
    ax.set_xlabel('K_F (Concentration) Method')
    ax.set_ylabel('GF (Fraction) Method')
    ax.set_title(f'Agreement Matrix\n({agreement_pct:.1f}% agreement)', fontweight='bold')

fig.tight_layout()
fig.savefig(os.path.join(dirs['plots'], 'method_comparison_overview.png'), dpi=150, bbox_inches='tight')
plt.show()

## Merge with BC/EC Measurements

In [None]:
# =============================================================================
# Load aethalometer + filter measurements and match by date
# =============================================================================
aethalometer_data = load_aethalometer_data()
filter_data = load_filter_data()
filter_data = add_base_filter_id(filter_data)

df_aeth = aethalometer_data.get('Addis_Ababa')
bc_df = match_all_parameters('Addis_Ababa', 'ETAD', df_aeth, filter_data)

# Get the base_filter_id for each bc_df date
# NOTE(review): the join key is built by renaming `FilterId`, not from whatever
# add_base_filter_id() added above — confirm that FilterId values are directly
# comparable with factors_df['base_filter_id'].
etad_filters = filter_data[filter_data['Site'] == 'ETAD'][['SampleDate', 'FilterId']].drop_duplicates()
etad_filters = etad_filters.rename(columns={'SampleDate': 'date', 'FilterId': 'base_filter_id'})
bc_df['date'] = pd.to_datetime(bc_df['date'])
etad_filters['date'] = pd.to_datetime(etad_filters['date'])

# Left join keeps every matched BC/EC row; a date with several filter IDs
# yields one row per ID.
bc_with_id = pd.merge(bc_df, etad_filters, on='date', how='left')

# Merge with factor contributions (only columns that were actually computed)
merge_cols = ['base_filter_id', 'dominant_source_frac', 'dominant_source_conc',
              'dominant_gf_value', 'dominant_kf_value', 
              'dominant_gf_relative', 'dominant_kf_relative']
merge_cols = [c for c in merge_cols if c in factors_df.columns]

df = pd.merge(bc_with_id, factors_df[merge_cols].drop_duplicates(),
              on='base_filter_id', how='inner')

# Add temporal features.
# The month -> season lookup is derived from SEASONS, so a month that belongs to
# no configured season (e.g. a missing date) stays NaN instead of silently
# falling through to 'Kiremt Rainy Season'.
month_to_season = {month: season for season, months in SEASONS.items() for month in months}
df['Month'] = df['date'].dt.month
df['Ethiopian_Season'] = df['Month'].map(month_to_season)

print(f"\nFinal dataset: {len(df)} samples")
if df['date'].notna().any():
    print(f"Date range: {df['date'].min().date()} to {df['date'].max().date()}")
else:
    # min()/max() are NaT when nothing matched, and NaT has no usable .date()
    print("Date range: n/a (no dated samples after merging)")
print(f"\nBC/EC availability:")
for col in ['ftir_ec', 'hips_fabs', 'ir_bcc']:
    if col in df.columns:
        n = df[col].notna().sum()
        print(f"  {col}: {n} samples")

---

# Source-Separated Regressions: GF (Fraction) Method — All Method Pairs

In [None]:
def plot_regression(df, x_col, y_col, x_label, y_label, title, color_by=None,
                    color_dict=None, ax=None, show_stats=True):
    """
    Create a regression scatter plot with statistics.

    Rows where ``x_col`` or ``y_col`` is missing are dropped. The remaining points
    are drawn (one scatter series per ``color_by`` category when ``color_by`` and
    ``color_dict`` are given), then a least-squares line from
    ``scipy.stats.linregress`` and a 1:1 reference line are overlaid on a square
    axis that starts at 0 on both sides.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; must contain ``x_col`` and ``y_col``.
    x_col, y_col : str
        Columns plotted on the x and y axes.
    x_label, y_label, title : str
        Axis labels and panel title.
    color_by : str, optional
        Column whose values split the points into separately styled series.
        Ignored if the column is not in ``df``.
    color_dict : dict, optional
        Maps a ``color_by`` value to a dict with optional ``'color'``,
        ``'label'`` and ``'marker'`` entries. Unknown categories are drawn gray.
    ax : matplotlib Axes, optional
        Axes to draw on; a new 8x7 figure is created when omitted.
    show_stats : bool
        Whether to annotate the equation, R², significance stars and n.

    Returns
    -------
    (fig, results)
        ``results`` is a dict with ``slope``, ``intercept``, ``r``, ``r_squared``,
        ``p_value``, ``se`` and ``n``; it is ``None`` when fewer than 3 complete
        pairs exist or when every x value is identical (no line can be fitted).
    """
    if ax is None:
        fig, ax = plt.subplots(figsize=(8, 7))
    else:
        fig = ax.figure

    # Pull all needed columns in a single selection. Joining the colour column
    # back on the index would duplicate rows whenever the index is not unique.
    use_color = bool(color_by) and color_by in df.columns
    columns = [x_col, y_col]
    if use_color and color_by not in columns:
        columns.append(color_by)
    valid = df[columns].dropna(subset=[x_col, y_col])
    n_valid = len(valid)

    x = valid[x_col].to_numpy()
    y = valid[y_col].to_numpy()

    if n_valid < 3:
        problem = f'Insufficient data\n(n={n_valid})'
    elif x.min() == x.max():
        # A constant predictor has no defined slope; linregress cannot fit it
        problem = f'No regression possible\n(all x values identical, n={n_valid})'
    else:
        problem = None
    if problem is not None:
        ax.text(0.5, 0.5, problem,
                transform=ax.transAxes, ha='center', va='center', fontsize=14)
        ax.set_title(title)
        return fig, None

    # Plot points
    if use_color and color_dict:
        categories = valid[color_by]
        for category in categories.unique():
            if pd.isna(category):
                continue
            # Positional boolean mask, independent of the index labels
            mask = (categories == category).to_numpy()
            cat_info = color_dict.get(category, {'color': 'gray', 'label': category, 'marker': 'o'})
            ax.scatter(x[mask], y[mask],
                      s=60, alpha=0.7, color=cat_info.get('color', 'gray'),
                      marker=cat_info.get('marker', 'o'),
                      edgecolors='black', linewidth=0.3,
                      label=f"{cat_info.get('label', category)} (n={mask.sum()})")
    else:
        ax.scatter(x, y, s=60, alpha=0.6, color='#3498DB', edgecolors='black', linewidth=0.3)

    # Regression
    slope, intercept, r, p, se = stats.linregress(x, y)
    r_squared = r ** 2

    # Plot regression line and 1:1; both axes share one upper limit (10% headroom)
    ax_max = max(x.max(), y.max()) * 1.1
    x_fit = np.linspace(0, ax_max, 100)
    ax.plot(x_fit, slope * x_fit + intercept, 'k-', linewidth=2, alpha=0.7, label='Regression')
    ax.plot([0, ax_max], [0, ax_max], 'k--', linewidth=1.5, alpha=0.4, label='1:1 line')

    # Statistics annotation
    if show_stats:
        sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else 'ns'
        stats_text = f'y = {slope:.3f}x + {intercept:.3f}\nR² = {r_squared:.3f} ({sig})\nn = {n_valid}'
        ax.text(0.03, 0.97, stats_text, transform=ax.transAxes, fontsize=10, va='top',
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.9))

    # Formatting
    ax.set_xlim(0, ax_max)
    ax.set_ylim(0, ax_max)
    ax.set_xlabel(x_label, fontsize=12)
    ax.set_ylabel(y_label, fontsize=12)
    ax.set_title(title, fontsize=13, fontweight='bold')
    ax.set_aspect('equal')
    ax.grid(True, alpha=0.3)

    if color_by:
        ax.legend(fontsize=8, loc='lower right')

    results = {
        'slope': slope, 'intercept': intercept, 'r': r, 'r_squared': r_squared,
        'p_value': p, 'se': se, 'n': n_valid
    }

    return fig, results

In [None]:
def plot_source_separated_regressions(df, x_col, y_col, x_label, y_label, 
                                       dominant_col='dominant_source_frac',
                                       method_label='GF (Fraction)'):
    """
    Create panel of regression plots, one per dominant source.

    For every source in the notebook-level ``SOURCE_ORDER`` the rows of ``df``
    whose ``dominant_col`` equals that source are selected, rows missing
    ``x_col`` or ``y_col`` are dropped, and a scatter plot with a
    ``scipy.stats.linregress`` fit and a 1:1 line is drawn in that source's
    panel (3 panels per row; unused panels are hidden). Styling comes from the
    notebook-level ``SOURCE_CATEGORIES``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``x_col``, ``y_col`` and ``dominant_col``.
    x_col, y_col : str
        Columns regressed against each other (y on x).
    x_label, y_label : str
        Axis labels, also used in the figure title.
    dominant_col : str
        Column holding each sample's dominant source key.
    method_label : str
        Name of the dominance method, shown in the figure title.

    Returns
    -------
    (fig, results_all)
        ``results_all`` maps source key -> dict with ``slope``, ``intercept``,
        ``r``, ``r_squared``, ``p_value``, ``se`` and ``n``. Sources with fewer
        than 3 complete pairs, or whose x values are all identical, get a
        placeholder panel (showing the number of complete pairs) and no entry.
    """
    n_sources = len(SOURCE_ORDER)
    n_cols = 3
    n_rows = max(1, int(np.ceil(n_sources / n_cols)))

    # squeeze=False always returns a 2-D array of Axes, so flattening is valid
    # for any number of sources (including a single one).
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(6*n_cols, 5.5*n_rows), squeeze=False)
    axes = axes.flatten()

    results_all = {}

    for idx, source in enumerate(SOURCE_ORDER):
        ax = axes[idx]
        info = SOURCE_CATEGORIES[source]

        # Complete (x, y) pairs for samples dominated by this source
        valid = df.loc[df[dominant_col] == source, [x_col, y_col]].dropna()
        n_valid = len(valid)
        x = valid[x_col].values
        y = valid[y_col].values

        if n_valid < 3:
            problem = 'Insufficient data'
        elif x.min() == x.max():
            # A constant predictor has no defined slope; linregress cannot fit it
            problem = 'No regression possible\n(all x values identical)'
        else:
            problem = None

        if problem is not None:
            ax.text(0.5, 0.5, f'{info["label"]}\n(n={n_valid})\n{problem}',
                   transform=ax.transAxes, ha='center', va='center', fontsize=11)
            ax.set_title(info['label'], fontsize=11, fontweight='bold',
                        color=info['color'])
            ax.grid(True, alpha=0.3)
            continue

        # Scatter
        ax.scatter(x, y, s=50, alpha=0.6, color=info['color'],
                  marker=info['marker'], edgecolors='black', linewidth=0.3)

        # Regression
        slope, intercept, r, p, se = stats.linregress(x, y)

        # Both axes share one upper limit (10% headroom) so the 1:1 line is a true diagonal
        ax_max = max(x.max(), y.max()) * 1.1
        x_fit = np.linspace(0, ax_max, 100)
        ax.plot(x_fit, slope * x_fit + intercept, 'k-', linewidth=1.5, alpha=0.7)
        ax.plot([0, ax_max], [0, ax_max], 'k--', linewidth=1, alpha=0.3)

        # Stats
        sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else 'ns'
        ax.text(0.03, 0.97, f'y = {slope:.3f}x + {intercept:.2f}\nR² = {r**2:.3f} ({sig})\nn = {n_valid}',
                transform=ax.transAxes, fontsize=9, va='top',
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.9))

        ax.set_xlim(0, ax_max)
        ax.set_ylim(0, ax_max)
        ax.set_xlabel(x_label, fontsize=10)
        ax.set_ylabel(y_label, fontsize=10)
        ax.set_title(info['label'], fontsize=11, fontweight='bold',
                    color=info['color'])
        ax.set_aspect('equal')
        ax.grid(True, alpha=0.3)

        results_all[source] = {
            'slope': slope, 'intercept': intercept, 'r': r, 'r_squared': r**2,
            'p_value': p, 'se': se, 'n': n_valid
        }

    # Hide unused axes
    for idx in range(n_sources, len(axes)):
        axes[idx].set_visible(False)

    # Address the figure explicitly rather than whichever figure pyplot considers current
    fig.suptitle(f'{y_label} vs {x_label} — By Dominant Source ({method_label} Method)',
                fontsize=14, fontweight='bold', y=1.02)
    fig.tight_layout()

    return fig, results_all

In [None]:
banner = "=" * 80
print(banner)
print("SOURCE-SEPARATED REGRESSIONS: GF (FRACTION) METHOD — ALL PAIRS")
print(banner)

# file prefix of each method pair -> {source key: regression results}
gf_all_results = {}

for x_col, y_col, x_label, y_label, prefix in METHOD_PAIRS:
    # Strip the unit suffix, e.g. 'FTIR EC (µg/m³)' -> 'FTIR EC'
    pair_label = f'{y_label.split(" (")[0]} vs {x_label.split(" (")[0]}'
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"  {pair_label} — GF (Fraction) Method")
    print(f"{rule}")

    fig, source_results = plot_source_separated_regressions(
        df, x_col, y_col, x_label, y_label,
        dominant_col='dominant_source_frac',
        method_label='GF (Fraction)'
    )
    figure_path = os.path.join(dirs['plots'], f'{prefix}_by_source_GF_method.png')
    fig.savefig(figure_path, dpi=150, bbox_inches='tight')
    plt.show()

    gf_all_results[prefix] = source_results

    # Summary table: one row per source that produced a fit; '*' marks p < 0.05
    print(f"\n{'Source':<20s} {'n':>5s} {'Slope':>8s} {'Intercept':>10s} {'R²':>8s} {'p-value':>12s}")
    print("-" * 70)
    for source in SOURCE_ORDER:
        r = source_results.get(source)
        if r is None:
            continue
        sig = '*' if r['p_value'] < 0.05 else ''
        print(f"{SOURCE_CATEGORIES[source]['label']:<20s} {r['n']:>5d} {r['slope']:>8.3f} "
              f"{r['intercept']:>10.3f} {r['r_squared']:>8.3f} {r['p_value']:>11.2e}{sig}")

---

# Source-Separated Regressions: K_F (Concentration) Method — All Method Pairs

In [None]:
banner = "=" * 80
print(banner)
print("SOURCE-SEPARATED REGRESSIONS: K_F (CONCENTRATION) METHOD — ALL PAIRS")
print(banner)

# file prefix of each method pair -> {source key: regression results}
kf_all_results = {}

for x_col, y_col, x_label, y_label, prefix in METHOD_PAIRS:
    # Strip the unit suffix, e.g. 'FTIR EC (µg/m³)' -> 'FTIR EC'
    pair_label = f'{y_label.split(" (")[0]} vs {x_label.split(" (")[0]}'
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"  {pair_label} — K_F (Concentration) Method")
    print(f"{rule}")

    fig, source_results = plot_source_separated_regressions(
        df, x_col, y_col, x_label, y_label,
        dominant_col='dominant_source_conc',
        method_label='K_F (Concentration)'
    )
    figure_path = os.path.join(dirs['plots'], f'{prefix}_by_source_KF_method.png')
    fig.savefig(figure_path, dpi=150, bbox_inches='tight')
    plt.show()

    kf_all_results[prefix] = source_results

    # Summary table: one row per source that produced a fit; '*' marks p < 0.05
    print(f"\n{'Source':<20s} {'n':>5s} {'Slope':>8s} {'Intercept':>10s} {'R²':>8s} {'p-value':>12s}")
    print("-" * 70)
    for source in SOURCE_ORDER:
        r = source_results.get(source)
        if r is None:
            continue
        sig = '*' if r['p_value'] < 0.05 else ''
        print(f"{SOURCE_CATEGORIES[source]['label']:<20s} {r['n']:>5d} {r['slope']:>8.3f} "
              f"{r['intercept']:>10.3f} {r['r_squared']:>8.3f} {r['p_value']:>11.2e}{sig}")

---

# Side-by-Side Comparison: GF vs K_F for All Method Pairs

In [None]:
# =============================================================================
# Create side-by-side comparison plots for ALL method pairs
# =============================================================================
# One figure per method pair: top row = GF assignment, bottom row = K_F assignment,
# one column per source.
method_rows = (
    ('dominant_source_frac', 'GF Method'),
    ('dominant_source_conc', 'K_F Method'),
)

for x_col, y_col, x_label, y_label, prefix in METHOD_PAIRS:
    pair_label = f'{y_label.split(" (")[0]} vs {x_label.split(" (")[0]}'

    n_sources = len(SOURCE_ORDER)
    fig, axes = plt.subplots(2, n_sources, figsize=(5*n_sources, 10))

    for col_idx, source in enumerate(SOURCE_ORDER):
        source_color = SOURCE_CATEGORIES[source]['color']

        for row_idx, (method_col, method_name) in enumerate(method_rows):
            ax = axes[row_idx, col_idx]
            panel_title = f'{SOURCE_CATEGORIES[source]["label"][:12]}\n({method_name})'

            # Complete (x, y) pairs for samples this method assigns to the source
            valid = df.loc[df[method_col] == source, [x_col, y_col]].dropna()
            n_valid = len(valid)

            if n_valid < 3:
                # Placeholder panel: too few points for a regression
                ax.text(0.5, 0.5, f'n={n_valid}\nInsufficient',
                       transform=ax.transAxes, ha='center', va='center', fontsize=10)
                ax.set_title(panel_title, fontsize=9)
                ax.grid(True, alpha=0.3)
                continue

            x = valid[x_col].values
            y = valid[y_col].values

            ax.scatter(x, y, s=30, alpha=0.6, color=source_color,
                      edgecolors='black', linewidth=0.2)

            # Least-squares fit plus 1:1 reference on a shared 0..ax_max range
            slope, intercept, r, p, se = stats.linregress(x, y)
            ax_max = max(x.max(), y.max()) * 1.1
            x_fit = np.linspace(0, ax_max, 50)
            ax.plot(x_fit, slope * x_fit + intercept, 'k-', linewidth=1.5, alpha=0.7)
            ax.plot([0, ax_max], [0, ax_max], 'k--', linewidth=1, alpha=0.3)

            ax.text(0.03, 0.97, f'R²={r**2:.3f}\nn={n_valid}',
                    transform=ax.transAxes, fontsize=8, va='top',
                    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

            ax.set_xlim(0, ax_max)
            ax.set_ylim(0, ax_max)
            ax.set_aspect('equal')
            ax.grid(True, alpha=0.3)

            # Axis labels only on the outer edge of the grid
            if col_idx == 0:
                ax.set_ylabel(y_label, fontsize=9)
            if row_idx == 1:
                ax.set_xlabel(x_label, fontsize=9)

            ax.set_title(panel_title, fontsize=9, color=source_color)

    fig.suptitle(f'{pair_label}: Comparison of Dominant Source Methods', fontsize=14, fontweight='bold', y=1.02)
    fig.tight_layout()
    fig.savefig(os.path.join(dirs['plots'], f'method_comparison_{prefix}.png'), dpi=150, bbox_inches='tight')
    plt.show()

---

# Summary and Export

In [None]:
# =============================================================================
# Summary comparison table — ALL method pairs
# =============================================================================
def _format_stat_pair(gf_stats, kf_stats, key, fmt):
    """Format one statistic as 'GF / K_F'.

    Each side is formatted independently, so a fit that exists for only one
    method is still shown (the other side reads 'N/A'). Returns plain 'N/A'
    when neither method produced a fit.
    """
    if not gf_stats and not kf_stats:
        return "N/A"
    sides = []
    for method_stats in (gf_stats, kf_stats):
        value = method_stats.get(key)
        sides.append("N/A" if value is None else format(value, fmt))
    return " / ".join(sides)


print("\n" + "=" * 100)
print("SUMMARY: GF vs K_F METHOD COMPARISON — ALL METHOD PAIRS")
print("=" * 100)

for x_col, y_col, x_label, y_label, prefix in METHOD_PAIRS:
    pair_label = f'{y_label.split(" (")[0]} vs {x_label.split(" (")[0]}'
    # Per-source regression results collected by the two regression cells above
    gf_r = gf_all_results.get(prefix, {})
    kf_r = kf_all_results.get(prefix, {})
    
    print(f"\n--- {pair_label} ---")
    print("{:<20s} | {:^20s} | {:^20s} | {:^20s}".format(
        'Source', 'n (GF / K_F)', 'R² (GF / K_F)', 'Slope (GF / K_F)'))
    print("-" * 90)
    
    for source in SOURCE_ORDER:
        # Empty dict = no regression was fitted for this source under that method
        gf_s = gf_r.get(source, {})
        kf_s = kf_r.get(source, {})
        
        n_str = f"{gf_s.get('n', 'N/A'):>4} / {kf_s.get('n', 'N/A'):<4}" if gf_s or kf_s else "N/A"
        r2_str = _format_stat_pair(gf_s, kf_s, 'r_squared', '.3f')
        slope_str = _format_stat_pair(gf_s, kf_s, 'slope', '.3f')
        
        print("{:<20s} | {:^20s} | {:^20s} | {:^20s}".format(
            SOURCE_CATEGORIES[source]['label'][:20], n_str, r2_str, slope_str))

# =============================================================================
# Export
# =============================================================================

# Save merged dataset with both dominant source columns
output_path = os.path.join(dirs['data'], 'bc_ec_dual_dominant_source.csv')
df.to_csv(output_path, index=False)
print(f"\nSaved merged dataset to: {output_path}")

# Save comparison results for all method pairs:
# one record per (method pair, source); a statistic is None where no fit exists
all_comparison_data = []
for x_col, y_col, x_label, y_label, prefix in METHOD_PAIRS:
    pair_label = f'{y_label.split(" (")[0]} vs {x_label.split(" (")[0]}'
    gf_r = gf_all_results.get(prefix, {})
    kf_r = kf_all_results.get(prefix, {})

    for source in SOURCE_ORDER:
        gf_s, kf_s = gf_r.get(source, {}), kf_r.get(source, {})
        record = {
            'Method_Pair': pair_label,
            'Prefix': prefix,
            'Source': SOURCE_CATEGORIES[source]['label'],
            'n_GF': gf_s.get('n'),
            'n_KF': kf_s.get('n'),
            'R2_GF': gf_s.get('r_squared'),
            'R2_KF': kf_s.get('r_squared'),
            'Slope_GF': gf_s.get('slope'),
            'Slope_KF': kf_s.get('slope'),
            'pvalue_GF': gf_s.get('p_value'),
            'pvalue_KF': kf_s.get('p_value'),
        }
        all_comparison_data.append(record)

comparison_df = pd.DataFrame(all_comparison_data)
comparison_path = os.path.join(dirs['data'], 'dominant_source_method_comparison.csv')
comparison_df.to_csv(comparison_path, index=False)
print(f"Saved comparison results to: {comparison_path}")

closing_rule = "=" * 80
print("\n" + closing_rule)
print("ANALYSIS COMPLETE")
print(closing_rule)
print(f"\nPlots saved to: {dirs['plots']}")
print(f"Data saved to: {dirs['data']}")