# ETAD Factor Contributions Analysis - Ethiopia (Addis Ababa)

This notebook matches PMF factor contributions from the ETAD Factor Contributions CSV
to the unified filter dataset for Ethiopia, using date-based matching.

## PMF Source Fractions (Naveed's Analysis):
| Factor | Source | GF Column (daily fraction) | K_F Column (daily ug/m3) |
|--------|--------|---------------------------|-------------------------|
| F1 | Sea Salt Mixed | GF1 | K_F1 |
| F2 | Wood Burning | GF2 | K_F2 |
| F3 | Charcoal | GF3 | K_F3 |
| F4 | Polluted Marine | GF4 | K_F4 |
| F5 | Fossil Fuel Combustion | GF5 | K_F5 |

## Analysis Steps:
1. **Load ETAD factor contributions** and inspect the data
2. **Load filter data** and restrict it to the ETAD site
3. **Assess filter-sample coverage** - all ETAD samples vs the PMF-analyzed subset
4. **Match by date** using the standard `match_etad_factors` function
5. **Explore source contributions** across matched filter parameters
6. **Visualize source trends** over time and against key parameters

---

## Setup and Imports

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Add scripts folder to path.
# A notebook kernel does not define __file__; the previous expression passed the
# *string* '__file__' to abspath, which simply resolved to the kernel's working
# directory. Say so explicitly instead of implying the notebook's own location.
notebook_dir = os.getcwd()
scripts_path = os.path.join(notebook_dir, 'scripts')
if scripts_path not in sys.path:
    # Prepend so the project's modules win over same-named installed packages
    sys.path.insert(0, scripts_path)

# Core imports from modular scripts
from config import SITES, MAC_VALUE, ETAD_FACTOR_CONTRIBUTIONS_PATH
from data_matching import (
    load_aethalometer_data,
    load_filter_data,
    add_base_filter_id,
    match_all_parameters,
    pivot_filter_by_id,
    load_etad_factor_contributions,
    match_etad_factors,
    ETAD_PMF_SOURCE_NAMES,
    ETAD_FACTOR_RENAME
)
from outliers import (
    EXCLUDED_SAMPLES,
    apply_exclusion_flags,
    get_clean_data
)

# Configure matplotlib: shared style sheet, inline rendering, default figure size and font
plt.style.use('seaborn-v0_8-darkgrid')
%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 11

# Highlight colour used for ETAD / PMF-analyzed samples in the plots below
ETAD_COLOR = '#d62728'

# Consistent source colors across all plots, keyed by PMF source name.
# NOTE: 'Fossil Fuel Combustion' has the same hex value as ETAD_COLOR.
SOURCE_COLORS = {
    'Sea Salt Mixed':          '#1f77b4',
    'Wood Burning':            '#ff7f0e',
    'Charcoal':                '#2ca02c',
    'Polluted Marine':         '#9467bd',
    'Fossil Fuel Combustion':  '#d62728',
}

def col_to_source(col):
    """Return the PMF source name embedded in a column label.

    The label (e.g. 'GF1 (Sea Salt Mixed)') is checked against every name in
    ETAD_PMF_SOURCE_NAMES, in that mapping's order; the first name found as a
    substring wins. If no name occurs in the label, the label itself is returned.
    """
    embedded = (source for source in ETAD_PMF_SOURCE_NAMES.values() if source in col)
    return next(embedded, col)

# Startup report: confirm the imports, echo the site config and the factor -> source mapping
print("Imports successful!")
print(f"ETAD config: {SITES['Addis_Ababa']}")
print("\nPMF Source Fractions:")
for factor_number, source_label in ETAD_PMF_SOURCE_NAMES.items():
    print(f"  F{factor_number}: {source_label}")

## Load Data

In [None]:
# Load ETAD factor contributions (columns auto-renamed to source names)
# NOTE(review): the renaming happens inside load_etad_factor_contributions, which is
# not visible here; later cells rely on a 'date' column plus 'GF*' / 'K_F*' columns.
factors_df = load_etad_factor_contributions()
print(f"\nColumns: {list(factors_df.columns)}")
print(f"\nFirst 5 rows:")
# Bare last expression, so Jupyter renders the frame as a rich table
factors_df.head()

In [None]:
# Summary statistics for source contributions
banner = "=" * 80
print(banner)
print("SOURCE CONTRIBUTION SUMMARY STATISTICS")
print(banner)

# Split the factor columns by prefix; both lists are reused by later cells
gf_cols = [label for label in factors_df.columns if label.startswith('GF')]
kf_cols = [label for label in factors_df.columns if label.startswith('K_F')]

# One describe() table per column family, rounded to 4 decimals
for heading, family in (
    ("\n--- Source Contribution Fractions (GF) ---", gf_cols),
    ("\n--- Source Concentrations (ug/m3) ---", kf_cols),
):
    print(heading)
    print(factors_df[family].describe().round(4))

In [None]:
# Load aethalometer data (all sites)
# The result is used as a mapping keyed by site name: .keys() is listed here and
# the matching cell looks up `aethalometer_data[site_name]`.
aethalometer_data = load_aethalometer_data()
print(f"\nLoaded aethalometer data for: {list(aethalometer_data.keys())}")

In [None]:
# Load filter data
# Loaded in long format (one row per measurement, with a 'Parameter' column), then
# passed through add_base_filter_id.
# NOTE(review): presumably this adds the 'base_filter_id' used by the pivot later — confirm in data_matching.
filter_data = load_filter_data()
filter_data = add_base_filter_id(filter_data)
print(f"Filter data: {len(filter_data)} records")
print(f"Parameters: {filter_data['Parameter'].unique()}")

In [None]:
# Match all parameters for Addis Ababa (ETAD)
site_name = 'Addis_Ababa'
site_code = SITES[site_name]['code']  # 'ETAD'

# Always bind etad_matched so later cells can test `is None` instead of failing
# with a NameError when the site has no aethalometer data.
etad_matched = None

if site_name in aethalometer_data:
    df_aeth = aethalometer_data[site_name]
    etad_matched = match_all_parameters(
        site_name, site_code, df_aeth, filter_data
    )

    if etad_matched is not None:
        # Add derived ratio columns. A zero denominator is mapped to NaN first so a
        # ratio is reported as missing rather than +/-inf, which would otherwise
        # distort any mean or correlation computed on it.
        if 'iron' in etad_matched.columns and 'ftir_ec' in etad_matched.columns:
            etad_matched['iron_ec_ratio'] = (
                etad_matched['iron'] / etad_matched['ftir_ec'].replace(0, np.nan)
            )
        if 'iron' in etad_matched.columns and 'ir_bcc' in etad_matched.columns:
            etad_matched['iron_aeth_ratio'] = (
                etad_matched['iron'] / etad_matched['ir_bcc'].replace(0, np.nan)
            )

        # Report only the parameters that are present and have at least one value
        available = [col for col in ['ir_bcc', 'hips_fabs', 'ftir_ec', 'iron', 'iron_ec_ratio']
                     if col in etad_matched.columns and etad_matched[col].notna().any()]
        print(f"\n{site_name}: {len(etad_matched)} matched samples")
        print(f"Available parameters: {', '.join(available)}")
        print(f"Date range: {etad_matched['date'].min().date()} to {etad_matched['date'].max().date()}")
    else:
        print(f"{site_name}: No matched data")
else:
    print(f"{site_name}: No aethalometer data")

---

# ETAD Filter Sample Coverage vs PMF Analysis

**Goal**: See all filter samples collected in Ethiopia (ETAD), identify which ones
Naveed has run PMF on, and visualize the coverage on a timeline and cross plot.

In [None]:
# Get ALL ETAD filter samples from the unified filter dataset
banner = "=" * 80
print(banner)
print("ALL ETAD FILTER SAMPLES")
print(banner)

# Long-format rows (one per measurement) that belong to the ETAD site
site_mask = filter_data['Site'] == site_code
etad_all_filters = filter_data[site_mask].copy()
sample_dates = etad_all_filters['SampleDate']
print(f"\nTotal ETAD filter measurements: {len(etad_all_filters)}")
print(f"Parameters measured: {sorted(etad_all_filters['Parameter'].unique())}")
print(f"Date range: {sample_dates.min().date()} to {sample_dates.max().date()}")

# Pivot so each row = one filter sample with all its parameters
etad_pivoted = pivot_filter_by_id(filter_data, site_code)
n_filters = len(etad_pivoted)

print(f"\nUnique filter samples (by base_filter_id): {n_filters}")
print(f"Date range: {etad_pivoted['date'].min().date()} to {etad_pivoted['date'].max().date()}")
print(f"\nColumns available per filter:")
# Everything except the identifier/date columns is a measured parameter
measurement_cols = [label for label in etad_pivoted.columns
                    if label not in ('base_filter_id', 'date')]
for label in measurement_cols:
    filled = etad_pivoted[label].notna().sum()
    print(f"  {label}: {filled} samples ({100*filled/n_filters:.0f}%)")

etad_pivoted.head()

In [None]:
# Identify which filter samples have PMF factor contributions from Naveed
print("=" * 80)
print("PMF COVERAGE: WHICH SAMPLES DID NAVEED ANALYZE?")
print("=" * 80)

# Drop missing dates up front: a NaT can never fall within the tolerance of a
# filter date, so it would be listed as "unmatched" below, where NaT.date() raises.
pmf_dates = set(factors_df['date'].dropna())
tolerance = pd.Timedelta(days=1)

def has_pmf(filter_date):
    """Return True if any PMF date lies within `tolerance` of `filter_date`.

    Reads the cell-level `pmf_dates` and `tolerance`. A missing (NaT) filter
    date yields False, because comparisons involving NaT are never true.
    """
    return any(abs(filter_date - pmf_date) <= tolerance for pmf_date in pmf_dates)

def _pct(part, whole):
    """Percentage of `part` in `whole`; 0.0 when `whole` is zero (no samples)."""
    return 100 * part / whole if whole else 0.0

etad_pivoted['has_pmf'] = etad_pivoted['date'].apply(has_pmf)

n_total = len(etad_pivoted)
n_pmf = int(etad_pivoted['has_pmf'].sum())
n_no_pmf = n_total - n_pmf

print(f"\nTotal ETAD filter samples:      {n_total}")
print(f"With PMF analysis (Naveed):      {n_pmf} ({_pct(n_pmf, n_total):.1f}%)")
print(f"Without PMF analysis:            {n_no_pmf} ({_pct(n_no_pmf, n_total):.1f}%)")

print(f"\nPMF factor contributions file:   {len(factors_df)} dates")
print(f"PMF date range:                  {factors_df['date'].min().date()} to {factors_df['date'].max().date()}")
print(f"All filter date range:           {etad_pivoted['date'].min().date()} to {etad_pivoted['date'].max().date()}")

# Reverse check: PMF dates that have no filter sample within the tolerance
filter_dates_set = set(etad_pivoted['date'])
pmf_unmatched = [
    pmf_date for pmf_date in sorted(pmf_dates)
    if not any(abs(pmf_date - fd) <= tolerance for fd in filter_dates_set)
]

if pmf_unmatched:
    print(f"\nPMF dates with no filter match ({len(pmf_unmatched)}):")
    for d in pmf_unmatched:
        print(f"  {d.date()}")
else:
    print(f"\nAll PMF dates matched to a filter sample.")

In [None]:
# Timeline: all ETAD filter samples, highlighting PMF-analyzed ones
banner = "=" * 80
print(banner)
print("TIMELINE: ALL ETAD SAMPLES vs PMF-ANALYZED")
print(banner)

fig, ax = plt.subplots(figsize=(16, 5))

pmf_flag = etad_pivoted['has_pmf']
no_pmf = etad_pivoted[~pmf_flag]
with_pmf = etad_pivoted[pmf_flag]

# One row of tick marks at y=1: grey for unanalyzed samples, then the PMF
# samples drawn on top (higher zorder) in the ETAD highlight colour.
for subset, tick_style in (
    (no_pmf, dict(linewidths=1.5, color='#bbbbbb',
                  label=f'No PMF (n={len(no_pmf)})', zorder=2)),
    (with_pmf, dict(linewidths=2, color=ETAD_COLOR,
                    label=f'PMF analyzed (n={len(with_pmf)})', zorder=3)),
):
    ax.scatter(subset['date'], [1] * len(subset), marker='|', s=200, **tick_style)

# Shade the span covered by the PMF factor file
pmf_min, pmf_max = factors_df['date'].min(), factors_df['date'].max()
ax.axvspan(pmf_min, pmf_max, alpha=0.08, color=ETAD_COLOR, label='PMF date range')

ax.set(
    yticks=[],
    xlabel='Sample Date',
    title=f'ETAD Filter Samples: All ({n_total}) vs PMF-Analyzed ({n_pmf})',
)
ax.legend(loc='upper left', fontsize=10)
ax.grid(True, axis='x', alpha=0.3)

fig.autofmt_xdate()
plt.tight_layout()
plt.show()

In [None]:
# Multi-row timeline: one row per parameter showing coverage
banner = "=" * 80
print(banner)
print("TIMELINE BY PARAMETER")
print(banner)

# Every pivoted column except the bookkeeping ones is a measured parameter
non_parameter_cols = ('base_filter_id', 'date', 'has_pmf')
param_cols = [label for label in etad_pivoted.columns if label not in non_parameter_cols]

fig, ax = plt.subplots(figsize=(16, 1.2 + 0.8 * len(param_cols)))

for row, param in enumerate(param_cols):
    # Samples that actually have a value for this parameter
    measured = etad_pivoted[etad_pivoted[param].notna()]
    analysed = measured['has_pmf']

    # Grey ticks first, PMF-analyzed ticks on top
    for subset, width, colour, layer in (
        (measured[~analysed], 1.2, '#bbbbbb', 2),
        (measured[analysed], 1.5, ETAD_COLOR, 3),
    ):
        ax.scatter(subset['date'], [row] * len(subset),
                   marker='|', s=120, linewidths=width, color=colour, zorder=layer)

# Empty scatters exist only to create one legend entry per category
for colour, legend_text in (('#bbbbbb', 'No PMF'), (ETAD_COLOR, 'PMF analyzed')):
    ax.scatter([], [], marker='|', s=120, linewidths=1.5, color=colour, label=legend_text)

ax.set_yticks(list(range(len(param_cols))))
ax.set_yticklabels(list(param_cols), fontsize=9)
ax.set_xlabel('Sample Date')
ax.set_title('ETAD: Parameter Coverage Timeline (All Filters vs PMF-Analyzed)')
ax.legend(loc='upper left', fontsize=9)
ax.grid(True, axis='x', alpha=0.3)
ax.invert_yaxis()  # first parameter at the top

fig.autofmt_xdate()
plt.tight_layout()
plt.show()

In [None]:
# Cross plot: FTIR EC vs HIPS Fabs/MAC - PMF vs non-PMF samples
print("=" * 80)
print("CROSS PLOT: FTIR EC vs HIPS Fabs/MAC (PMF vs NON-PMF)")
print("=" * 80)


def _fit_or_none(x, y, min_points=3):
    """Least-squares fit of y on x, or None when no meaningful fit exists.

    scipy.stats.linregress raises ValueError when all x values are identical and
    cannot fit fewer than two points, so both cases are screened out here. The
    same three-point minimum is applied to the all-samples and PMF-only fits.
    """
    if len(x) < min_points or np.ptp(x) == 0:
        return None
    return stats.linregress(x, y)


# Explicit column selection (prefer hips_fabs over hips_tau)
if 'ftir_ec' in etad_pivoted.columns:
    ftir_col = 'ftir_ec'
else:
    # Any name containing 'ec_ftir' also contains 'ftir', so one test covers both
    ftir_col = next((c for c in etad_pivoted.columns if 'ftir' in c.lower()), None)

if 'hips_fabs' in etad_pivoted.columns:
    hips_col = 'hips_fabs'
else:
    hips_col = next((c for c in etad_pivoted.columns if 'fabs' in c.lower()), None)

print(f"FTIR EC column: {ftir_col}")
print(f"HIPS column: {hips_col}")
print(f"MAC value: {MAC_VALUE} m\u00b2/g")

if ftir_col and hips_col:
    # pivot_filter_by_id stores raw HIPS Fabs (Mm^-1);
    # divide by MAC_VALUE to convert to ug/m3 (same units as FTIR EC)
    valid = etad_pivoted[[ftir_col, hips_col, 'has_pmf', 'date']].dropna(subset=[ftir_col, hips_col]).copy()
    valid[hips_col] = valid[hips_col] / MAC_VALUE

    pmf_data = valid[valid['has_pmf']]
    no_pmf_data = valid[~valid['has_pmf']]

    print(f"\nSamples with both FTIR EC and HIPS Fabs: {len(valid)}")
    print(f"  PMF-analyzed:     {len(pmf_data)}")
    print(f"  Not PMF-analyzed: {len(no_pmf_data)}")

    if valid.empty:
        # Without data there is nothing to scatter and .min()/.max() would fail
        print("\nNo samples have both measurements; skipping cross plot.")
    else:
        # --- Build labels ---
        hips_label = 'HIPS Fabs / MAC (\u03bcg/m\u00b3)'
        ftir_label = 'FTIR EC (\u03bcg/m\u00b3)'

        fig, ax = plt.subplots(figsize=(8, 7))

        # Scatter: non-PMF
        if len(no_pmf_data) > 0:
            ax.scatter(no_pmf_data[ftir_col], no_pmf_data[hips_col],
                       c='#cccccc', edgecolors='#999999', linewidths=0.4,
                       s=40, alpha=0.7, label=f'No PMF (n={len(no_pmf_data)})', zorder=2)
        # Scatter: PMF
        if len(pmf_data) > 0:
            ax.scatter(pmf_data[ftir_col], pmf_data[hips_col],
                       c=ETAD_COLOR, edgecolors='k', linewidths=0.4,
                       s=55, alpha=0.85, label=f'PMF analyzed (n={len(pmf_data)})', zorder=3)

        x_all = valid[ftir_col].values
        y_all = valid[hips_col].values
        # Both regression lines are drawn across the full x range of the data
        x_fit = np.linspace(x_all.min(), x_all.max(), 100)
        stats_blocks = []

        # --- Regression: all data ---
        fit_all = _fit_or_none(x_all, y_all)
        if fit_all is not None:
            slope_all, int_all, r_all = fit_all.slope, fit_all.intercept, fit_all.rvalue
            ax.plot(x_fit, slope_all * x_fit + int_all, 'k-', linewidth=1.5, alpha=0.6, zorder=4)
            stats_blocks.append(
                f'All samples (n={len(valid)}):\n'
                f'  y = {slope_all:.3f}x + {int_all:.3f}\n'
                f'  R\u00b2 = {r_all**2:.3f}'
            )

        # --- Regression: PMF only ---
        fit_pmf = _fit_or_none(pmf_data[ftir_col].values, pmf_data[hips_col].values)
        if fit_pmf is not None:
            slope_pmf, int_pmf, r_pmf = fit_pmf.slope, fit_pmf.intercept, fit_pmf.rvalue
            ax.plot(x_fit, slope_pmf * x_fit + int_pmf, '--', color=ETAD_COLOR,
                    linewidth=1.5, alpha=0.8, zorder=4)
            stats_blocks.append(
                f'PMF only (n={len(pmf_data)}):\n'
                f'  y = {slope_pmf:.3f}x + {int_pmf:.3f}\n'
                f'  R\u00b2 = {r_pmf**2:.3f}'
            )

        # --- Annotation box with regression stats (omitted if no fit was possible) ---
        if stats_blocks:
            ax.text(0.03, 0.97, '\n\n'.join(stats_blocks), transform=ax.transAxes,
                    fontsize=9, verticalalignment='top',
                    bbox=dict(boxstyle='round,pad=0.4', facecolor='white',
                              edgecolor='#cccccc', alpha=0.9))

        # --- Axis limits: start at origin, extend to data max + padding ---
        x_max = x_all.max() * 1.05
        y_max = y_all.max() * 1.05
        ax.set_xlim(0, x_max)
        ax.set_ylim(0, y_max)

        ax.set_xlabel(ftir_label, fontsize=12)
        ax.set_ylabel(hips_label, fontsize=12)
        ax.set_title(f'ETAD: {ftir_label} vs {hips_label}\nPMF-Analyzed Samples Highlighted',
                     fontsize=13, fontweight='bold')
        ax.legend(loc='lower right', fontsize=9, framealpha=0.9)
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()
else:
    print("Could not find both FTIR EC and HIPS Fabs columns in pivoted data.")

In [None]:
# Summary table: PMF vs non-PMF sample statistics
banner = "=" * 80
print(banner)
print("PMF vs NON-PMF SAMPLE COMPARISON")
print(banner)

bookkeeping_cols = ('base_filter_id', 'date', 'has_pmf')
numeric_cols = [label for label in etad_pivoted.columns if label not in bookkeeping_cols]

pmf_flag = etad_pivoted['has_pmf']
pmf_subset = etad_pivoted[pmf_flag]
non_pmf_subset = etad_pivoted[~pmf_flag]

print(f"\n{'Parameter':<25s} {'PMF mean':>10s} {'PMF n':>6s} {'Non-PMF mean':>13s} {'Non-PMF n':>10s}")
print("-" * 70)
for col in numeric_cols:
    # (formatted mean, count) for the PMF group and then the non-PMF group;
    # "--" stands in for the mean when a group has no values for this column
    cells = []
    for group in (pmf_subset, non_pmf_subset):
        present = group[col].dropna()
        mean_text = f"{present.mean():.4f}" if len(present) > 0 else "--"
        cells.append((mean_text, len(present)))
    (pmf_mean, pmf_count), (non_pmf_mean, non_pmf_count) = cells
    print(f"{col:<25s} {pmf_mean:>10s} {pmf_count:>6d} {non_pmf_mean:>13s} {non_pmf_count:>10d}")

---

# Match Source Contributions to Filter Data by Date

**Goal**: Merge the ETAD PMF source contributions with the matched filter/aethalometer data
using the `match_etad_factors` function (date tolerance: +/- 1 day).

In [None]:
# Match ETAD source contributions to the multi-parameter matched dataset
print("=" * 80)
print("DATE MATCHING: ETAD SOURCES <-> FILTER/AETH DATA")
print("=" * 80)

# The parameter-matching step can leave etad_matched as None; there is nothing to
# match factors against in that case, so skip the call instead of passing None on.
if etad_matched is None:
    print("\nNo matched filter/aethalometer data available; skipping factor matching.")
    etad_with_factors = None
else:
    etad_with_factors = match_etad_factors(
        etad_matched,
        target_date_col='date',
        date_tolerance_days=1
    )

if etad_with_factors is not None:
    print(f"\nResulting columns: {list(etad_with_factors.columns)}")
    print(f"\nFirst 10 rows:")
    display_cols = ['date', 'ir_bcc', 'ftir_ec', 'hips_fabs', 'iron'] + gf_cols[:3]
    display_cols = [c for c in display_cols if c in etad_with_factors.columns]
    # Jupyter auto-renders only a cell's final *top-level* expression. Inside an
    # `if` body the frame must be passed to display() (available by default in
    # IPython kernels) or the preview is silently discarded.
    display(etad_with_factors[display_cols].head(10))

In [None]:
# Date matching diagnostics
banner = "=" * 80
print(banner)
print("DATE MATCHING DIAGNOSTICS")
print(banner)

if etad_with_factors is not None and 'factor_date' in etad_with_factors.columns:
    # Signed offset in whole days between the matched PMF date and the filter date
    date_diffs = (etad_with_factors['factor_date'] - etad_with_factors['date']).dt.days
    n_exact = (date_diffs == 0).sum()
    n_off_by_one = (date_diffs.abs() == 1).sum()
    print(f"\nDate difference (factor_date - date):")
    print(f"  Exact matches (0 days): {n_exact}")
    print(f"  Off by 1 day:           {n_off_by_one}")
    print(f"  Mean offset:            {date_diffs.mean():.2f} days")

    # Filter dates that are absent from the factor-matched result
    matched_dates = set(etad_with_factors['date'])
    all_dates = set(etad_matched['date'])
    unmatched = sorted(all_dates - matched_dates)
    print(f"\nUnmatched filter dates ({len(unmatched)} of {len(all_dates)}):")
    # List at most ten, then summarize the remainder
    n_hidden = len(unmatched) - 10
    for day in unmatched[:10]:
        print(f"  {day.date()}")
    if n_hidden > 0:
        print(f"  ... and {n_hidden} more")

---

# Source Contributions Over Time

**Goal**: Visualize how the 5 PMF sources evolve across the sampling period.
- **GF columns**: daily source contribution fractions (sum to ~1)
- **K_F columns**: daily source concentrations in ug/m3

In [None]:
# Time series of source contributions
banner = "=" * 80
print(banner)
print("SOURCE CONTRIBUTIONS OVER TIME")
print(banner)

fig, axes = plt.subplots(2, 1, figsize=(14, 10), sharex=True)

# Top panel: contribution fractions (GF); bottom panel: concentrations (K_F).
# Both panels are drawn the same way, so they share one loop.
panels = (
    (axes[0], gf_cols, 'Source Contribution Fraction',
     'ETAD: Daily Source Contribution Fractions'),
    (axes[1], kf_cols, 'Source Concentration (ug/m3)',
     'ETAD: Daily Source Concentrations (ug/m3)'),
)
for panel, columns, y_text, title_text in panels:
    for col in columns:
        source = col_to_source(col)
        panel.plot(factors_df['date'], factors_df[col], marker='o', markersize=3,
                   label=source, color=SOURCE_COLORS.get(source), alpha=0.7)
    panel.set_ylabel(y_text)
    panel.set_title(title_text)
    panel.legend()
    panel.grid(True, alpha=0.3)

# The x axis is shared, so only the bottom panel is labelled
axes[1].set_xlabel('Date')

plt.tight_layout()
plt.show()

In [None]:
# Stacked area plot of source contribution fractions
fig, ax = plt.subplots(figsize=(14, 6))

dates = factors_df['date']
# stackplot expects one row per series, hence the transpose
gf_data = factors_df[gf_cols].to_numpy().T
source_labels = list(map(col_to_source, gf_cols))
# Grey fallback for any label that is missing from SOURCE_COLORS
source_clrs = [SOURCE_COLORS.get(label, '#999') for label in source_labels]

ax.stackplot(dates, gf_data, labels=source_labels, colors=source_clrs, alpha=0.7)
ax.set(
    ylabel='Source Contribution Fraction (stacked)',
    xlabel='Date',
    title='ETAD: Stacked Source Contribution Fractions',
)
ax.legend(loc='upper left')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

---

# Source Concentrations vs Filter Parameters

**Goal**: Examine relationships between PMF source concentrations and measured parameters
(FTIR EC, HIPS Fabs, Aethalometer IR BCc, Iron).

In [None]:
# Correlation matrix between sources and filter parameters
print("=" * 80)
print("CORRELATION: SOURCE CONCENTRATIONS vs FILTER PARAMETERS")
print("=" * 80)

# Readable labels for filter parameters
PARAM_LABELS = {
    'ir_bcc': 'IR BCc (\u03bcg/m\u00b3)',
    'hips_fabs': 'HIPS Fabs/MAC (\u03bcg/m\u00b3)',
    'ftir_ec': 'FTIR EC (\u03bcg/m\u00b3)',
    'iron': 'Iron (\u03bcg/m\u00b3)',
}

if etad_with_factors is not None:
    param_cols = [c for c in ['ir_bcc', 'hips_fabs', 'ftir_ec', 'iron']
                  if c in etad_with_factors.columns]
    factor_cols_all = gf_cols + kf_cols
    factor_cols_present = [c for c in factor_cols_all if c in etad_with_factors.columns]
    corr_cols = param_cols + factor_cols_present

    # Pairwise Pearson correlations (the DataFrame.corr default)
    corr_matrix = etad_with_factors[corr_cols].corr()

    # Build readable tick labels
    short_labels = []
    for c in corr_cols:
        if c in param_cols:
            short_labels.append(PARAM_LABELS.get(c, c))
        else:
            prefix = c.split(' ')[0]  # 'GF1' or 'K_F1'
            source = col_to_source(c)
            short_labels.append(f'{prefix}\n{source}')

    fig, ax = plt.subplots(figsize=(14, 11))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='RdBu_r',
                center=0, vmin=-1, vmax=1, ax=ax,
                square=True, linewidths=0.5,
                xticklabels=short_labels, yticklabels=short_labels)
    ax.set_title('Correlation: PMF Sources vs Filter Parameters (ETAD)')
    plt.tight_layout()
    plt.show()

    # Collect every parameter/factor pair with |r| > 0.3; a NaN correlation fails
    # the comparison and is dropped.
    strong_pairs = []
    for param in param_cols:
        for fc in factor_cols_present:
            r = corr_matrix.loc[param, fc]
            if abs(r) > 0.3:
                strong_pairs.append((param, fc, r))
    # Rank by strength so the heading "Strongest" matches the order printed
    strong_pairs.sort(key=lambda pair: abs(pair[2]), reverse=True)

    print("\nStrongest source-parameter correlations:")
    for param, fc, r in strong_pairs:
        # Each source has both a GF (fraction) and a K_F (concentration) column;
        # print the prefix so the two lines for one source can be told apart.
        factor_label = f"{fc.split(' ')[0]} {col_to_source(fc)}"
        print(f"  {PARAM_LABELS.get(param, param)} vs {factor_label}: r = {r:.3f}")

In [None]:
# Scatter plot helper: source concentration vs filter parameter
def plot_source_vs_param(df, source_col, param_col, param_label):
    """Scatter a PMF source concentration against a filter parameter with an OLS fit.

    Parameters
    ----------
    df : DataFrame
        Must contain `source_col` and `param_col`.
    source_col : str
        Column plotted on x. Its source name (via col_to_source) selects the
        marker colour and is used in the x label and title.
    param_col : str
        Column plotted on y.
    param_label : str
        y-axis label, also used in the title.

    Rows with a missing value in either column are dropped first. No figure is
    produced (a one-line notice is printed instead) when fewer than three
    complete rows remain, or when x is constant: scipy.stats.linregress raises
    ValueError if all x values are identical.

    Returns
    -------
    None
        The figure is shown with plt.show().
    """
    valid = df[[source_col, param_col]].dropna()
    source_name = col_to_source(source_col)
    if len(valid) < 3:
        print(f"Skipping {source_name} vs {param_label}: only {len(valid)} paired samples")
        return

    x = valid[source_col].values
    y = valid[param_col].values
    if np.ptp(x) == 0:
        print(f"Skipping {source_name} vs {param_label}: source concentration is constant")
        return

    slope, intercept, r, p, se = stats.linregress(x, y)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(x, y, alpha=0.6, color=SOURCE_COLORS.get(source_name, ETAD_COLOR),
               edgecolors='k', linewidths=0.3)

    # Fit line is drawn over the observed x range only (no extrapolation)
    x_fit = np.linspace(x.min(), x.max(), 100)
    ax.plot(x_fit, slope * x_fit + intercept, 'k--', alpha=0.7,
            label=f'y = {slope:.2f}x + {intercept:.2f}\nR\u00b2 = {r**2:.3f}, p = {p:.2e}\nn = {len(valid)}')

    # Anchor each axis at the origin when the data reach above zero; otherwise
    # keep matplotlib's autoscaling, since (0, max) with max <= 0 would be an
    # empty or inverted range.
    if x.max() > 0:
        ax.set_xlim(0, x.max() * 1.05)
    if y.max() > 0:
        ax.set_ylim(0, y.max() * 1.05)
    ax.set_xlabel(f'{source_name} (\u03bcg/m\u00b3)')
    ax.set_ylabel(param_label)
    ax.set_title(f'ETAD: {source_name} vs {param_label}')
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

In [None]:
# Source concentrations vs FTIR EC
banner = "=" * 80
print(banner)
print("SOURCE CONCENTRATIONS vs FTIR EC")
print(banner)

if etad_with_factors is not None and 'ftir_ec' in etad_with_factors.columns:
    # One scatter per K_F column that is present in the matched data
    for kf in [label for label in kf_cols if label in etad_with_factors.columns]:
        plot_source_vs_param(etad_with_factors, kf, 'ftir_ec', 'FTIR EC (ug/m3)')

In [None]:
# Source concentrations vs Aethalometer IR BCc
banner = "=" * 80
print(banner)
print("SOURCE CONCENTRATIONS vs AETHALOMETER IR BCc")
print(banner)

if etad_with_factors is not None and 'ir_bcc' in etad_with_factors.columns:
    # One scatter per K_F column that is present in the matched data
    for kf in [label for label in kf_cols if label in etad_with_factors.columns]:
        plot_source_vs_param(etad_with_factors, kf, 'ir_bcc', 'IR BCc (ug/m3)')

In [None]:
# Source concentrations vs HIPS Fabs
banner = "=" * 80
print(banner)
print("SOURCE CONCENTRATIONS vs HIPS Fabs")
print(banner)

# NOTE(review): the axis label says 'HIPS Fabs / MAC', but the 'hips_fabs' column
# is plotted as-is here - confirm match_all_parameters already applied the MAC
# conversion.
if etad_with_factors is not None and 'hips_fabs' in etad_with_factors.columns:
    # One scatter per K_F column that is present in the matched data
    for kf in [label for label in kf_cols if label in etad_with_factors.columns]:
        plot_source_vs_param(etad_with_factors, kf, 'hips_fabs', 'HIPS Fabs / MAC (ug/m3)')

In [None]:
# Source concentrations vs Iron
banner = "=" * 80
print(banner)
print("SOURCE CONCENTRATIONS vs IRON")
print(banner)

if etad_with_factors is not None and 'iron' in etad_with_factors.columns:
    # One scatter per K_F column that is present in the matched data
    for kf in [label for label in kf_cols if label in etad_with_factors.columns]:
        plot_source_vs_param(etad_with_factors, kf, 'iron', 'Iron (ug/m3)')

---

# Dominant Source Analysis

**Goal**: Identify which source dominates on each sampling date and examine
whether the dominant source relates to measurement agreement.

In [None]:
# Identify dominant source per sample
print("=" * 80)
print("DOMINANT SOURCE ANALYSIS")
print("=" * 80)


def _dominant_column(frame, cols):
    """Per-row name of the largest column among `cols`; NaN where all are missing.

    Missing values are filled with -inf before idxmax so they can never win, and
    rows with no value at all are masked back to NaN afterwards. This avoids
    calling idxmax on an all-NaN row, which pandas has deprecated.
    """
    values = frame[cols]
    has_any = values.notna().any(axis=1).to_numpy()
    return values.fillna(-np.inf).idxmax(axis=1).where(has_any)


if etad_with_factors is not None:
    gf_in_matched = [c for c in gf_cols if c in etad_with_factors.columns]
    kf_in_matched = [c for c in kf_cols if c in etad_with_factors.columns]

    if gf_in_matched:
        etad_with_factors['dominant_gf'] = _dominant_column(etad_with_factors, gf_in_matched)
        # na_action='ignore' keeps NaN as NaN; col_to_source does a substring test
        # and would raise TypeError on a float NaN.
        etad_with_factors['dominant_source_frac'] = (
            etad_with_factors['dominant_gf'].map(col_to_source, na_action='ignore')
        )
        print("\nDominant source by contribution fraction:")
        print(etad_with_factors['dominant_source_frac'].value_counts())

    if kf_in_matched:
        etad_with_factors['dominant_kf'] = _dominant_column(etad_with_factors, kf_in_matched)
        etad_with_factors['dominant_source_conc'] = (
            etad_with_factors['dominant_kf'].map(col_to_source, na_action='ignore')
        )
        print("\nDominant source by concentration:")
        print(etad_with_factors['dominant_source_conc'].value_counts())

In [None]:
# Scatter plots colored by dominant source
if etad_with_factors is not None and 'dominant_source_conc' in etad_with_factors.columns:
    # Axis label for each measurement column used in the pairings below
    axis_labels = {
        'ftir_ec': 'FTIR EC (\u03bcg/m\u00b3)',
        'ir_bcc': 'IR BCc (\u03bcg/m\u00b3)',
        'hips_fabs': 'HIPS Fabs / MAC (\u03bcg/m\u00b3)',
    }
    # (x column, y column, x label, y label)
    plot_pairs = [
        (x_name, y_name, axis_labels[x_name], axis_labels[y_name])
        for x_name, y_name in (('ftir_ec', 'ir_bcc'),
                               ('hips_fabs', 'ftir_ec'),
                               ('hips_fabs', 'ir_bcc'))
    ]
    present_cols = set(etad_with_factors.columns)

    for x_col, y_col, xlabel, ylabel in plot_pairs:
        # Skip pairings whose columns are missing or that have too few complete rows
        if not {x_col, y_col} <= present_cols:
            continue
        valid = etad_with_factors[[x_col, y_col, 'dominant_source_conc']].dropna()
        if len(valid) < 3:
            continue

        fig, ax = plt.subplots(figsize=(9, 7))

        # One scatter series per dominant source, in that source's colour
        for source_name, group in valid.groupby('dominant_source_conc'):
            ax.scatter(group[x_col], group[y_col],
                       c=SOURCE_COLORS.get(source_name, '#999999'),
                       label=f'{source_name} (n={len(group)})',
                       alpha=0.7, edgecolors='k', linewidths=0.3, s=50)

        # Axis tops: data max plus 5% padding; the 1:1 line runs to the larger one
        x_top = valid[x_col].max() * 1.05
        y_top = valid[y_col].max() * 1.05
        diag_end = max(x_top, y_top)
        ax.plot([0, diag_end], [0, diag_end], 'k--', alpha=0.3, label='1:1')

        ax.set_xlim(0, x_top)
        ax.set_ylim(0, y_top)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.set_title(f'ETAD: {ylabel} vs {xlabel}\nColored by Dominant Source (concentration)')
        ax.legend(fontsize=8)
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

---

# Summary

**Goal**: Print key findings from the source-matched analysis.

In [None]:
# Summary
banner = "=" * 80
print(banner)
print("SUMMARY: ETAD PMF SOURCE ANALYSIS")
print(banner)

print(f"\nPMF source fractions (Naveed):")
for factor_number, source_label in ETAD_PMF_SOURCE_NAMES.items():
    print(f"  F{factor_number}: {source_label}")

factor_dates = factors_df['date']
print(f"\nFactor contributions file: {len(factors_df)} records")
print(f"  Date range: {factor_dates.min().date()} to {factor_dates.max().date()}")

print(f"\nAll ETAD filter samples: {n_total}")
print(f"  PMF-analyzed (Naveed): {n_pmf} ({100*n_pmf/n_total:.1f}%)")
print(f"  Not PMF-analyzed:      {n_no_pmf} ({100*n_no_pmf/n_total:.1f}%)")

if etad_with_factors is not None:
    n_matched = len(etad_with_factors)
    n_candidates = len(etad_matched)
    print(f"\nMatched to filter/aeth data: {n_matched} records")
    print(f"  Match rate: {n_matched}/{n_candidates} ({100*n_matched/n_candidates:.1f}%)")

    param_cols = [c for c in ['ir_bcc', 'hips_fabs', 'ftir_ec', 'iron']
                  if c in etad_with_factors.columns]
    kf_in_data = [c for c in kf_cols if c in etad_with_factors.columns]

    if param_cols and kf_in_data:
        # Significance markers, checked from the strictest cutoff down
        significance_marks = ((0.001, '***'), (0.01, '**'), (0.05, '*'))
        print("\nTop source-parameter correlations (|r| > 0.3):")
        for param in param_cols:
            for kf in kf_in_data:
                valid = etad_with_factors[[param, kf]].dropna()
                if len(valid) < 5:
                    continue  # too few paired values for a correlation
                r, p = stats.pearsonr(valid[param], valid[kf])
                if not abs(r) > 0.3:
                    continue
                source = col_to_source(kf)
                sig = next((mark for cutoff, mark in significance_marks if p < cutoff), '')
                print(f"  {param:12s} vs {source:25s}: r = {r:+.3f} (p = {p:.2e}) {sig}")

    if 'dominant_source_conc' in etad_with_factors.columns:
        print("\nDominant source breakdown (by concentration):")
        counts = etad_with_factors['dominant_source_conc'].value_counts()
        for source, count in counts.items():
            print(f"  {source}: {count} samples ({100*count/n_matched:.1f}%)")

print("\n" + banner)

---

# Source-Based Threshold & Residual Analysis

**Key Hypothesis** (from meeting notes): *Charcoal/wood burning produces EC with different
optical properties than diesel/fossil fuel EC. If true, samples dominated by biomass burning
should show different FTIR EC vs Aethalometer BC slopes compared to fossil-fuel-dominated samples.*

## Analyses:
1. **Combined biomass metric** — Wood Burning (F2) + Charcoal (F3) fraction
2. **Threshold analysis** — Compare FTIR EC vs Aeth BC slopes at 25%, 50%, 75% biomass
3. **Residual correlation** — (FTIR EC − Aeth BC) vs biomass percentage
4. **Seasonal integration** — Test whether Kiremt season corresponds to high biomass burning

In [None]:
# Combined biomass metric + threshold scatter analysis
print("=" * 80)
print("BIOMASS THRESHOLD ANALYSIS")
print("=" * 80)

if etad_with_factors is not None:
    # Find the GF columns for Wood Burning and Charcoal
    gf_wood = next((c for c in gf_cols if 'Wood Burning' in c), None)
    gf_charcoal = next((c for c in gf_cols if 'Charcoal' in c), None)

    if gf_wood and gf_charcoal:
        etad_with_factors['biomass_pct'] = (
            etad_with_factors[gf_wood] + etad_with_factors[gf_charcoal]
        ) * 100  # fraction -> %

        print(f"Biomass = {col_to_source(gf_wood)} + {col_to_source(gf_charcoal)}")
        print(f"\nBiomass % statistics:")
        print(etad_with_factors['biomass_pct'].describe().round(1))

        # --- Threshold scatter plots: FTIR EC vs Aeth BC ---
        if 'ftir_ec' in etad_with_factors.columns and 'ir_bcc' in etad_with_factors.columns:
            thresholds = [25, 50, 75]
            valid_th = etad_with_factors[['ftir_ec', 'ir_bcc', 'biomass_pct']].dropna()

            fig, axes = plt.subplots(1, len(thresholds), figsize=(6 * len(thresholds), 5.5),
                                     sharey=True)

            for i, thresh in enumerate(thresholds):
                ax = axes[i]
                low = valid_th[valid_th['biomass_pct'] <= thresh]
                high = valid_th[valid_th['biomass_pct'] > thresh]

                x_max = valid_th['ftir_ec'].max() * 1.05
                y_max = valid_th['ir_bcc'].max() * 1.05
                x_fit = np.linspace(0, x_max, 100)

                # Low biomass
                stats_lines = []
                if len(low) >= 3:
                    ax.scatter(low['ftir_ec'], low['ir_bcc'],
                               c='#1f77b4', edgecolors='k', linewidths=0.3,
                               s=40, alpha=0.7, label=f'\u2264{thresh}% (n={len(low)})')
                    sl, il, rl, _, _ = stats.linregress(low['ftir_ec'], low['ir_bcc'])
                    ax.plot(x_fit, sl * x_fit + il, '-', color='#1f77b4', alpha=0.7)
                    stats_lines.append(f'Low: R\u00b2={rl**2:.3f}, slope={sl:.3f}')

                # High biomass
                if len(high) >= 3:
                    ax.scatter(high['ftir_ec'], high['ir_bcc'],
                               c='#d62728', edgecolors='k', linewidths=0.3,
                               s=40, alpha=0.7, label=f'>{thresh}% (n={len(high)})')
                    sh, ih, rh, _, _ = stats.linregress(high['ftir_ec'], high['ir_bcc'])
                    ax.plot(x_fit, sh * x_fit + ih, '-', color='#d62728', alpha=0.7)
                    stats_lines.append(f'High: R\u00b2={rh**2:.3f}, slope={sh:.3f}')

                # 1:1 line
                ax_lim = max(x_max, y_max)
                ax.plot([0, ax_lim], [0, ax_lim], 'k:', alpha=0.3, label='1:1')

                ax.set_xlim(0, x_max)
                ax.set_ylim(0, y_max)
                ax.set_xlabel('FTIR EC (\u03bcg/m\u00b3)')
                if i == 0:
                    ax.set_ylabel('IR BCc (\u03bcg/m\u00b3)')
                ax.set_title(f'Biomass threshold: {thresh}%')

                ax.text(0.03, 0.97, '\n'.join(stats_lines), transform=ax.transAxes,
                        fontsize=8, va='top',
                        bbox=dict(boxstyle='round,pad=0.3', fc='white', ec='#ccc', alpha=0.9))
                ax.legend(fontsize=8, loc='lower right')
                ax.grid(True, alpha=0.3)

            fig.suptitle('FTIR EC vs Aeth IR BCc: Low vs High Biomass Burning',
                         fontsize=13, fontweight='bold', y=1.02)
            plt.tight_layout()
            plt.show()

            # --- Summary table ---
            print(f"\n{'Threshold':<15s} {'Low n':>6s} {'Low R\u00b2':>8s} {'Low slope':>10s} "
                  f"{'High n':>7s} {'High R\u00b2':>8s} {'High slope':>11s}")
            print("-" * 70)
            for thresh in thresholds:
                low = valid_th[valid_th['biomass_pct'] <= thresh]
                high = valid_th[valid_th['biomass_pct'] > thresh]
                low_str = ""
                high_str = ""
                if len(low) >= 3:
                    sl, il, rl, _, _ = stats.linregress(low['ftir_ec'], low['ir_bcc'])
                    low_str = f"{len(low):>6d} {rl**2:>8.3f} {sl:>10.3f}"
                else:
                    low_str = f"{len(low):>6d}      --         --"
                if len(high) >= 3:
                    sh, ih, rh, _, _ = stats.linregress(high['ftir_ec'], high['ir_bcc'])
                    high_str = f"{len(high):>7d} {rh**2:>8.3f} {sh:>11.3f}"
                else:
                    high_str = f"{len(high):>7d}      --          --"
                label = f"\u2264{thresh}% / >{thresh}%"
                print(f"{label:<15s} {low_str} {high_str}")
    else:
        print("Could not find Wood Burning and Charcoal GF columns")

In [None]:
# Residual analysis: (FTIR EC - Aeth BC) vs biomass percentage
#
# Regresses two forms of the FTIR EC / aethalometer BC disagreement against
# the 'biomass_pct' column:
#   * absolute residual:   ftir_ec - ir_bcc
#   * percentage residual: (ftir_ec - ir_bcc) / ftir_ec * 100
# and reports whether the absolute residual trends significantly (p < 0.05).
print("=" * 80)
print("RESIDUAL vs BIOMASS PERCENTAGE")
print("=" * 80)

# Fewest points for which a regression line is fitted.
MIN_FIT_POINTS = 3

if etad_with_factors is not None and 'biomass_pct' in etad_with_factors.columns:
    if 'ftir_ec' in etad_with_factors.columns and 'ir_bcc' in etad_with_factors.columns:
        df_res = etad_with_factors[['ftir_ec', 'ir_bcc', 'biomass_pct']].dropna().copy()
        df_res['residual'] = df_res['ftir_ec'] - df_res['ir_bcc']

        # Mask a zero FTIR EC denominator: dividing by 0 yields +/-inf (or NaN
        # for 0/0), which would turn the percentage regression into NaN.
        ec_nonzero = df_res['ftir_ec'].where(df_res['ftir_ec'] != 0)
        df_res['residual_pct'] = (df_res['residual'] / ec_nonzero) * 100
        df_pct = df_res.dropna(subset=['residual_pct'])

        print("Residual (FTIR EC \u2212 Aeth BC) statistics:")
        print(f"  n = {len(df_res)}")
        print(f"  Mean:   {df_res['residual'].mean():.3f} \u03bcg/m\u00b3")
        print(f"  Median: {df_res['residual'].median():.3f} \u03bcg/m\u00b3")
        print(f"  Std:    {df_res['residual'].std():.3f} \u03bcg/m\u00b3")

        n_masked = len(df_res) - len(df_pct)
        if n_masked:
            print(f"  ({n_masked} sample(s) with FTIR EC = 0 excluded from the percentage residual)")

        # linregress raises ValueError on empty input and when every x value
        # is identical, so check both before fitting.
        abs_fit_ok = (len(df_res) >= MIN_FIT_POINTS
                      and df_res['biomass_pct'].nunique() > 1)

        if not abs_fit_ok:
            print(f"\nNeed at least {MIN_FIT_POINTS} samples with differing biomass % "
                  f"to fit a regression (have n = {len(df_res)}); skipping plots")
        else:
            fig, axes = plt.subplots(1, 2, figsize=(14, 6))

            # Both panels share the same x range and fit grid.
            bm_max = df_res['biomass_pct'].max()
            x_fit = np.linspace(0, bm_max, 100)

            # --- Left: absolute residual ---
            ax = axes[0]
            ax.scatter(df_res['biomass_pct'], df_res['residual'],
                       c=ETAD_COLOR, edgecolors='k', linewidths=0.3, s=40, alpha=0.7)
            ax.axhline(0, color='k', linestyle=':', alpha=0.3)

            slope, intercept, r, p, _ = stats.linregress(df_res['biomass_pct'], df_res['residual'])
            ax.plot(x_fit, slope * x_fit + intercept, 'k--', alpha=0.7)

            ax.text(0.03, 0.97,
                    f'y = {slope:.4f}x + {intercept:.3f}\nR\u00b2 = {r**2:.3f}, p = {p:.2e}\nn = {len(df_res)}',
                    transform=ax.transAxes, fontsize=9, va='top',
                    bbox=dict(boxstyle='round,pad=0.4', fc='white', ec='#ccc', alpha=0.9))

            ax.set_xlim(0, bm_max * 1.05)
            ax.set_xlabel('Biomass Burning % (Wood + Charcoal)')
            ax.set_ylabel('Residual: FTIR EC \u2212 Aeth BC (\u03bcg/m\u00b3)')
            ax.set_title('Absolute Residual vs Biomass %')
            ax.grid(True, alpha=0.3)

            # --- Right: percentage residual (rows with FTIR EC = 0 left out) ---
            ax = axes[1]
            ax.scatter(df_pct['biomass_pct'], df_pct['residual_pct'],
                       c=ETAD_COLOR, edgecolors='k', linewidths=0.3, s=40, alpha=0.7)
            ax.axhline(0, color='k', linestyle=':', alpha=0.3)

            pct_fit_ok = (len(df_pct) >= MIN_FIT_POINTS
                          and df_pct['biomass_pct'].nunique() > 1)
            if pct_fit_ok:
                slope2, int2, r2, p2, _ = stats.linregress(df_pct['biomass_pct'], df_pct['residual_pct'])
                ax.plot(x_fit, slope2 * x_fit + int2, 'k--', alpha=0.7)

                ax.text(0.03, 0.97,
                        f'y = {slope2:.3f}x + {int2:.2f}\nR\u00b2 = {r2**2:.3f}, p = {p2:.2e}\nn = {len(df_pct)}',
                        transform=ax.transAxes, fontsize=9, va='top',
                        bbox=dict(boxstyle='round,pad=0.4', fc='white', ec='#ccc', alpha=0.9))

            ax.set_xlim(0, bm_max * 1.05)
            ax.set_xlabel('Biomass Burning % (Wood + Charcoal)')
            ax.set_ylabel('Residual: (FTIR EC \u2212 Aeth BC) / FTIR EC  (%)')
            ax.set_title('Percentage Residual vs Biomass %')
            ax.grid(True, alpha=0.3)

            fig.suptitle('Does Biomass Burning Affect FTIR\u2013Aeth Agreement?',
                         fontsize=13, fontweight='bold', y=1.02)
            plt.tight_layout()
            plt.show()

            # Interpretation (based on the absolute-residual regression)
            if p < 0.05:
                direction = "higher" if slope > 0 else "lower"
                print(f"\nSignificant correlation (p={p:.2e}): "
                      f"Higher biomass % \u2192 {direction} FTIR EC relative to Aeth BC")
            else:
                print(f"\nNo significant correlation (p={p:.2e}) between "
                      f"biomass % and FTIR\u2013Aeth residual")

In [None]:
# Seasonal + source analysis: Ethiopian seasons
# Section banner: rule, title, rule - one argument per printed line.
print("=" * 80, "SEASONAL + SOURCE ANALYSIS", "=" * 80, sep="\n")

def assign_ethiopian_season(date):
    """Return the Ethiopian season label for ``date``.

    Only ``date.month`` is read. Months 10-1 map to Bega, 2-5 to Belg and
    6-9 to Kiremt. Any other month value yields ``None``.
    """
    season_table = (
        ((10, 11, 12, 1), 'Bega (Oct\u2013Jan, dry)'),
        ((2, 3, 4, 5), 'Belg (Feb\u2013May, short rains)'),
        ((6, 7, 8, 9), 'Kiremt (Jun\u2013Sep, long rains)'),
    )
    month_number = date.month
    for season_months, season_label in season_table:
        if month_number in season_months:
            return season_label
    return None

# Season labels in the order they are plotted and tabulated.
SEASON_ORDER = [
    'Bega (Oct\u2013Jan, dry)',
    'Belg (Feb\u2013May, short rains)',
    'Kiremt (Jun\u2013Sep, long rains)',
]

# Season label -> hex colour, paired positionally with SEASON_ORDER.
SEASON_COLORS = dict(zip(SEASON_ORDER, ['#e6a817', '#2ca02c', '#1f77b4']))

# Tag each sample with its Ethiopian season, summarise biomass % per season,
# then compare FTIR EC with aethalometer IR BCc season by season:
#   Plot 1 - residual (ftir_ec - ir_bcc) vs biomass %, coloured by season
#   Table  - per-season regression of residual on biomass %
#   Plot 2 - FTIR EC vs IR BCc, one panel per season
if etad_with_factors is not None and 'biomass_pct' in etad_with_factors.columns:
    # NOTE: this adds a 'season' column to etad_with_factors in place.
    etad_with_factors['season'] = etad_with_factors['date'].apply(assign_ethiopian_season)

    print("Samples by season:")
    print(etad_with_factors['season'].value_counts().to_string())

    print("\nBiomass % by season:")
    print(etad_with_factors.groupby('season')['biomass_pct']
          .describe()[['count', 'mean', 'std', 'min', 'max']].round(1))

    if 'ftir_ec' in etad_with_factors.columns and 'ir_bcc' in etad_with_factors.columns:
        df_seas = etad_with_factors[['date', 'ftir_ec', 'ir_bcc', 'biomass_pct', 'season']].dropna().copy()
        df_seas['residual'] = df_seas['ftir_ec'] - df_seas['ir_bcc']

        # Split once, in display order; seasons without complete samples are
        # left out of both plots and the table.
        season_groups = {}
        for season in SEASON_ORDER:
            group = df_seas[df_seas['season'] == season]
            if len(group) > 0:
                season_groups[season] = group

        if not season_groups:
            # Without this guard plt.subplots(1, 0) and set_xlim(0, nan) raise.
            print("\nNo samples with complete date, FTIR EC, IR BCc and biomass % "
                  "- skipping seasonal plots")
        else:
            # Residual-vs-biomass fits, computed once and shared by Plot 1 and
            # the table. Needs >= 5 points and a non-constant x (linregress
            # raises ValueError when all x values are identical).
            residual_fits = {}
            for season, group in season_groups.items():
                if len(group) >= 5 and group['biomass_pct'].nunique() > 1:
                    residual_fits[season] = stats.linregress(group['biomass_pct'], group['residual'])

            # --- Plot 1: Residual vs Biomass % by season ---
            fig, ax = plt.subplots(figsize=(10, 7))

            for season, group in season_groups.items():
                color = SEASON_COLORS.get(season, '#999')
                ax.scatter(group['biomass_pct'], group['residual'],
                           c=color, edgecolors='k', linewidths=0.3,
                           s=50, alpha=0.7, label=f'{season} (n={len(group)})')

                # Per-season regression line, drawn over that season's x range
                fit = residual_fits.get(season)
                if fit is not None:
                    x_fit = np.linspace(group['biomass_pct'].min(), group['biomass_pct'].max(), 50)
                    ax.plot(x_fit, fit.slope * x_fit + fit.intercept, '-',
                            color=color, alpha=0.6, linewidth=1.5)

            ax.axhline(0, color='k', linestyle=':', alpha=0.3)
            ax.set_xlim(0, df_seas['biomass_pct'].max() * 1.05)
            ax.set_xlabel('Biomass Burning % (Wood + Charcoal)', fontsize=12)
            ax.set_ylabel('Residual: FTIR EC \u2212 Aeth BC (\u03bcg/m\u00b3)', fontsize=12)
            ax.set_title('FTIR\u2013Aeth Residual vs Biomass % by Ethiopian Season',
                         fontsize=13, fontweight='bold')
            ax.legend(fontsize=9)
            ax.grid(True, alpha=0.3)
            plt.tight_layout()
            plt.show()

            # --- Per-season regression stats ---
            # The escaped label is bound to a name first: a backslash inside an
            # f-string replacement field is a SyntaxError before Python 3.12.
            r2_label = 'R\u00b2'
            print(f"\n{'Season':<35s} {'n':>4s} {r2_label:>7s} {'slope':>8s} {'p-value':>12s} {'Mean BM%':>9s}")
            print("-" * 80)
            for season, group in season_groups.items():
                bm_mean = group['biomass_pct'].mean()
                fit = residual_fits.get(season)
                if fit is not None:
                    sig = ('***' if fit.pvalue < 0.001 else
                           '**' if fit.pvalue < 0.01 else
                           '*' if fit.pvalue < 0.05 else '')
                    print(f"{season:<35s} {len(group):>4d} {fit.rvalue**2:>7.3f} "
                          f"{fit.slope:>8.4f} {fit.pvalue:>12.2e} {bm_mean:>8.1f}% {sig}")
                else:
                    print(f"{season:<35s} {len(group):>4d}      --       --           -- {bm_mean:>8.1f}%")

            # --- Plot 2: FTIR EC vs Aeth BC by season ---
            n_seasons = len(season_groups)
            # squeeze=False always returns a 2-D axes array, so a single
            # season needs no special case.
            fig, axes = plt.subplots(1, n_seasons, figsize=(6 * n_seasons, 5.5),
                                     sharey=True, squeeze=False)

            # Shared limits so the panels are directly comparable.
            ec_max = df_seas['ftir_ec'].max()
            bc_max = df_seas['ir_bcc'].max()
            diag_max = max(ec_max, bc_max) * 1.05
            ec_grid = np.linspace(0, ec_max, 100)

            for i, (season, group) in enumerate(season_groups.items()):
                ax = axes[0, i]
                color = SEASON_COLORS.get(season, '#999')

                ax.scatter(group['ftir_ec'], group['ir_bcc'],
                           c=color, edgecolors='k', linewidths=0.3, s=45, alpha=0.7)

                if len(group) >= 3 and group['ftir_ec'].nunique() > 1:
                    ec_fit = stats.linregress(group['ftir_ec'], group['ir_bcc'])
                    ax.plot(ec_grid, ec_fit.slope * ec_grid + ec_fit.intercept, '-',
                            color=color, linewidth=1.5, alpha=0.7)

                    ax.text(0.03, 0.97,
                            f'R\u00b2 = {ec_fit.rvalue**2:.3f}\nslope = {ec_fit.slope:.3f}\nn = {len(group)}',
                            transform=ax.transAxes, fontsize=9, va='top',
                            bbox=dict(boxstyle='round,pad=0.3', fc='white', ec='#ccc', alpha=0.9))

                ax.plot([0, diag_max], [0, diag_max], 'k:', alpha=0.3, label='1:1')
                ax.set_xlim(0, ec_max * 1.05)
                ax.set_ylim(0, bc_max * 1.05)
                ax.set_xlabel('FTIR EC (\u03bcg/m\u00b3)')
                if i == 0:
                    ax.set_ylabel('IR BCc (\u03bcg/m\u00b3)')
                ax.set_title(season, fontsize=11)
                ax.legend(fontsize=8, loc='lower right')
                ax.grid(True, alpha=0.3)

            fig.suptitle('FTIR EC vs Aeth IR BCc by Ethiopian Season',
                         fontsize=13, fontweight='bold', y=1.02)
            plt.tight_layout()
            plt.show()