# Figure 7: Daily Contribution of Sources to OM in Addis Ababa in 2023

Recreates Figure 7 from Naveed's analysis following Ann's instructions:
- **Bars, not lines** — the data come from 1-in-3-day sampling (not continuous), so consecutive points should not be connected
- **Stacking order**: Fossil fuel at bottom, charcoal at top — makes seasonal patterns clearer
- **Y-axis**: Factor Fraction (fraction of OM from each source)
- **Season strip** at top showing Ethiopian seasons

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.patches import Rectangle
from matplotlib.lines import Line2D
import warnings
warnings.filterwarnings('ignore')

# Make the project's helper modules under ./scripts importable.
# A notebook has no __file__, so the current working directory is used as the
# notebook directory.
notebook_dir = os.getcwd()
scripts_path = os.path.join(notebook_dir, 'scripts')
already_on_path = scripts_path in sys.path
if not already_on_path:
    # Put it first so these modules take precedence over same-named ones.
    sys.path.insert(0, scripts_path)

from config import SITES, MAC_VALUE
from data_matching import (
    load_aethalometer_data,
    load_filter_data,
    add_base_filter_id,
    match_all_parameters,
    load_etad_factors_with_filter_ids,
)

print("Setup complete")

In [None]:
# =============================================================================
# Load and merge data (same pipeline as source_apportionment_regression.ipynb)
# =============================================================================
# Result: `df`, one row per matched sample, holding the matched measurements
# plus the renamed source-fraction (*_frac) and concentration (*_conc) columns.

# Factor-file column name -> short name for the GF (fraction) columns
FACTOR_TO_FRAC = {
    'GF3 (Charcoal)':              'charcoal_frac',
    'GF2 (Wood Burning)':          'wood_frac',
    'GF5 (Fossil Fuel Combustion)':'fossil_fuel_frac',
    'GF4 (Polluted Marine)':       'polluted_marine_frac',
    'GF1 (Sea Salt Mixed)':        'sea_salt_frac',
}

# Factor-file column name -> short name for the K_F (concentration) columns
FACTOR_TO_CONC = {
    'K_F3 Charcoal (ug/m3)':              'charcoal_conc',
    'K_F2 Wood Burning (ug/m3)':           'wood_conc',
    'K_F5 Fossil Fuel Combustion (ug/m3)': 'fossil_fuel_conc',
    'K_F4 Polluted Marine (ug/m3)':        'polluted_marine_conc',
    'K_F1 Sea Salt Mixed (ug/m3)':         'sea_salt_conc',
}

frac_cols = [*FACTOR_TO_FRAC.values()]
conc_cols = [*FACTOR_TO_CONC.values()]
factor_merge_cols = ['base_filter_id', *frac_cols, *conc_cols]

# Factor contributions (with Filter IDs), using the short column names
factors_df = load_etad_factors_with_filter_ids()
factors_df = factors_df.rename(columns={**FACTOR_TO_FRAC, **FACTOR_TO_CONC})

# Aethalometer + filter measurements, matched by date
aethalometer_data = load_aethalometer_data()
filter_data = add_base_filter_id(load_filter_data())

df_aeth = aethalometer_data.get('Addis_Ababa')
bc_df = match_all_parameters('Addis_Ababa', 'ETAD', df_aeth, filter_data)

# Date -> filter ID lookup for the ETAD site.
# NOTE(review): this renames the raw 'FilterId' column to 'base_filter_id'
# rather than using a column produced by add_base_filter_id — confirm intended.
is_etad = filter_data['Site'] == 'ETAD'
etad_filters = (
    filter_data.loc[is_etad, ['SampleDate', 'FilterId']]
    .drop_duplicates()
    .rename(columns={'SampleDate': 'date', 'FilterId': 'base_filter_id'})
)

# Both sides need a datetime 'date' key before merging
bc_df['date'] = pd.to_datetime(bc_df['date'])
etad_filters['date'] = pd.to_datetime(etad_filters['date'])

# Attach filter IDs to the measurements, then keep only the samples that
# also have factor contributions (inner join on base_filter_id)
bc_with_id = bc_df.merge(etad_filters, on='date', how='left')
unique_factors = factors_df[factor_merge_cols].drop_duplicates()
df = bc_with_id.merge(unique_factors, on='base_filter_id', how='inner')

df['date'] = pd.to_datetime(df['date'])
df['Month'] = df['date'].dt.month

# Summary of what was loaded
first_day = df['date'].min().date()
last_day = df['date'].max().date()
print(f"Dataset: {len(df)} samples")
print(f"Date range: {first_day} to {last_day}")
print("\nConcentration columns available:")
for col in conc_cols:
    vals = df[col].dropna()
    print(f"  {col}: n={len(vals)}, mean={vals.mean():.3f}, max={vals.max():.3f} µg/m³")

In [None]:
# =============================================================================
# Figure 7 configuration — colors and order matching Naveed's plot
# =============================================================================

# (column, legend label, color) for each source, listed bottom-to-top:
# Fossil Fuel → Polluted Marine → Sea Salt → Wood Burning → Charcoal
STACK_ORDER = [
    ('fossil_fuel_frac',    'Fossil Fuel Combustion', '#F8C8DC'),  # light pink
    ('polluted_marine_frac','Polluted Marine',        '#DC143C'),  # crimson red
    ('sea_salt_frac',       'Sea Salt Mixed',         '#0000CD'),  # medium blue
    ('wood_frac',           'Wood Burning',           '#90EE90'),  # light green
    ('charcoal_frac',       'Charcoal',               '#228B22'),  # forest green
]

# (season name, calendar months, strip color) — Ethiopian seasons, 4-season
# classification matching Figure 7:
#   First Dry (Bega end):    Jan-Feb
#   Less Rainy (Belg):       Mar-May
#   More Rainy (Kiremt):     Jun-Sep
#   Second Dry (Bega start): Oct-Dec
SEASON_DEFS = [
    ('First Dry',   [1, 2],          '#FFDAB9'),  # peach
    ('Less Rainy',  [3, 4, 5],       '#B0C4DE'),  # light steel blue
    ('More Rainy',  [6, 7, 8, 9],    '#87CEEB'),  # sky blue
    ('Second Dry',  [10, 11, 12],    '#F4A460'),  # sandy brown/orange
]


def get_season(month):
    """Return the SEASON_DEFS name whose month list contains *month*.

    Returns None when no season lists the month.
    """
    matching_names = (
        season_name
        for season_name, season_months, _ in SEASON_DEFS
        if month in season_months
    )
    return next(matching_names, None)


print("Configuration ready")

In [None]:
# =============================================================================
# Figure 7: Daily contribution of sources to OM
# =============================================================================
# Stacked bars of the raw GF source fractions for each sampled day, a thin
# season strip above the bars, and the source/season legends plus the caption
# below. The figure is written to output/plots/addis_ababa/source_regression/.

# Prepare data: keep samples where every stacked fraction is present
stack_cols = [col for col, _, _ in STACK_ORDER]
valid = df.dropna(subset=stack_cols).copy()
valid = valid.sort_values('date').reset_index(drop=True)
valid['season'] = valid['Month'].apply(get_season)

# Fail early with a clear message instead of an opaque error from the
# date / min-max calls below when nothing survives the dropna.
if valid.empty:
    raise ValueError("No samples with complete source fractions to plot")

print(f"Plotting {len(valid)} samples from {valid['date'].min().date()} to {valid['date'].max().date()}")

# --- Create figure with two axes: season strip on top, bars below ---
fig, (ax_season, ax) = plt.subplots(
    2, 1, figsize=(18, 7),
    gridspec_kw={'height_ratios': [1, 20], 'hspace': 0.02},
    sharex=True
)

# Use date positions for x-axis (actual dates, not sequential index)
dates = valid['date'].values
x_dates = mdates.date2num(dates)
# Bar width of 2, in the same units as x_dates
bar_width = 2.0

# --- Draw stacked bars ---
# `bottom` tracks the running top of each day's stack
bottom = np.zeros(len(valid))

for col, label, color in STACK_ORDER:
    values = valid[col].values
    ax.bar(x_dates, values, width=bar_width, bottom=bottom,
           color=color, label=label, edgecolor='white', linewidth=0.3)
    bottom += values

# --- Draw season strip at top ---
# One colored span per run of consecutive samples in the same season
season_color_map = {name: color for name, _, color in SEASON_DEFS}

# Pad the x-range by one bar width on each side
date_min = x_dates.min() - bar_width
date_max = x_dates.max() + bar_width

# Plain list of labels: avoids building a row object per sample via .iloc
seasons = valid['season'].tolist()
seg_start = date_min

for i, (prev_season, season) in enumerate(zip(seasons, seasons[1:]), start=1):
    if season != prev_season:
        # Season changes between sample i-1 and i: split at the midpoint
        boundary = (x_dates[i] + x_dates[i - 1]) / 2
        ax_season.axvspan(seg_start, boundary,
                          color=season_color_map[prev_season], alpha=0.9)
        seg_start = boundary

# Final segment runs to the right edge
ax_season.axvspan(seg_start, date_max,
                  color=season_color_map[seasons[-1]], alpha=0.9)

ax_season.set_xlim(date_min, date_max)
ax_season.set_yticks([])
ax_season.tick_params(bottom=False, labelbottom=False)
for spine in ax_season.spines.values():
    spine.set_visible(True)
    spine.set_linewidth(0.5)

# --- Format main axes ---
ax.set_xlim(date_min, date_max)
ax.set_ylim(0, 1.0)
ax.set_ylabel('Factor Fraction', fontsize=13)
ax.set_xlabel('Date', fontsize=13)

# Date tick formatting
ax.xaxis.set_major_locator(mdates.AutoDateLocator(minticks=15, maxticks=30))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=9)

ax.yaxis.set_major_locator(plt.MultipleLocator(0.25))
ax.grid(False)

# Thin spines
for spine in ax.spines.values():
    spine.set_linewidth(0.5)

# --- Source legend (below plot) ---
source_handles = [
    plt.Rectangle((0, 0), 1, 1, facecolor=color, edgecolor='black', linewidth=0.5)
    for _, _, color in STACK_ORDER
]
source_labels = [label for _, label, _ in STACK_ORDER]

source_legend = ax.legend(
    source_handles, source_labels,
    loc='upper center', bbox_to_anchor=(0.5, -0.22),
    ncol=5, fontsize=10, frameon=True,
    edgecolor='black', fancybox=False
)

# --- Season legend (below source legend) ---
season_handles = [
    plt.Rectangle((0, 0), 1, 1, facecolor=color, edgecolor='black', linewidth=0.5)
    for _, _, color in SEASON_DEFS
]
season_labels = [name for name, _, _ in SEASON_DEFS]

season_legend = fig.legend(
    season_handles, season_labels,
    loc='upper center', bbox_to_anchor=(0.5, 0.06),
    ncol=4, fontsize=10, frameon=True,
    edgecolor='black', fancybox=False
)
# NOTE(review): ax.legend() above already attaches source_legend to `ax`; this
# extra add_artist call is kept because it may influence how tight_layout /
# bbox_inches='tight' account for the legend — confirm before removing.
ax.add_artist(source_legend)

# --- Title (placed at the bottom of the figure, like a caption) ---
fig.suptitle('Figure 7: Daily contribution of sources to OM in Addis Ababa in 2023.',
             fontsize=14, fontweight='bold', y=0.01)

# Reserve the bottom 12% of the figure for the legends and caption
plt.tight_layout(rect=[0, 0.12, 1, 1])

# Save
output_dir = 'output/plots/addis_ababa/source_regression'
output_path = f'{output_dir}/figure7_source_contributions_om.png'
os.makedirs(output_dir, exist_ok=True)
fig.savefig(output_path, dpi=200, bbox_inches='tight', facecolor='white')
plt.show()

print(f"\nFigure saved to {output_path}")

## Normalized Version (Fractions sum to 1.0)

The raw GF fractions don't sum to 1. The next cell first spot-checks the raw
fractions against Naveed's plot; the cell after it normalizes each day's fractions
so they sum to 1.0, matching Figure 7, where every bar reaches the top.

In [None]:
# =============================================================================
# Verification: spot-check a few days against Naveed's plot
# =============================================================================
# Prints the raw source fractions (and their row total) for every 10th plotted
# sample, then the mean/min/max of each fraction over all plotted samples.

# (header text, dataframe column) for each printed fraction, left to right
SPOT_CHECK_COLUMNS = [
    ('Fossil Fuel',  'fossil_fuel_frac'),
    ('Polluted Mar', 'polluted_marine_frac'),
    ('Sea Salt',     'sea_salt_frac'),
    ('Wood Burn',    'wood_frac'),
    ('Charcoal',     'charcoal_frac'),
]
RULE_WIDTH = 90

print("Spot-check: source fractions for selected dates")
print("=" * RULE_WIDTH)
header_cells = [f"{'Date':<14s}"]
header_cells += [f"{title:>12s}" for title, _ in SPOT_CHECK_COLUMNS]
header_cells.append(f"{'Total':>8s}")
print(" ".join(header_cells))
print("-" * RULE_WIDTH)

check_dates = valid.iloc[::10]  # every 10th sample
for _, sample in check_dates.iterrows():
    # Total over every stacked source for this sample
    sample_total = sum(sample[col] for col, _, _ in STACK_ORDER)
    row_cells = [f"{str(sample['date'].date()):<14s}"]
    row_cells += [f"{sample[column]:>12.3f}" for _, column in SPOT_CHECK_COLUMNS]
    row_cells.append(f"{sample_total:>8.3f}")
    print(" ".join(row_cells))

print("\nOverall means:")
for col, label, _ in STACK_ORDER:
    fraction = valid[col]
    lowest, highest = fraction.min(), fraction.max()
    print(f"  {label:<25s}: mean={fraction.mean():.3f}, min={lowest:.3f}, max={highest:.3f}")

In [None]:
# =============================================================================
# Figure 7 (Normalized): Daily contribution of sources to OM — fractions sum to 1
# =============================================================================
# Same layout as the raw-fraction figure, but each day's fractions are divided
# by that day's total so every bar reaches 1.0.

# Prepare data: keep samples where every stacked fraction is present
frac_col_names = [col for col, _, _ in STACK_ORDER]
valid_norm = df.dropna(subset=frac_col_names).copy()
valid_norm = valid_norm.sort_values('date').reset_index(drop=True)
valid_norm['season'] = valid_norm['Month'].apply(get_season)

# Fail early with a clear message instead of an opaque error from the
# date / min-max calls below when nothing survives the dropna.
if valid_norm.empty:
    raise ValueError("No samples with complete source fractions to plot")

# Normalize: divide each fraction by the row total so they sum to 1.0
row_totals = valid_norm[frac_col_names].sum(axis=1)
for col in frac_col_names:
    valid_norm[col + '_norm'] = valid_norm[col] / row_totals

# Verify normalization (row sums computed once and reused for min and max)
norm_cols = [col + '_norm' for col in frac_col_names]
norm_row_sums = valid_norm[norm_cols].sum(axis=1)
print(f"Row sums after normalization — min: {norm_row_sums.min():.4f}, "
      f"max: {norm_row_sums.max():.4f}")
print(f"Plotting {len(valid_norm)} samples from {valid_norm['date'].min().date()} to {valid_norm['date'].max().date()}")

# --- Create figure with two axes: season strip on top, bars below ---
fig, (ax_season, ax) = plt.subplots(
    2, 1, figsize=(18, 7),
    gridspec_kw={'height_ratios': [1, 20], 'hspace': 0.02},
    sharex=True
)

# Use date positions for x-axis (actual dates, not sequential index)
dates = valid_norm['date'].values
bar_width = 2.0
x_dates = mdates.date2num(dates)

# --- Draw stacked bars (normalized) ---
# `bottom` tracks the running top of each day's stack
bottom = np.zeros(len(valid_norm))

for col, label, color in STACK_ORDER:
    values = valid_norm[col + '_norm'].values
    ax.bar(x_dates, values, width=bar_width, bottom=bottom,
           color=color, label=label, edgecolor='white', linewidth=0.3)
    bottom += values

# --- Draw season strip at top ---
# One colored span per run of consecutive samples in the same season
season_color_map = {name: color for name, _, color in SEASON_DEFS}

# Pad the x-range by one bar width on each side
date_min = x_dates.min() - bar_width
date_max = x_dates.max() + bar_width

# Plain list of labels: avoids building a row object per sample via .iloc
seasons = valid_norm['season'].tolist()
seg_start = date_min

for i, (prev_season, season) in enumerate(zip(seasons, seasons[1:]), start=1):
    if season != prev_season:
        # Season changes between sample i-1 and i: split at the midpoint
        boundary = (x_dates[i] + x_dates[i - 1]) / 2
        ax_season.axvspan(seg_start, boundary,
                          color=season_color_map[prev_season], alpha=0.9)
        seg_start = boundary

# Final segment runs to the right edge
ax_season.axvspan(seg_start, date_max,
                  color=season_color_map[seasons[-1]], alpha=0.9)

ax_season.set_xlim(date_min, date_max)
ax_season.set_yticks([])
ax_season.tick_params(bottom=False, labelbottom=False)
for spine in ax_season.spines.values():
    spine.set_visible(True)
    spine.set_linewidth(0.5)

# --- Format main axes ---
ax.set_xlim(date_min, date_max)
ax.set_ylim(0, 1.0)
ax.set_ylabel('Factor Fraction', fontsize=13)
ax.set_xlabel('Date', fontsize=13)

ax.xaxis.set_major_locator(mdates.AutoDateLocator(minticks=15, maxticks=30))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=9)

ax.yaxis.set_major_locator(plt.MultipleLocator(0.25))
ax.grid(False)

for spine in ax.spines.values():
    spine.set_linewidth(0.5)

# --- Source legend ---
source_handles = [
    plt.Rectangle((0, 0), 1, 1, facecolor=color, edgecolor='black', linewidth=0.5)
    for _, _, color in STACK_ORDER
]
source_labels = [label for _, label, _ in STACK_ORDER]

source_legend = ax.legend(
    source_handles, source_labels,
    loc='upper center', bbox_to_anchor=(0.5, -0.22),
    ncol=5, fontsize=10, frameon=True,
    edgecolor='black', fancybox=False
)

# --- Season legend ---
season_handles = [
    plt.Rectangle((0, 0), 1, 1, facecolor=color, edgecolor='black', linewidth=0.5)
    for _, _, color in SEASON_DEFS
]
season_labels = [name for name, _, _ in SEASON_DEFS]

season_legend = fig.legend(
    season_handles, season_labels,
    loc='upper center', bbox_to_anchor=(0.5, 0.06),
    ncol=4, fontsize=10, frameon=True,
    edgecolor='black', fancybox=False
)
# NOTE(review): ax.legend() above already attaches source_legend to `ax`; this
# extra add_artist call is kept because it may influence how tight_layout /
# bbox_inches='tight' account for the legend — confirm before removing.
ax.add_artist(source_legend)

# --- Title (placed at the bottom of the figure, like a caption) ---
fig.suptitle('Figure 7: Daily contribution of sources to OM in Addis Ababa in 2023.',
             fontsize=14, fontweight='bold', y=0.01)

# Reserve the bottom 12% of the figure for the legends and caption
plt.tight_layout(rect=[0, 0.12, 1, 1])

# Save — create the output directory here as well, so this cell does not
# depend on an earlier cell having created it.
output_dir = 'output/plots/addis_ababa/source_regression'
os.makedirs(output_dir, exist_ok=True)
fig.savefig(f'{output_dir}/figure7_source_contributions_om_normalized.png',
            dpi=200, bbox_inches='tight', facecolor='white')
plt.show()

print("\nSaved: figure7_source_contributions_om_normalized.png")

## Concentration Version (K_F values in µg/m³)

Same stacking order and season strip, but using the K_F concentration columns
instead of GF fractions. Y-axis shows source contribution in µg/m³.

In [None]:
# =============================================================================
# Source Contributions by Concentration (K_F in µg/m³)
# =============================================================================
# Same layout as the fraction figures, but stacking the *_conc columns, so the
# y-axis is a concentration and is left to autoscale.

# Concentration stacking order: same sources, labels and colors as STACK_ORDER,
# with each '*_frac' column swapped for its '*_conc' counterpart. Deriving it
# keeps the two figures' legends from drifting apart.
CONC_STACK_ORDER = [
    (col.replace('_frac', '_conc'), label, color)
    for col, label, color in STACK_ORDER
]

conc_col_names = [col for col, _, _ in CONC_STACK_ORDER]
valid_conc = df.dropna(subset=conc_col_names).copy()
valid_conc = valid_conc.sort_values('date').reset_index(drop=True)
valid_conc['season'] = valid_conc['Month'].apply(get_season)

# Fail early with a clear message instead of an opaque error from the
# date / min-max calls below when nothing survives the dropna.
if valid_conc.empty:
    raise ValueError("No samples with complete source concentrations to plot")

print(f"Plotting {len(valid_conc)} samples from {valid_conc['date'].min().date()} to {valid_conc['date'].max().date()}")

# --- Create figure ---
fig, (ax_season, ax) = plt.subplots(
    2, 1, figsize=(18, 7),
    gridspec_kw={'height_ratios': [1, 20], 'hspace': 0.02},
    sharex=True
)

dates = valid_conc['date'].values
bar_width = 2.0
x_dates = mdates.date2num(dates)

# --- Draw stacked bars (concentrations) ---
# `bottom` tracks the running top of each day's stack
bottom = np.zeros(len(valid_conc))

for col, label, color in CONC_STACK_ORDER:
    values = valid_conc[col].values
    ax.bar(x_dates, values, width=bar_width, bottom=bottom,
           color=color, label=label, edgecolor='white', linewidth=0.3)
    bottom += values

# --- Draw season strip at top ---
# One colored span per run of consecutive samples in the same season
season_color_map = {name: color for name, _, color in SEASON_DEFS}

# Pad the x-range by one bar width on each side
date_min = x_dates.min() - bar_width
date_max = x_dates.max() + bar_width

# Plain list of labels: avoids building a row object per sample via .iloc
seasons = valid_conc['season'].tolist()
seg_start = date_min

for i, (prev_season, season) in enumerate(zip(seasons, seasons[1:]), start=1):
    if season != prev_season:
        # Season changes between sample i-1 and i: split at the midpoint
        boundary = (x_dates[i] + x_dates[i - 1]) / 2
        ax_season.axvspan(seg_start, boundary,
                          color=season_color_map[prev_season], alpha=0.9)
        seg_start = boundary

# Final segment runs to the right edge
ax_season.axvspan(seg_start, date_max,
                  color=season_color_map[seasons[-1]], alpha=0.9)

ax_season.set_xlim(date_min, date_max)
ax_season.set_yticks([])
ax_season.tick_params(bottom=False, labelbottom=False)
for spine in ax_season.spines.values():
    spine.set_visible(True)
    spine.set_linewidth(0.5)

# --- Format main axes ---
ax.set_xlim(date_min, date_max)
ax.set_ylabel('Source Concentration (µg/m³)', fontsize=13)
ax.set_xlabel('Date', fontsize=13)

ax.xaxis.set_major_locator(mdates.AutoDateLocator(minticks=15, maxticks=30))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=9)

ax.grid(False)

for spine in ax.spines.values():
    spine.set_linewidth(0.5)

# --- Source legend ---
source_handles = [
    plt.Rectangle((0, 0), 1, 1, facecolor=color, edgecolor='black', linewidth=0.5)
    for _, _, color in CONC_STACK_ORDER
]
source_labels = [label for _, label, _ in CONC_STACK_ORDER]

source_legend = ax.legend(
    source_handles, source_labels,
    loc='upper center', bbox_to_anchor=(0.5, -0.22),
    ncol=5, fontsize=10, frameon=True,
    edgecolor='black', fancybox=False
)

# --- Season legend ---
season_handles = [
    plt.Rectangle((0, 0), 1, 1, facecolor=color, edgecolor='black', linewidth=0.5)
    for _, _, color in SEASON_DEFS
]
season_labels = [name for name, _, _ in SEASON_DEFS]

season_legend = fig.legend(
    season_handles, season_labels,
    loc='upper center', bbox_to_anchor=(0.5, 0.06),
    ncol=4, fontsize=10, frameon=True,
    edgecolor='black', fancybox=False
)
# NOTE(review): ax.legend() above already attaches source_legend to `ax`; this
# extra add_artist call is kept because it may influence how tight_layout /
# bbox_inches='tight' account for the legend — confirm before removing.
ax.add_artist(source_legend)

# --- Title (placed at the bottom of the figure, like a caption) ---
fig.suptitle('Daily source contributions to OM in Addis Ababa in 2023 (Concentration)',
             fontsize=14, fontweight='bold', y=0.01)

# Reserve the bottom 12% of the figure for the legends and caption
plt.tight_layout(rect=[0, 0.12, 1, 1])

# Save — create the output directory here as well, so this cell does not
# depend on an earlier cell having created it.
output_dir = 'output/plots/addis_ababa/source_regression'
os.makedirs(output_dir, exist_ok=True)
fig.savefig(f'{output_dir}/source_contributions_concentration.png',
            dpi=200, bbox_inches='tight', facecolor='white')
plt.show()

print("\nSaved: source_contributions_concentration.png")

# Print concentration stats
print("\nConcentration stats per source:")
print("=" * 70)
for col, label, _ in CONC_STACK_ORDER:
    vals = valid_conc[col]
    print(f"  {label:<25s}: mean={vals.mean():.3f}, max={vals.max():.3f} µg/m³")
# Per-sample sum of all five source concentrations
total_conc = valid_conc[conc_col_names].sum(axis=1)
print(f"\n  {'Total OM':<25s}: mean={total_conc.mean():.3f}, max={total_conc.max():.3f} µg/m³")