# Addis Ababa: Source Apportionment Analysis

This notebook covers AAE calculations and source attribution for the Addis Ababa site.

## Tasks Covered:
1. **Calculate Absorption Ångström Exponent (AAE)** using BCc measurements, with **source attribution** (Fossil Fuel, Mixed, Biomass)
2. **AAE distribution visualization** with source region annotations
3. **MAC sensitivity analysis** - test different MAC value combinations
4. **Seasonal source patterns** using Ethiopian seasons
5. **AAE time series** with source thresholds

---

## Setup and Imports

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Add scripts folder to path (reuse existing infrastructure).
# `__file__` is not defined inside a notebook kernel, and the previous code
# passed the *string* '__file__' to abspath(), which always resolved to the
# current working directory. Use the working directory explicitly instead.
notebook_dir = os.getcwd()
scripts_path = os.path.join(notebook_dir, 'scripts')
if scripts_path not in sys.path:
    sys.path.insert(0, scripts_path)

# Import from existing modules where available
try:
    from config import SITES, MAC_VALUE
    print(f"Loaded config - MAC value: {MAC_VALUE} m²/g")
except ImportError:
    # NOTE: SITES stays undefined on this path; only MAC_VALUE gets a default.
    print("Config not found - using defaults")
    MAC_VALUE = 7.77  # Default MAC at 880nm

try:
    from plotting import PlotConfig, calculate_regression_stats
    print("Loaded plotting utilities")
except ImportError:
    print("Plotting utilities not found - will define inline")

# Configure matplotlib.
# The seaborn style sheets were renamed with a 'seaborn-v0_8-' prefix in newer
# matplotlib releases; try the new name first and fall back to the old one so
# the cell does not fail on an older installation.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    print("Seaborn darkgrid style not available - using matplotlib defaults")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 11

# Create output directories
def setup_directories(base_dir='output', site='addis_ababa'):
    """
    Create the plot and data output folders and return their paths.

    Parameters:
    -----------
    base_dir : str
        Root folder under which the outputs are created (default 'output').
    site : str
        Site-specific sub-folder name (default 'addis_ababa').

    Returns:
    --------
    dict
        {'plots': <base_dir>/plots/<site>, 'data': <base_dir>/data/<site>}.
        Both folders exist when the function returns.
    """
    dirs = {
        kind: os.path.join(base_dir, kind, site)
        for kind in ('plots', 'data')
    }
    for dir_path in dirs.values():
        # exist_ok keeps re-running the cell harmless.
        os.makedirs(dir_path, exist_ok=True)
    return dirs

# Create the output folders now and keep the returned path mapping; later
# cells join their figure filenames onto dirs['plots'].
dirs = setup_directories()
print("Setup complete!")

## Site Configuration

Addis Ababa-specific settings for this analysis.

In [None]:
# Addis Ababa site configuration.
# The BC column names follow the pattern "<channel> <variant>" for every
# wavelength channel, so they are generated rather than listed by hand.
ADDIS_CONFIG = dict(
    name='Addis_Ababa',
    code='AA',
    timezone='Africa/Addis_Ababa',
    wavelengths=dict(UV=375, Blue=470, Green=528, Red=625, IR=880),
    bc_columns=[
        f'{channel} {variant}'
        for channel in ('UV', 'Blue', 'Green', 'Red', 'IR')
        for variant in ('BC1', 'BC2', 'BCc')
    ],
    # AAE cut-offs used to label a measurement as fossil fuel / mixed / biomass
    aae_thresholds=dict(fossil_fuel_max=0.9, biomass_min=1.5),
    # Ethiopian seasons, each given as a list of calendar month numbers
    seasons={
        'Dry Season': [10, 11, 12, 1, 2],
        'Belg Rainy Season': [3, 4, 5],
        'Kiremt Rainy Season': [6, 7, 8, 9],
    },
)

# (UV MAC, IR MAC) pairs swept by the sensitivity analysis
MAC_COMBINATIONS = [
    (7.77, 7.77),  # same MAC at both wavelengths
    (10.0, 7.77),  # UV raised
    (12.5, 7.77),  # standard literature value
    (15.0, 7.77),  # UV raised much further
    (12.5, 8.5),   # both adjusted
    (15.0, 9.0),   # both high
]

print(
    f"Site: {ADDIS_CONFIG['name']}",
    f"Wavelengths: {ADDIS_CONFIG['wavelengths']}",
    f"AAE Thresholds: {ADDIS_CONFIG['aae_thresholds']}",
    sep="\n",
)

## Data Loading

In [None]:
# Location of the aethalometer pickle read by the loading cell below.
# Set the ADDIS_AETH_DATA environment variable to point at your own copy
# without editing the notebook; the original author's local path is kept as
# the fallback so existing runs behave exactly as before.
DATA_FILEPATH = os.environ.get(
    "ADDIS_AETH_DATA",
    "/Users/ahmadjalil/Library/CloudStorage/GoogleDrive-ahzs645@gmail.com/My Drive/University/Research/Grad/UC Davis Ann/NASA MAIA/Data/Aethelometry Data/JacrosMA350 60s Data20250804082112/df_jacros_cleaned_API_and_OG_manual_BC_all_wl.pkl",
)

def load_aethalometer_addis(filepath):
    """
    Read the Addis Ababa aethalometer pickle and return an analysis-ready frame.

    Processing order: index by ``datetime_local`` (sorted), divide every
    configured BC column by 1000, add Month / Hour / DayOfWeek and
    ``Ethiopian_Season`` columns, then replace negative values and values
    above mean + 3*std with NaN in each BC column.

    Parameters:
    -----------
    filepath : str
        Path to the pickle file. Unpickling can execute code stored in the
        file, so only load files from a trusted source.

    Returns:
    --------
    df : DataFrame
        Processed aethalometer data with a sorted datetime index
    """
    frame = pd.read_pickle(filepath)

    # Index by local timestamp, in chronological order
    frame['datetime_local'] = pd.to_datetime(frame['datetime_local'])
    frame = frame.set_index('datetime_local').sort_index()

    # Unit conversion (ng/m³ -> µg/m³ per the original author's note);
    # remember which configured columns actually exist for the cleaning pass.
    present = []
    for name in ADDIS_CONFIG['bc_columns']:
        if name in frame.columns:
            frame[name] = frame[name] / 1000
            present.append(name)
        else:
            print(f"Warning: Column {name} not found - skipping")

    # Calendar helpers derived from the index
    frame['Month'] = frame.index.month
    frame['Hour'] = frame.index.hour
    frame['DayOfWeek'] = frame.index.dayofweek

    def _season_of(month):
        # Any month not listed under Dry or Belg falls through to Kiremt.
        if month in ADDIS_CONFIG['seasons']['Dry Season']:
            return 'Dry Season'
        if month in ADDIS_CONFIG['seasons']['Belg Rainy Season']:
            return 'Belg Rainy Season'
        return 'Kiremt Rainy Season'

    frame['Ethiopian_Season'] = frame['Month'].map(_season_of)

    # Cleaning: negatives first, so they do not influence the outlier cutoff
    for name in present:
        series = frame[name]
        series = series.mask(series < 0)
        cutoff = series.mean() + 3 * series.std()
        frame[name] = series.mask(series > cutoff)

    return frame

# Load the dataset once; every analysis cell below reads this `df`.
df = load_aethalometer_addis(DATA_FILEPATH)
# Quick sanity summary: row count, covered time span, and which of the
# configured BC columns are actually present in the file.
print(f"Loaded {len(df):,} records")
print(f"Date range: {df.index.min()} to {df.index.max()}")
print(f"BC columns available: {[c for c in ADDIS_CONFIG['bc_columns'] if c in df.columns]}")

---

# Task 1: Calculate AAE with Data Quality Filters

**Goal**: Calculate the Absorption Ångström Exponent (AAE) using BCc measurements with proper filtering.

In [None]:
def calculate_aae(df, uv_col='UV BCc', ir_col='IR BCc', clip_range=(-1, 3)):
    """
    Calculate AAE with data quality filters.
    
    Parameters:
    -----------
    df : DataFrame
        Data with UV and IR BC columns
    uv_col : str
        Column name for UV BC
    ir_col : str
        Column name for IR BC
    clip_range : tuple
        (min, max) range to clip AAE values
        
    Returns:
    --------
    aae : Series
        AAE values
    valid_mask : Series
        Boolean mask of valid measurements
    """
    # Wavelength ratio
    wavelength_ratio = np.log(880 / 375)
    
    # Basic validity filter
    valid_mask = (df[uv_col] > 0) & (df[ir_col] > 0)
    
    # Calculate BC ratio (IR/UV)
    bc_ratio = df.loc[valid_mask, ir_col] / df.loc[valid_mask, uv_col]
    
    # Filter extreme ratios (physical limits)
    ratio_mask = (bc_ratio > 0.1) & (bc_ratio < 10)
    bc_ratio = bc_ratio[ratio_mask]
    
    # Calculate AAE
    aae = pd.Series(index=df.index, dtype=float)
    valid_indices = bc_ratio.index
    aae.loc[valid_indices] = np.log(bc_ratio) / wavelength_ratio
    
    # Clip to physical range
    aae = aae.clip(clip_range[0], clip_range[1])
    
    return aae, valid_mask


def calculate_source_attribution(aae, thresholds=None):
    """
    Calculate source contributions from AAE values.

    Each non-NaN AAE value is counted in exactly one class:
    fossil fuel (AAE < fossil_fuel_max), mixed
    (fossil_fuel_max <= AAE <= biomass_min) or biomass (AAE > biomass_min).

    Parameters:
    -----------
    aae : Series
        AAE values
    thresholds : dict
        {'fossil_fuel_max': float, 'biomass_min': float}. Defaults to
        ADDIS_CONFIG['aae_thresholds'].

    Returns:
    --------
    dict with keys 'Fossil Fuel %', 'Mixed %', 'Biomass %', 'n', 'mean_aae',
    'median_aae' and 'std_aae'. When there are no valid values the
    percentages and 'n' are 0 and the three statistics are NaN, so callers
    can format every key without special-casing the empty result.
    """
    if thresholds is None:
        thresholds = ADDIS_CONFIG['aae_thresholds']

    valid_aae = aae.dropna()
    total = len(valid_aae)

    if total == 0:
        # Same key set as the populated result; previously the statistics
        # were missing here and callers raised KeyError on empty input.
        nan = float('nan')
        return {
            'Fossil Fuel %': 0, 'Mixed %': 0, 'Biomass %': 0, 'n': 0,
            'mean_aae': nan, 'median_aae': nan, 'std_aae': nan
        }

    ff_max = thresholds['fossil_fuel_max']
    bb_min = thresholds['biomass_min']

    return {
        'Fossil Fuel %': (valid_aae < ff_max).sum() / total * 100,
        'Mixed %': ((valid_aae >= ff_max) & (valid_aae <= bb_min)).sum() / total * 100,
        'Biomass %': (valid_aae > bb_min).sum() / total * 100,
        'n': total,
        'mean_aae': valid_aae.mean(),
        'median_aae': valid_aae.median(),
        'std_aae': valid_aae.std()
    }

# Visible confirmation that this definitions-only cell has been executed.
print("AAE calculation functions defined.")

In [None]:
# Section banner
print("=" * 80, "TASK 1: AAE CALCULATION", "=" * 80, sep="\n")

# AAE for the whole dataset, then the share of each source class
aae, valid_mask = calculate_aae(df)
source_contrib = calculate_source_attribution(aae)

# Summary statistics of the valid AAE values
print("\nAAE Statistics:")
print(f"  Valid measurements: {source_contrib['n']:,}")
print(f"  Mean AAE: {source_contrib['mean_aae']:.3f}")
print(f"  Median AAE: {source_contrib['median_aae']:.3f}")
print(f"  Std AAE: {source_contrib['std_aae']:.3f}")

# Class shares, labelled with the configured thresholds
print("\nSource Attribution:")
print(f"  Fossil Fuel (AAE < {ADDIS_CONFIG['aae_thresholds']['fossil_fuel_max']}): {source_contrib['Fossil Fuel %']:.1f}%")
print(f"  Mixed: {source_contrib['Mixed %']:.1f}%")
print(f"  Biomass (AAE > {ADDIS_CONFIG['aae_thresholds']['biomass_min']}): {source_contrib['Biomass %']:.1f}%")

---

# Task 2: AAE Distribution Visualization

**Goal**: Visualize AAE distribution with source region annotations.

In [None]:
def plot_aae_distribution(aae, thresholds=None, title_suffix=""):
    """
    Draw a density histogram of the non-NaN AAE values with the fossil-fuel
    and biomass thresholds marked and the three source regions shaded.

    `thresholds` defaults to ADDIS_CONFIG['aae_thresholds']; `title_suffix`
    is appended verbatim to the figure title. Returns the Figure.
    """
    limits = ADDIS_CONFIG['aae_thresholds'] if thresholds is None else thresholds

    samples = aae.dropna()
    ff_max, bb_min = limits['fossil_fuel_max'], limits['biomass_min']

    fig, ax = plt.subplots(figsize=(12, 6))

    # Distribution of the valid values
    ax.hist(samples, bins=100, alpha=0.7, density=True,
            color='steelblue', edgecolor='black', linewidth=0.5)

    # Dashed vertical lines at the two thresholds
    for position, colour, caption in (
        (ff_max, 'r', f'FF Threshold ({ff_max})'),
        (bb_min, 'g', f'BB Threshold ({bb_min})'),
    ):
        ax.axvline(x=position, color=colour, linestyle='--', linewidth=2, label=caption)

    # Shaded bands; the outer edges (-1 and 3) match the x-axis limits below
    for left, right, colour, caption in (
        (-1, ff_max, 'blue', 'Fossil Fuel Region'),
        (ff_max, bb_min, 'gray', 'Mixed Region'),
        (bb_min, 3, 'green', 'Biomass Region'),
    ):
        ax.axvspan(left, right, alpha=0.15, color=colour, label=caption)

    # Summary box in the top-right corner of the axes
    summary = f"n = {len(samples):,}\nMean = {samples.mean():.3f}\nMedian = {samples.median():.3f}"
    ax.text(0.95, 0.95, summary, transform=ax.transAxes, fontsize=10,
            va='top', ha='right',
            bbox=dict(boxstyle='round', facecolor='white', alpha=0.9))

    ax.set_xlabel('Absorption Ångström Exponent (AAE)', fontsize=12)
    ax.set_ylabel('Normalized Frequency', fontsize=12)
    ax.set_title(f'AAE Distribution with Source Regions{title_suffix}',
                 fontsize=14, fontweight='bold')
    # Legend sits outside the axes, to the right
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=9)
    ax.grid(True, alpha=0.3)
    ax.set_xlim(-1, 3)

    plt.tight_layout()
    return fig

# Draw the full-dataset AAE histogram and write a PNG copy before display.
fig = plot_aae_distribution(aae)
fig.savefig(
    os.path.join(dirs['plots'], 'aae_distribution.png'),
    dpi=150,
    bbox_inches='tight',
)
plt.show()

---

# Task 3: MAC Sensitivity Analysis

**Goal**: Test how different MAC value combinations affect AAE calculations and source attribution.

In [None]:
def analyze_aae_mac_sensitivity(df, mac_combinations, uv_col='UV BCc', ir_col='IR BCc'):
    """
    Analyze AAE with different MAC value combinations.

    For every (uv_mac, ir_mac) pair the UV and IR columns are multiplied by
    their MAC, rows where both products are positive are kept, and
    ``ln(IR_abs / UV_abs) / ln(880 / 375)`` clipped to [-1, 3] is computed.
    Unlike calculate_aae(), no 0.1-10 ratio filter is applied here.

    NOTE(review): as in calculate_aae(), the IR/UV ratio gives a value whose
    sign appears opposite to the usual AAE convention - confirm intent.

    Parameters:
    -----------
    df : DataFrame
        Data with the UV and IR BC columns (not modified)
    mac_combinations : iterable of (uv_mac, ir_mac)
        MAC pairs to evaluate
    uv_col, ir_col : str
        Column names for the UV and IR BC values

    Returns:
    --------
    dict keyed by (uv_mac, ir_mac); each value holds 'aae' (Series covering
    only the valid rows) and 'source_attribution' (the dict returned by
    calculate_source_attribution).
    """
    wavelength_ratio = np.log(880 / 375)
    nan = float('nan')
    results = {}

    # Read the two columns once; the old version copied the whole DataFrame
    # for every MAC pair just to hold two temporary columns.
    uv_bc = df[uv_col]
    ir_bc = df[ir_col]

    for uv_mac, ir_mac in mac_combinations:
        # Calculate absorption
        uv_abs = uv_bc * uv_mac
        ir_abs = ir_bc * ir_mac

        # Valid data mask
        valid_mask = (uv_abs > 0) & (ir_abs > 0)

        # Calculate ratio and AAE
        ratio = ir_abs[valid_mask] / uv_abs[valid_mask]
        aae = pd.Series(np.log(ratio) / wavelength_ratio).clip(-1, 3)
        aae.name = None

        # Source attribution
        source = calculate_source_attribution(aae)

        results[(uv_mac, ir_mac)] = {
            'aae': aae,
            'source_attribution': source
        }

        # .get() keeps the summary printable when no rows are valid and the
        # attribution result carries no statistics.
        print(f"\nMAC UV={uv_mac}, IR={ir_mac}:")
        print(f"  Mean AAE: {source.get('mean_aae', nan):.3f}")
        print(f"  FF: {source['Fossil Fuel %']:.1f}%, Mixed: {source['Mixed %']:.1f}%, BB: {source['Biomass %']:.1f}%")

    return results


def plot_mac_sensitivity(results):
    """
    Create visualization comparing different MAC combinations.

    Draws one AAE histogram per entry of `results` (as returned by
    analyze_aae_mac_sensitivity) on a two-column grid, marks both source
    thresholds, and annotates each panel with n, mean AAE and the
    fossil-fuel / biomass shares. Unused grid cells are hidden.

    Returns the Figure.
    """
    n_combos = len(results)
    # At least one row so an empty `results` still yields a (blank) figure.
    n_rows = max(1, (n_combos + 1) // 2)

    # squeeze=False always returns a 2-D array of Axes, so ravel() gives a
    # flat sequence for any number of panels. The previous special-casing
    # wrapped the two-element array in a list when there was one combination,
    # which made axes[0] an array rather than an Axes.
    fig, axes = plt.subplots(n_rows, 2, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.ravel()

    thresholds = ADDIS_CONFIG['aae_thresholds']
    nan = float('nan')

    for idx, ((uv_mac, ir_mac), data) in enumerate(results.items()):
        ax = axes[idx]
        aae = data['aae']

        ax.hist(aae.dropna(), bins=100, alpha=0.7, density=True)
        ax.axvline(x=thresholds['fossil_fuel_max'], color='r', linestyle='--', label='FF Threshold')
        ax.axvline(x=thresholds['biomass_min'], color='g', linestyle='--', label='BB Threshold')

        source = data['source_attribution']
        # .get() keeps the annotation usable when a combination had no valid
        # rows and the attribution result carries no statistics.
        stats_text = f"n={source['n']:,}\nAAE={source.get('mean_aae', nan):.2f}\nFF:{source['Fossil Fuel %']:.0f}% BB:{source['Biomass %']:.0f}%"
        ax.text(0.95, 0.95, stats_text, transform=ax.transAxes, fontsize=9,
                va='top', ha='right', bbox=dict(boxstyle='round', facecolor='white', alpha=0.9))

        ax.set_title(f'UV MAC={uv_mac}, IR MAC={ir_mac}', fontweight='bold')
        ax.set_xlabel('AAE')
        ax.set_ylabel('Normalized Frequency')
        ax.grid(True, alpha=0.3)
        ax.legend(fontsize=8)

    # Hide empty subplots
    for idx in range(n_combos, len(axes)):
        axes[idx].set_visible(False)

    plt.suptitle('AAE Distributions with Different MAC Values', fontsize=14, fontweight='bold', y=1.02)
    plt.tight_layout()
    return fig

# Section banner
print("=" * 80, "TASK 3: MAC SENSITIVITY ANALYSIS", "=" * 80, sep="\n")

# Evaluate every configured MAC pair, then plot and save the comparison grid
mac_results = analyze_aae_mac_sensitivity(df, MAC_COMBINATIONS)
fig = plot_mac_sensitivity(mac_results)
fig.savefig(
    os.path.join(dirs['plots'], 'mac_sensitivity.png'),
    dpi=150,
    bbox_inches='tight',
)
plt.show()

---

# Task 4: Seasonal Source Apportionment

**Goal**: Analyze how source contributions vary across Ethiopian seasons.

In [None]:
def analyze_seasonal_sources(df, aae, seasons_col='Ethiopian_Season'):
    """
    Calculate source attribution by season.

    Parameters:
    -----------
    df : DataFrame
        Data containing the season label column (not modified)
    aae : Series
        AAE values; aligned to df's index when attached
    seasons_col : str
        Name of the season label column

    Returns:
    --------
    dict mapping each season name (Dry, Belg, Kiremt - in that order) to the
    dict returned by calculate_source_attribution for that season's AAE.
    """
    seasons_order = ['Dry Season', 'Belg Rainy Season', 'Kiremt Rainy Season']
    nan = float('nan')

    # Only the season labels are needed next to the AAE values, so copy that
    # single column instead of the whole DataFrame.
    df_temp = df[[seasons_col]].copy()
    df_temp['AAE'] = aae

    results = {}

    print("\nSeasonal Source Attribution:")
    print("=" * 60)

    for season in seasons_order:
        season_mask = df_temp[seasons_col] == season
        season_aae = df_temp.loc[season_mask, 'AAE']

        source = calculate_source_attribution(season_aae)
        results[season] = source

        print(f"\n{season}:")
        print(f"  n = {source['n']:,}")
        # A season with no valid AAE may come back without statistics;
        # .get() prints 'nan' instead of raising KeyError.
        print(f"  Mean AAE = {source.get('mean_aae', nan):.3f}")
        print(f"  Fossil Fuel: {source['Fossil Fuel %']:.1f}%")
        print(f"  Mixed: {source['Mixed %']:.1f}%")
        print(f"  Biomass: {source['Biomass %']:.1f}%")

    return results


def plot_seasonal_sources(seasonal_results):
    """
    Draw one stacked bar per season showing the fossil fuel, mixed and
    biomass percentages (bottom to top), with the sample count written
    above each bar. Returns the Figure.
    """
    seasons = list(seasonal_results.keys())

    # (result key, legend label, colour) for each layer, bottom first
    layers = (
        ('Fossil Fuel %', 'Fossil Fuel', '#1f77b4'),
        ('Mixed %', 'Mixed', '#7f7f7f'),
        ('Biomass %', 'Biomass', '#2ca02c'),
    )
    # Collect every layer's heights up front, before any figure is created
    heights = [
        np.array([seasonal_results[s][key] for s in seasons], dtype=float)
        for key, _, _ in layers
    ]

    fig, ax = plt.subplots(figsize=(10, 6))

    positions = np.arange(len(seasons))
    bar_width = 0.6

    # Stack the layers by carrying a running baseline
    baseline = np.zeros(len(seasons))
    for (_, caption, colour), layer in zip(layers, heights):
        ax.bar(positions, layer, bar_width, bottom=baseline,
               label=caption, color=colour, alpha=0.8)
        baseline = baseline + layer

    ax.set_xlabel('Ethiopian Season', fontsize=12)
    ax.set_ylabel('Percentage (%)', fontsize=12)
    ax.set_title('BC Source Contributions by Ethiopian Season', fontsize=14, fontweight='bold')
    ax.set_xticks(positions)
    ax.set_xticklabels(seasons, rotation=15)
    ax.legend(loc='upper right')
    ax.grid(True, alpha=0.3, axis='y')
    ax.set_ylim(0, 100)

    # Sample counts, placed just above the 100 % line
    for slot, season in enumerate(seasons):
        count = seasonal_results[season]['n']
        ax.text(slot, 102, f'n={count:,}', ha='center', fontsize=9)

    plt.tight_layout()
    return fig

# Section banner
print("=" * 80, "TASK 4: SEASONAL SOURCE APPORTIONMENT", "=" * 80, sep="\n")

# Per-season attribution, then the stacked-bar figure saved to disk
seasonal_sources = analyze_seasonal_sources(df, aae)
fig = plot_seasonal_sources(seasonal_sources)
fig.savefig(
    os.path.join(dirs['plots'], 'seasonal_sources.png'),
    dpi=150,
    bbox_inches='tight',
)
plt.show()

---

# Task 5: AAE Time Series

**Goal**: Visualize AAE variation over time with source thresholds.

In [None]:
def plot_aae_timeseries(df, aae):
    """
    Plot AAE time series with source threshold lines.

    Parameters:
    -----------
    df : DataFrame
        Not used by the plot; kept so existing calls keep working.
    aae : Series
        AAE values, plotted against their own index

    Returns:
    --------
    fig : Figure
        Raw AAE trace, its 7-day rolling mean, and both threshold lines.
    """
    thresholds = ADDIS_CONFIG['aae_thresholds']

    fig, ax = plt.subplots(figsize=(15, 6))

    ax.plot(aae.index, aae, alpha=0.5, linewidth=0.5, color='steelblue')

    # Rolling average.
    # A fixed window of 24*7 samples only spans seven days when the data are
    # hourly. On a sorted datetime index use a true 7-day time window so the
    # legend label is correct at any sampling interval. Otherwise fall back
    # to the original sample-count window.
    if isinstance(aae.index, pd.DatetimeIndex) and aae.index.is_monotonic_increasing:
        rolling_aae = aae.rolling(window='7D', min_periods=24).mean()
    else:
        rolling_aae = aae.rolling(window=24*7, min_periods=24).mean()
    ax.plot(rolling_aae.index, rolling_aae, color='darkblue', linewidth=2, label='7-day rolling mean')

    # Thresholds
    ax.axhline(y=thresholds['fossil_fuel_max'], color='r', linestyle='--',
               linewidth=2, label=f'FF Threshold ({thresholds["fossil_fuel_max"]})')
    ax.axhline(y=thresholds['biomass_min'], color='g', linestyle='--',
               linewidth=2, label=f'BB Threshold ({thresholds["biomass_min"]})')

    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel('Absorption Ångström Exponent (AAE)', fontsize=12)
    ax.set_title('AAE Time Series with Source Thresholds', fontsize=14, fontweight='bold')
    ax.legend(loc='upper right')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    return fig

# Draw the AAE time series and write a PNG copy before display.
fig = plot_aae_timeseries(df, aae)
fig.savefig(
    os.path.join(dirs['plots'], 'aae_timeseries.png'),
    dpi=150,
    bbox_inches='tight',
)
plt.show()

---

# Summary

## Functions Defined:
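- `setup_directories()` - Create the output folders for plots and data
- `load_aethalometer_addis()` - Load and preprocess Addis Ababa data
- `calculate_aae()` - Calculate AAE with quality filters
- `calculate_source_attribution()` - Attribute sources from AAE
- `plot_aae_distribution()` - Visualize AAE distribution
- `analyze_aae_mac_sensitivity()` - Test MAC sensitivity
- `plot_mac_sensitivity()` - Compare AAE distributions across MAC combinations
- `analyze_seasonal_sources()` - Seasonal source patterns
- `plot_seasonal_sources()` - Stacked bar chart of source contributions by season
- `plot_aae_timeseries()` - Time series visualization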

## To Run This Notebook:
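1. Update the `DATA_FILEPATH` variable in the **Data Loading** section with the path to your data
2. Make sure the data loading and analysis cells are enabled (uncomment them if you have commented any out)
3. Run all cells in order, from top to bottom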

In [None]:
# Closing banner plus a reminder of how to re-run the notebook.
# The reminder names the variable this notebook actually defines
# (DATA_FILEPATH); the previous text referred to a non-existent 'filepath'.
print("="*80)
print("NOTEBOOK COMPLETE")
print("="*80)
print("\nTo run this analysis:")
print("1. Update 'DATA_FILEPATH' with your Addis Ababa aethalometer data path")
print("2. Make sure the analysis cells are enabled (not commented out)")
print("3. Run all cells")