In [1]:
# %%
# Cell 1: Setup and Configuration
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(os.getcwd()), 'src'))

from config.notebook_config import NotebookConfig
from notebook_utils.pkl_cleaning_integration import create_enhanced_setup

# Notebook-wide configuration for the ETAD site, Red wavelength, 'jpl' output format.
config = NotebookConfig(
    site_code='ETAD',
    wavelength='Red',
    quality_threshold=10,
    output_format='jpl',
    min_samples_for_analysis=30,
    confidence_level=0.95,
    outlier_threshold=3.0,
    figure_size=(12, 8),
    font_size=10,
    dpi=300
)

# Add timezone to config for proper 9am-to-9am handling
config.timezone = 'Africa/Addis_Ababa'

# Root folder of the project data.
# The AETHALOMETER_DATA_PATH environment variable, when set to a non-empty
# value, overrides the default so the notebook can run on another machine
# without editing this cell. When it is unset, the original location is used.
DEFAULT_BASE_DATA_PATH = "/Users/ahzs645/Library/CloudStorage/GoogleDrive-ahzs645@gmail.com/My Drive/University/Research/Grad/UC Davis Ann/NASA MAIA/Data"
base_data_path = os.environ.get("AETHALOMETER_DATA_PATH") or DEFAULT_BASE_DATA_PATH

# Aethalometer inputs: the merged (uncleaned) pickle and the 1-minute CSV.
config.aethalometer_files = {
    'pkl_data': os.path.join(
        base_data_path,
        "Aethelometry Data/Kyan Data/Mergedcleaned and uncleaned MA350 data20250707030704",
        "df_uncleaned_Jacros_API_and_OG.pkl"
    ),
    'csv_data': os.path.join(
        base_data_path,
        "Aethelometry Data/Raw",
        "Jacros_MA350_1-min_2022-2024_Cleaned.csv"
    )
}

# SQLite database holding the FTIR/HIPS filter measurements.
config.ftir_db_path = os.path.join(
    base_data_path,
    "EC-HIPS-Aeth Comparison/Data/Original Data/Combined Database",
    "spartan_ftir_hips.db"
)

# Create enhanced setup (must run after all paths are attached to config)
setup = create_enhanced_setup(config)

print("✅ Configuration and setup complete!")

✅ Advanced plotting style configured
🚀 Aethalometer-FTIR/HIPS Pipeline with Simplified Setup
📊 Configuration Summary:
   Site: ETAD
   Wavelength: Red
   Output format: jpl
   Quality threshold: 10 minutes
   Output directory: outputs

📁 File paths:
   pkl_data: ✅ df_uncleaned_Jacros_API_and_OG.pkl
   csv_data: ✅ Jacros_MA350_1-min_2022-2024_Cleaned.csv
   FTIR DB: ✅ spartan_ftir_hips.db
🧹 Enhanced setup with PKL cleaning and FTIR CSV capabilities loaded
✅ Configuration and setup complete!


In [None]:
# %%
# Cell 2: Import and Initialize Dual Dataset Processor
from data.processors.dual_dataset_pipeline import DualDatasetProcessor, run_dual_dataset_processing

# 🎛️ Configuration: Toggle Ethiopia fix here
APPLY_ETHIOPIA_FIX = True  # True enables the Ethiopia pneumatic pump fix; False skips it

# Banner: states the run mode and the two datasets the pipeline produces.
banner_lines = [
    f"🚀 DUAL-DATASET PROCESSING {'WITH' if APPLY_ETHIOPIA_FIX else 'WITHOUT'} Ethiopia Fix",
    "=" * 70,
    "",
    "📊 This pipeline will create two complementary datasets:",
    "   1. High-resolution: All periods, full temporal resolution + DEMA",
    "   2. FTIR-matched: Matched periods only, 9am-to-9am averaged + DEMA",
    "",
]
print(*banner_lines, sep="\n")

# Build the processor from the notebook config and the enhanced setup object.
processor = DualDatasetProcessor(config, setup)
print("✅ Processor initialized")

In [None]:
# %%
# Cell 3: Run the Dual Dataset Processing
# This cell performs all processing steps and creates both datasets

# Execute the whole pipeline in one call; results are also written to 'processed_data'.
datasets = processor.process_dual_datasets(
    output_dir='processed_data',
    save_outputs=True,
    apply_ethiopia_fix=APPLY_ETHIOPIA_FIX,
)

# Pull the individual frames out of the result mapping.
# Inputs and intermediates first...
raw_data = datasets['raw_data']
cleaned_data = datasets['cleaned_data']
ftir_data = datasets['ftir_data']
# ...then the two final products used by the cells below.
high_resolution_data = datasets['high_resolution']
ftir_matched_data = datasets['ftir_matched']

print("\n🎉 Processing Complete!")
print(f"📈 High-resolution data: {high_resolution_data.shape}")
# An empty FTIR-matched frame is reported as text instead of a shape.
print(f"🔗 FTIR-matched data: {ftir_matched_data.shape if len(ftir_matched_data) > 0 else 'No matching periods found'}")

In [None]:
# %%
# Cell 4: Verify Processing Results
print("📊 PROCESSING VERIFICATION")
print("=" * 60)

# --- High-resolution dataset: size and time span ---
print("\n📈 High-Resolution Dataset:")
print(f"  Shape: {high_resolution_data.shape}")
print(f"  Date range: {high_resolution_data['datetime_local'].min()} to {high_resolution_data['datetime_local'].max()}")

# Columns whose name contains 'smoothed' are counted as DEMA output.
smoothed_cols = [name for name in high_resolution_data.columns if 'smoothed' in name]
print(f"  DEMA smoothed columns: {len(smoothed_cols)}")
if smoothed_cols:
    print(f"    Examples: {', '.join(smoothed_cols[:3])}...")

# When the fix is enabled, count columns carrying any of the correction markers.
if APPLY_ETHIOPIA_FIX:
    correction_markers = ['corrected', 'manual', 'optimized']
    ethiopia_cols = [
        name
        for name in high_resolution_data.columns
        if any(marker in name for marker in correction_markers)
    ]
    print(f"  Ethiopia corrections: {len(ethiopia_cols)} columns")

# --- FTIR-matched dataset: handle the empty case first ---
if len(ftir_matched_data) == 0:
    print("\n⚠️ No FTIR-matched data available (no overlapping periods found)")
else:
    print("\n🔗 FTIR-Matched Dataset:")
    print(f"  Shape: {ftir_matched_data.shape}")
    print(f"  Date range: {ftir_matched_data.index.min()} to {ftir_matched_data.index.max()}")

    # Case-insensitive search for FTIR-derived columns.
    ftir_cols = [name for name in ftir_matched_data.columns if 'ftir' in name.lower()]
    print(f"  FTIR columns: {ftir_cols}")

    # Preview only those of the preferred columns that actually exist.
    print("\n  Sample of matched data:")
    display_cols = ['IR BCc', 'Blue BCc', 'ec_ftir', 'oc_ftir']
    available_cols = [name for name in display_cols if name in ftir_matched_data.columns]
    if available_cols:
        print(ftir_matched_data[available_cols].head())

In [None]:
# %%
# Cell 5: Quick Visualization of Both Datasets
import matplotlib.pyplot as plt
import pandas as pd

# Create comparison plots: high-resolution series (top), FTIR-matched daily values (bottom)
HIGH_RES_PLOT_STRIDE = 100  # plot every 100th row so the full-resolution series renders quickly
FTIR_EC_SCALE = 1000        # multiplier applied to ec_ftir before plotting; assumes µg -> ng (TODO confirm units)

fig, axes = plt.subplots(2, 1, figsize=(12, 8), sharex=False)

# Plot 1: High-resolution time series
ax1 = axes[0]
if 'IR BCc' in high_resolution_data.columns:
    # Sample data for faster plotting
    sample_data = high_resolution_data.iloc[::HIGH_RES_PLOT_STRIDE]
    ax1.plot(sample_data['datetime_local'], sample_data['IR BCc'],
             alpha=0.5, label='IR BCc (raw)', linewidth=0.5)

    # Overlay the smoothed series only when the pipeline produced it.
    if 'IR BCc smoothed' in sample_data.columns:
        ax1.plot(sample_data['datetime_local'], sample_data['IR BCc smoothed'],
                 label='IR BCc (DEMA smoothed)', linewidth=1)

    ax1.set_title('High-Resolution Dataset: IR Black Carbon Time Series')
    ax1.set_ylabel('BC (ng/m³)')
    ax1.legend()
    ax1.grid(True, alpha=0.3)
else:
    # Mirror the bottom panel's fallback so a missing column does not leave
    # a blank, untitled axes in the figure.
    ax1.text(0.5, 0.5, "No 'IR BCc' column in high-resolution data",
             transform=ax1.transAxes, ha='center', va='center')
    ax1.set_title('High-Resolution Dataset: No Data')

# Plot 2: FTIR-matched daily averages
ax2 = axes[1]
if len(ftir_matched_data) > 0 and 'IR BCc' in ftir_matched_data.columns:
    ax2.scatter(ftir_matched_data.index, ftir_matched_data['IR BCc'],
                alpha=0.6, label='Aethalometer (9am-9am avg)', s=30)

    if 'ec_ftir' in ftir_matched_data.columns:
        # Scale FTIR EC so it shares the y-axis with the aethalometer values.
        ax2.scatter(ftir_matched_data.index, ftir_matched_data['ec_ftir'] * FTIR_EC_SCALE,
                    alpha=0.6, label='FTIR EC', marker='s', s=30)

    ax2.set_title('FTIR-Matched Dataset: Daily Averages Comparison')
    ax2.set_xlabel('Date')
    ax2.set_ylabel('Carbon (ng/m³)')
    ax2.legend()
    ax2.grid(True, alpha=0.3)
else:
    ax2.text(0.5, 0.5, 'No FTIR-matched data available',
             transform=ax2.transAxes, ha='center', va='center')
    ax2.set_title('FTIR-Matched Dataset: No Data')

plt.tight_layout()
plt.show()

# Summary statistics (computed on the full frames, not the plotting sample)
print("\n📊 SUMMARY STATISTICS")
print("=" * 40)

if 'IR BCc' in high_resolution_data.columns:
    print("High-Resolution IR BCc:")
    print(f"  Mean: {high_resolution_data['IR BCc'].mean():.1f} ng/m³")
    print(f"  Std: {high_resolution_data['IR BCc'].std():.1f} ng/m³")
    print(f"  Valid points: {high_resolution_data['IR BCc'].notna().sum():,}")

if len(ftir_matched_data) > 0 and 'IR BCc' in ftir_matched_data.columns:
    print("\nFTIR-Matched IR BCc (daily):")
    print(f"  Mean: {ftir_matched_data['IR BCc'].mean():.1f} ng/m³")
    print(f"  Std: {ftir_matched_data['IR BCc'].std():.1f} ng/m³")
    print(f"  Matched days: {len(ftir_matched_data)}")

In [None]:
# %%
# Cell 6: Export Summary and Next Steps
# Header and the high-resolution file names (site code comes from config).
print("📦 EXPORTED FILES")
print("=" * 50)
print("\nThe following files have been saved to 'processed_data/' directory:")
print("\n📈 High-Resolution Dataset:")
print(f"  • aethalometer_high_resolution_{config.site_code}.pkl")
print(f"  • aethalometer_high_resolution_{config.site_code}.csv")

# FTIR-matched files are listed only when the matched frame is non-empty.
if len(ftir_matched_data) > 0:
    print("\n🔗 FTIR-Matched Dataset:")
    print(f"  • aethalometer_ftir_matched_{config.site_code}.pkl")
    print(f"  • aethalometer_ftir_matched_{config.site_code}.csv")

# Static guidance text, emitted one entry per line.
next_steps_lines = [
    "\n🚀 NEXT STEPS",
    "=" * 50,
    "\n1. Use high-resolution data for:",
    "   • Time series analysis",
    "   • Diurnal patterns",
    "   • Event detection",
    "   • Source apportionment",
    "\n2. Use FTIR-matched data for:",
    "   • Method comparison (Aethalometer vs FTIR)",
    "   • Calibration development",
    "   • Bias assessment",
    "   • Long-term trends",
]
print(*next_steps_lines, sep="\n")

tips_lines = [
    "\n💡 Tips:",
    "  • Both datasets include DEMA-smoothed columns",
    "  • High-res data preserves full temporal resolution",
    "  • FTIR-matched uses 9am-to-9am daily averages",
]
# The Ethiopia note is appended only when the fix was enabled for this run.
if APPLY_ETHIOPIA_FIX:
    tips_lines.append("  • Ethiopia corrections have been applied to both datasets")
print(*tips_lines, sep="\n")

In [None]:
# %%
# Cell 7: Optional - Load and Use the Saved Datasets
# This shows how to load the pkl files in future sessions

import pandas as pd
import os

# Define paths to saved files
high_res_path = f"processed_data/aethalometer_high_resolution_{config.site_code}.pkl"
ftir_matched_path = f"processed_data/aethalometer_ftir_matched_{config.site_code}.pkl"

print("📁 Loading saved datasets...")

# Start from None so a missing file leaves a well-defined value rather than an
# undefined name, or a stale DataFrame left over from an earlier run of this cell.
high_res_loaded = None
ftir_matched_loaded = None

# NOTE(security): unpickling can execute arbitrary code. Only load pickle
# files that this pipeline wrote itself.

# Load high-resolution data
if os.path.exists(high_res_path):
    high_res_loaded = pd.read_pickle(high_res_path)
    print(f"✅ Loaded high-resolution data: {high_res_loaded.shape}")
else:
    print(f"❌ High-resolution file not found: {high_res_path}")

# Load FTIR-matched data
if os.path.exists(ftir_matched_path):
    ftir_matched_loaded = pd.read_pickle(ftir_matched_path)
    print(f"✅ Loaded FTIR-matched data: {ftir_matched_loaded.shape}")
else:
    print(f"❌ FTIR-matched file not found: {ftir_matched_path}")

# Only announce success when at least one dataset was actually loaded.
if high_res_loaded is None and ftir_matched_loaded is None:
    print("\n⚠️ No datasets were loaded; run the processing cell with save_outputs=True first.")
else:
    print("\n💡 You can now use these loaded datasets for your analysis!")