# Dual Dataset Pipeline with FTIR CSV Data

This notebook implements the dual-dataset processing pipeline using the FTIR CSV file instead of the database.
It creates:
1. **High-resolution dataset**: All aethalometer data with DEMA smoothing
2. **FTIR-matched dataset**: Only matching periods with 9am-to-9am averaging and FTIR CSV data

In [None]:
# %%
# Cell 1: Setup and Configuration
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(os.getcwd()), 'src'))

import pandas as pd
import numpy as np
from pathlib import Path
from typing import Optional
import warnings
warnings.filterwarnings('ignore')

from config.notebook_config import NotebookConfig
from notebook_utils.pkl_cleaning_integration import create_enhanced_setup
from data.processors.dual_dataset_pipeline import DualDatasetProcessor

# Configuration
# Build the run configuration via the project-local NotebookConfig.
# NOTE(review): the meaning and units of each keyword (quality_threshold,
# outlier_threshold, ...) are defined by NotebookConfig, not in this notebook —
# confirm there before changing values.
config = NotebookConfig(
    site_code='ETAD',
    wavelength='Red',
    quality_threshold=10,
    output_format='jpl',
    min_samples_for_analysis=30,
    confidence_level=0.95,
    outlier_threshold=3.0,
    figure_size=(12, 8),
    font_size=10,
    dpi=300
)

# Add timezone to config for proper 9am-to-9am handling
# (attached as an attribute after construction; presumably read by the
# processing pipeline — TODO confirm where it is consumed)
config.timezone = 'Africa/Addis_Ababa'

# Set your data paths
# NOTE: machine-specific absolute path — edit for your own environment.
base_data_path = "/Users/ahzs645/Library/CloudStorage/GoogleDrive-ahzs645@gmail.com/My Drive/University/Research/Grad/UC Davis Ann/NASA MAIA/Data"

# Aethalometer inputs, keyed by format ('pkl_data' / 'csv_data'); both paths
# are built relative to base_data_path.
config.aethalometer_files = {
    'pkl_data': os.path.join(
        base_data_path,
        "Aethelometry Data/Kyan Data/Mergedcleaned and uncleaned MA350 data20250707030704",
        "df_uncleaned_Jacros_API_and_OG.pkl"
    ),
    'csv_data': os.path.join(
        base_data_path,
        "Aethelometry Data/Raw",
        "Jacros_MA350_1-min_2022-2024_Cleaned.csv"
    )
}

# FTIR CSV path
# NOTE: machine-specific absolute path; used later to construct the FTIR CSV loader.
FTIR_CSV_PATH = "/Users/ahzs645/Github/aethmodular-clean/Four_Sites_FTIR_data.v2.csv"

# Create enhanced setup
# The returned object is later passed to the dual-dataset processor.
setup = create_enhanced_setup(config)

print("✅ Configuration and setup complete!")
print(f"📄 FTIR CSV path: {FTIR_CSV_PATH}")

In [None]:
# %%
# Cell 2: FTIR CSV Loader Class
class FTIRCSVLoader:
    """Load long-format FTIR measurements from a CSV file and reshape them per site.

    The CSV must provide the columns ``Site``, ``SampleDate``, ``Parameter``
    and ``Concentration_ug_m3`` (one row per site / date / parameter).

    The file is parsed once per loader instance and reused by every method;
    create a new loader to pick up changes made to the file on disk.
    """

    def __init__(self, csv_path: str):
        """Remember the CSV location.

        Args:
            csv_path: Path to the FTIR CSV file.

        Raises:
            FileNotFoundError: If ``csv_path`` does not exist.
        """
        self.csv_path = Path(csv_path)
        if not self.csv_path.exists():
            raise FileNotFoundError(f"FTIR CSV file not found: {self.csv_path}")
        # Parsed lazily on first use (see _read_csv).
        self._raw_data = None

    def _read_csv(self) -> pd.DataFrame:
        """Return the full CSV as a DataFrame, parsing the file only once."""
        # getattr keeps this working even if __init__ was bypassed.
        cached = getattr(self, '_raw_data', None)
        if cached is None:
            cached = pd.read_csv(self.csv_path)
            self._raw_data = cached
        return cached

    def load_site_data(self, site_code: str, parameters=None) -> pd.DataFrame:
        """Load FTIR data for a specific site as one row per sample date.

        Args:
            site_code: Value to match against the ``Site`` column.
            parameters: Optional parameter name or iterable of names to keep.
                ``None`` or an empty iterable keeps every parameter.

        Returns:
            DataFrame with a ``sample_date`` column, one column per parameter
            (``EC_ftir`` / ``OC_ftir`` are renamed to ``ec_ftir`` / ``oc_ftir``)
            and a constant ``site_code`` column. Multiple measurements of the
            same parameter on the same date are averaged.

        Raises:
            ValueError: If the CSV holds no rows for ``site_code``.
        """
        print(f"📊 Loading FTIR data for site {site_code}...")

        df = self._read_csv()

        # Filter by site; copy so the cached frame is never mutated.
        site_data = df[df['Site'] == site_code].copy()

        if site_data.empty:
            available_sites = df['Site'].dropna().unique()
            raise ValueError(f"No data found for site '{site_code}'. Available sites: {list(available_sites)}")

        # Convert sample date to datetime
        site_data['SampleDate'] = pd.to_datetime(site_data['SampleDate'])

        # Filter parameters if specified. A bare string is treated as a single
        # parameter name; explicit None/length checks avoid the ambiguous
        # truth value of array-like inputs.
        if parameters is not None:
            wanted = [parameters] if isinstance(parameters, str) else list(parameters)
            if wanted:
                site_data = site_data[site_data['Parameter'].isin(wanted)]

        # Pivot to get parameters as columns
        pivot_data = site_data.pivot_table(
            index='SampleDate',
            columns='Parameter',
            values='Concentration_ug_m3',
            aggfunc='mean'  # Average if multiple measurements per day
        ).reset_index()

        # Drop the leftover 'Parameter' label on the columns axis so the
        # result looks like a plain table when printed, merged or saved.
        pivot_data.columns.name = None

        # Rename columns to match expected format
        pivot_data = pivot_data.rename(columns={
            'SampleDate': 'sample_date',
            'EC_ftir': 'ec_ftir',
            'OC_ftir': 'oc_ftir'
        })

        # Add site_code column
        pivot_data['site_code'] = site_code

        print(f"✅ Loaded {len(pivot_data)} FTIR measurements")
        print(f"📅 Date range: {pivot_data['sample_date'].min()} to {pivot_data['sample_date'].max()}")
        print(f"🧪 Parameters: {[col for col in pivot_data.columns if col not in ['sample_date', 'site_code']]}")

        return pivot_data

    def get_available_sites(self):
        """Return the sorted list of distinct site codes in the CSV.

        Rows with a missing ``Site`` are ignored; mixing NaN with strings
        would otherwise make ``sorted`` raise ``TypeError``.
        """
        df = self._read_csv()
        return sorted(df['Site'].dropna().unique())

    def get_available_parameters(self, site_code=None):
        """Return the sorted list of distinct parameter names.

        Args:
            site_code: When given (and truthy), only rows of that site are
                considered; otherwise the whole file is used.
        """
        df = self._read_csv()
        if site_code:
            df = df[df['Site'] == site_code]
        # Ignore rows with a missing parameter name so sorting cannot fail.
        return sorted(df['Parameter'].dropna().unique())

# Build the loader for the configured FTIR CSV, then report what the file contains.
ftir_loader = FTIRCSVLoader(FTIR_CSV_PATH)

print("✅ FTIR CSV loader initialized")
for label, lookup in (
    ("Available sites:", ftir_loader.get_available_sites),
    ("Available parameters:", ftir_loader.get_available_parameters),
):
    print(label, lookup())

In [None]:
# %%
# Cell 3: Modified Dual Dataset Processor to use FTIR CSV
class DualDatasetProcessorWithCSV(DualDatasetProcessor):
    """DualDatasetProcessor variant that can take its FTIR data from a CSV loader.

    When no loader is supplied, or loading from the CSV fails, FTIR loading
    is delegated to the parent class implementation.
    """

    def __init__(self, config, setup=None, ftir_csv_loader=None):
        """Initialise the parent processor and keep the optional CSV loader."""
        super().__init__(config, setup)
        self.ftir_csv_loader = ftir_csv_loader

    def _load_ftir_data(self) -> pd.DataFrame:
        """Return FTIR samples for ``self.config.site_code``, preferring the CSV loader."""
        loader = self.ftir_csv_loader

        if loader is not None:
            try:
                csv_frame = loader.load_site_data(self.config.site_code)

                # Make sure the date column really is datetime-typed.
                csv_frame['sample_date'] = pd.to_datetime(csv_frame['sample_date'])

                print(f"✅ Loaded FTIR data from CSV: {len(csv_frame)} samples")
                sample_dates = csv_frame['sample_date']
                print(f"📅 FTIR date range: {sample_dates.min()} to {sample_dates.max()}")

                return csv_frame

            except Exception as exc:
                # Best-effort: any CSV problem falls back to the parent loader.
                print(f"❌ Could not load FTIR data from CSV: {exc}")
                print("Falling back to database loading...")
                return super()._load_ftir_data()

        # No CSV loader configured: use the parent implementation directly.
        return super()._load_ftir_data()

# 🎛️ Configuration: Toggle Ethiopia fix here
# True enables the Ethiopia pneumatic pump fix; False runs without it.
APPLY_ETHIOPIA_FIX = True

fix_mode = 'WITH' if APPLY_ETHIOPIA_FIX else 'WITHOUT'
print(f"🚀 DUAL-DATASET PROCESSING {fix_mode} Ethiopia Fix")
print("=" * 70)
print("📄 Using FTIR CSV data instead of database")

# Hand the CSV loader to the processor so FTIR data is read from the CSV.
processor = DualDatasetProcessorWithCSV(
    config,
    setup,
    ftir_csv_loader=ftir_loader,
)
print("✅ Processor initialized with FTIR CSV support")

In [None]:
# %%
# Cell 4: Run the Dual Dataset Processing
# One call runs every processing step and returns all resulting datasets.

processing_options = {
    'apply_ethiopia_fix': APPLY_ETHIOPIA_FIX,
    'save_outputs': True,
    'output_dir': 'processed_data_csv',
}
datasets = processor.process_dual_datasets(**processing_options)

# Pull the individual datasets out of the result for use in later cells.
high_resolution_data = datasets['high_resolution']
ftir_matched_data = datasets['ftir_matched']
raw_data = datasets['raw_data']
cleaned_data = datasets['cleaned_data']
ftir_data = datasets['ftir_data']

print("\n🎉 Processing Complete!")
print(f"📈 High-resolution data: {high_resolution_data.shape}")
if len(ftir_matched_data) > 0:
    print(f"🔗 FTIR-matched data: {ftir_matched_data.shape}")
else:
    print(f"🔗 FTIR-matched data: {'No matching periods found'}")

In [None]:
# %%
# Cell 5: Verify FTIR CSV Integration
# Prints a summary of the loaded FTIR data and of the FTIR-matched dataset.
print("📊 FTIR CSV INTEGRATION VERIFICATION")
print("=" * 60)

if 'ftir_data' in datasets and datasets['ftir_data'] is not None:
    ftir_df = datasets['ftir_data']
    print("\n🧪 FTIR Data Summary:")
    print(f"  Total samples: {len(ftir_df)}")
    print(f"  Date range: {ftir_df['sample_date'].min()} to {ftir_df['sample_date'].max()}")
    print(f"  Site code: {ftir_df['site_code'].unique()}")

    # Everything except the bookkeeping columns is an FTIR parameter.
    ftir_params = [name for name in ftir_df.columns if name not in ('sample_date', 'site_code')]
    print(f"  Parameters: {ftir_params}")

    # EC / OC value ranges and their ratio, when both columns are present.
    if {'ec_ftir', 'oc_ftir'}.issubset(ftir_df.columns):
        ec_values = ftir_df['ec_ftir']
        oc_values = ftir_df['oc_ftir']
        print(f"\n  EC range: {ec_values.min():.2f} - {ec_values.max():.2f} µg/m³")
        print(f"  OC range: {oc_values.min():.2f} - {oc_values.max():.2f} µg/m³")
        ec_oc_ratio = ec_values / oc_values
        print(f"  EC/OC ratio: {ec_oc_ratio.mean():.2f} ± {ec_oc_ratio.std():.2f}")

# Check FTIR-matched dataset
if len(ftir_matched_data) == 0:
    for message in (
        "\n⚠️ No FTIR-matched data available",
        "  This could mean:",
        "  1. No overlapping dates between aethalometer and FTIR data",
        "  2. Different date formats or timezone issues",
        "  3. Site code mismatch",
    ):
        print(message)
else:
    print("\n🔗 FTIR-Matched Dataset Verification:")
    print(f"  Shape: {ftir_matched_data.shape}")
    print(f"  Date range: {ftir_matched_data.index.min()} to {ftir_matched_data.index.max()}")

    # Columns of the merged frame whose name mentions FTIR (case-insensitive).
    ftir_cols_in_merged = [name for name in ftir_matched_data.columns if 'ftir' in name.lower()]
    print(f"  FTIR columns in merged data: {ftir_cols_in_merged}")

    # Only an index exposing `.hour` can be checked for 9am alignment.
    if hasattr(ftir_matched_data.index, 'hour'):
        unique_hours = ftir_matched_data.index.hour.unique()
        print(f"  Timestamp hours: {unique_hours} (should be [9] for 9am alignment)")

    # Preview whichever of the key columns exist.
    print("\n  Sample of matched data (first 5 rows):")
    display_cols = ['IR BCc', 'Blue BCc', 'ec_ftir', 'oc_ftir']
    available_cols = [name for name in display_cols if name in ftir_matched_data.columns]
    if available_cols:
        print(ftir_matched_data[available_cols].head())

In [None]:
# %%
# Cell 6: Analysis - BC vs FTIR EC Comparison
# Regresses FTIR EC against aethalometer BC on the FTIR-matched dataset and
# plots a scatter (with fit and 1:1 line) next to a time series of both.
import matplotlib.pyplot as plt
from scipy import stats

if len(ftir_matched_data) > 0 and 'ec_ftir' in ftir_matched_data.columns:
    print("📊 AETHALOMETER BC vs FTIR EC ANALYSIS")
    print("=" * 50)

    # Find BC column (prefer corrected if Ethiopia fix was applied)
    bc_cols = [col for col in ftir_matched_data.columns if 'BCc' in col]
    bc_corrected_cols = [col for col in bc_cols if 'corrected' in col]

    # The fallback passed to next() is evaluated eagerly, so each `[0]`
    # lookup is only reached once its list is known to be non-empty.
    bc_col = None
    if APPLY_ETHIOPIA_FIX and bc_corrected_cols:
        # Use Ethiopia-corrected BC
        bc_col = next((col for col in bc_corrected_cols if 'IR' in col), bc_corrected_cols[0])
        bc_type = "Ethiopia-corrected"
    elif bc_cols:
        # Use original BC
        bc_col = next((col for col in bc_cols if 'IR' in col and 'corrected' not in col), bc_cols[0])
        bc_type = "Original"

    if bc_col is None:
        print("❌ No aethalometer BC column (name containing 'BCc') found in FTIR-matched data")
    else:
        print(f"Using {bc_type} BC column: {bc_col}")

        # Prepare data: keep only timestamps where both series have a value
        bc_data = ftir_matched_data[bc_col].dropna()
        ec_data = ftir_matched_data['ec_ftir'].dropna()
        common_idx = bc_data.index.intersection(ec_data.index)

        if len(common_idx) > 3:
            # Convert BC from ng/m³ to µg/m³
            x = bc_data.loc[common_idx] / 1000
            y = ec_data.loc[common_idx]

            # Calculate regression
            slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
            r_squared = r_value ** 2

            # Create comparison plot
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

            # Scatter plot
            ax1.scatter(x, y, alpha=0.7, s=60, edgecolor='black', linewidth=0.5)

            # Add regression line
            line_x = np.linspace(x.min(), x.max(), 100)
            line_y = slope * line_x + intercept
            ax1.plot(line_x, line_y, 'r--', alpha=0.8, linewidth=2,
                     label=f'y = {slope:.3f}x + {intercept:.3f}')

            # 1:1 line spanning the combined range of both axes
            max_val = max(x.max(), y.max())
            min_val = min(x.min(), y.min())
            ax1.plot([min_val, max_val], [min_val, max_val], 'k:', alpha=0.5, label='1:1 line')

            ax1.set_xlabel(f'Aethalometer {bc_type} BC (µg/m³)', fontsize=12)
            ax1.set_ylabel('FTIR EC (µg/m³)', fontsize=12)
            ax1.set_title(f'BC vs EC Comparison\nSite: {config.site_code}, n={len(x)}', fontsize=14)
            ax1.legend()
            ax1.grid(True, alpha=0.3)

            # Add statistics
            stats_text = (f'R² = {r_squared:.3f}\n'
                          f'Slope = {slope:.3f}\n'
                          f'Intercept = {intercept:.3f}\n'
                          f'p-value = {p_value:.2e}')
            ax1.text(0.05, 0.95, stats_text, transform=ax1.transAxes,
                     bbox=dict(boxstyle='round', facecolor='white', alpha=0.8),
                     verticalalignment='top', fontsize=10)

            # Time series
            ax2.plot(x.index, x.values, 'b-', label=f'{bc_type} BC', alpha=0.8, linewidth=2)
            ax2.plot(y.index, y.values, 'r-', label='FTIR EC', alpha=0.8, linewidth=2)
            ax2.set_xlabel('Date', fontsize=12)
            ax2.set_ylabel('Concentration (µg/m³)', fontsize=12)
            ax2.set_title('Time Series Comparison (9am-9am daily averages)', fontsize=14)
            ax2.legend()
            ax2.grid(True, alpha=0.3)
            ax2.tick_params(axis='x', rotation=45)

            plt.tight_layout()
            plt.show()

            # Summary statistics
            print("\n📈 ANALYSIS RESULTS:")
            print(f"  Data points: {len(x)}")
            print(f"  R² = {r_squared:.3f}")
            print(f"  Correlation (r) = {r_value:.3f}")
            print(f"  Slope = {slope:.3f}")
            print(f"  Intercept = {intercept:.3f} µg/m³")
            print(f"  p-value = {p_value:.2e}")

            if r_squared > 0.7:
                print("  🎉 Excellent agreement between methods!")
            elif r_squared > 0.5:
                print("  👍 Good agreement between methods")
            else:
                print("  ⚠️ Moderate to poor agreement - investigate further")

        else:
            print(f"⚠️ Insufficient data for analysis ({len(common_idx)} points)")
else:
    print("❌ No FTIR-matched data available for analysis")

In [None]:
# %%
# Cell 7: Export Summary
# Lists the output files for both datasets and suggests follow-up steps.
print("📦 EXPORT SUMMARY")
print("=" * 50)
print("\nProcessed data saved to 'processed_data_csv/' directory:")

print("\n📈 High-Resolution Dataset:")
for extension in ('pkl', 'csv'):
    print(f"  • aethalometer_high_resolution_{config.site_code}.{extension}")
print(f"  • Shape: {high_resolution_data.shape}")
local_times = high_resolution_data['datetime_local']
print(f"  • Date range: {local_times.min()} to {local_times.max()}")

if len(ftir_matched_data) > 0:
    print("\n🔗 FTIR-Matched Dataset:")
    for extension in ('pkl', 'csv'):
        print(f"  • aethalometer_ftir_matched_{config.site_code}.{extension}")
    print(f"  • Shape: {ftir_matched_data.shape}")
    print(f"  • Date range: {ftir_matched_data.index.min()} to {ftir_matched_data.index.max()}")
    print(f"  • Matched samples: {len(ftir_matched_data)}")

print("\n🔍 KEY DIFFERENCES FROM DATABASE VERSION:")
print(f"  📄 FTIR data loaded from CSV: {FTIR_CSV_PATH}")
print(f"  🏷️ Site codes in CSV: {ftir_loader.get_available_sites()}")
print(f"  📊 Parameters available: {ftir_loader.get_available_parameters()}")

for line in (
    "\n💡 NEXT STEPS:",
    "  1. Use high-resolution data for temporal analysis",
    "  2. Use FTIR-matched data for method comparison",
    "  3. Apply additional corrections if needed",
    "  4. Analyze other sites by changing config.site_code",
):
    print(line)

# Point out every other site in the CSV that could be processed the same way.
print("\n🌍 To process other sites:")
for site in ftir_loader.get_available_sites():
    if site == config.site_code:
        continue
    print(f"  • Change config.site_code = '{site}' and rerun")

In [None]:
# %%
# Cell 8: Quick Multi-Site Comparison (Optional)
# Summarises the FTIR data of every site in the CSV and, when more than one
# site loaded, plots mean EC and OC per site as grouped bars.
import matplotlib.pyplot as plt  # imported here so the cell does not depend on Cell 6

print("🌍 MULTI-SITE FTIR DATA OVERVIEW")
print("=" * 50)

# Load and summarize FTIR data for all sites
site_summaries = {}

for site in ftir_loader.get_available_sites():
    try:
        site_ftir = ftir_loader.load_site_data(site)
        site_summaries[site] = {
            'count': len(site_ftir),
            'date_range': (site_ftir['sample_date'].min(), site_ftir['sample_date'].max()),
            'ec_mean': site_ftir['ec_ftir'].mean() if 'ec_ftir' in site_ftir.columns else None,
            'oc_mean': site_ftir['oc_ftir'].mean() if 'oc_ftir' in site_ftir.columns else None
        }
    except Exception as e:
        # Best-effort overview: report the failing site and keep going.
        print(f"⚠️ Error loading {site}: {e}")

# Display summary
for site, summary in site_summaries.items():
    print(f"\n📍 {site}:")
    print(f"  Samples: {summary['count']}")
    first_date, last_date = summary['date_range']
    if pd.isna(first_date) or pd.isna(last_date):
        # min()/max() of an empty date column is NaT, which cannot be formatted.
        print("  Date range: n/a")
    else:
        print(f"  Date range: {first_date.strftime('%Y-%m-%d')} to {last_date.strftime('%Y-%m-%d')}")
    # pd.notna rejects both None (column absent) and NaN (no valid values),
    # while still reporting a legitimate mean of 0.0.
    if pd.notna(summary['ec_mean']):
        print(f"  Mean EC: {summary['ec_mean']:.2f} µg/m³")
    if pd.notna(summary['oc_mean']):
        print(f"  Mean OC: {summary['oc_mean']:.2f} µg/m³")

# Create comparison plot if multiple sites have data
if len(site_summaries) > 1:
    fig, ax = plt.subplots(figsize=(10, 6))

    sites = list(site_summaries.keys())
    # Keep exactly one value per site (NaN where missing) so the bar heights
    # always line up with the x positions; dropping missing entries would make
    # the arrays shorter than `x` and ax.bar would raise a shape mismatch.
    ec_means = np.array([site_summaries[s]['ec_mean'] for s in sites], dtype=float)
    oc_means = np.array([site_summaries[s]['oc_mean'] for s in sites], dtype=float)

    x = np.arange(len(sites))
    width = 0.35

    # Only draw a series when at least one site has a value for it.
    if not np.isnan(ec_means).all():
        ax.bar(x - width/2, ec_means, width, label='EC', alpha=0.8)
    if not np.isnan(oc_means).all():
        ax.bar(x + width/2, oc_means, width, label='OC', alpha=0.8)

    ax.set_xlabel('Site', fontsize=12)
    ax.set_ylabel('Mean Concentration (µg/m³)', fontsize=12)
    ax.set_title('FTIR EC/OC Comparison Across Sites', fontsize=14)
    ax.set_xticks(x)
    ax.set_xticklabels(sites)
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    plt.show()

print("\n✅ Multi-site overview complete!")
print("💡 Each site can be processed individually by updating config.site_code")