# %%
# Setup and imports
import sys
import os
import pandas as pd
import numpy as np
sys.path.insert(0, os.path.join(os.path.dirname(os.getcwd()), 'src'))

# Import enhanced setup with FTIR CSV capabilities
from data.qc.enhanced_pkl_processing import process_pkl_data_enhanced
from config.notebook_config import NotebookConfig
from notebook_utils.pkl_cleaning_integration import create_enhanced_setup

# Announce the start of environment preparation.
print("🚀 Setting up environment...")

# Analysis settings for this notebook, gathered in one mapping and expanded
# into NotebookConfig as keyword arguments.
notebook_settings = {
    'site_code': 'ETAD',
    'wavelength': 'Red',
    'quality_threshold': 10,
    'output_format': 'jpl',
    'min_samples_for_analysis': 30,
    'confidence_level': 0.95,
    'outlier_threshold': 3.0,
    'figure_size': (12, 8),
    'font_size': 10,
    'dpi': 300,
}
config = NotebookConfig(**notebook_settings)

# Root folder of the data files (machine-specific absolute path).
base_data_path = "/Users/ahzs645/Library/CloudStorage/GoogleDrive-ahzs645@gmail.com/My Drive/University/Research/Grad/UC Davis Ann/NASA MAIA/Data"

# Location of the FTIR CSV; attached to the config before the setup object is built.
config.ftir_csv_path = "/Users/ahzs645/Github/aethmodular-clean/Four_Sites_FTIR_data.v2.csv"

# Build the setup object from the completed configuration.
setup = create_enhanced_setup(config)

print("✅ Setup complete!")
print(f"📁 FTIR CSV path: {config.ftir_csv_path}")

In [None]:
# %%
# Load the already processed Ethiopia data
import pandas as pd

print("📁 Loading pre-processed Ethiopia aethalometer data...")

# Primary location: relative to the current working directory
# (expected to be the notebook's own directory).
ethiopia_pkl_path = 'pkl_data_cleaned_ethiopia.pkl'

# Fallback locations, consulted only when the primary path is missing.
# The primary path itself and its cwd-joined form are intentionally not
# repeated here: both resolve to the file that was just found missing.
alternative_paths = [
    '../pkl_data_cleaned_ethiopia.pkl',
    './notebooks/pkl_data_cleaned_ethiopia.pkl',
]

# Resolve which file to load first, so that loading and reporting happen in
# exactly one place regardless of where the file was found.
resolved_pkl_path = ethiopia_pkl_path if os.path.exists(ethiopia_pkl_path) else None

if resolved_pkl_path is None:
    print(f"❌ File not found: {ethiopia_pkl_path}")
    print("📍 Current working directory:", os.getcwd())
    print("\n🔍 Looking for the file in alternative locations...")

    resolved_pkl_path = next(
        (alt_path for alt_path in alternative_paths if os.path.exists(alt_path)),
        None,
    )
    if resolved_pkl_path is not None:
        print(f"✅ Found file at: {resolved_pkl_path}")

if resolved_pkl_path is None:
    # Best-effort cell: report the problem instead of raising, as before.
    print("❌ Could not find pkl_data_cleaned_ethiopia.pkl in any expected location")
    print("\n💡 Please ensure you have run the processing with APPLY_ETHIOPIA_FIX=True first")
else:
    # NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
    pkl_data_cleaned = pd.read_pickle(resolved_pkl_path)
    print(f"✅ Loaded Ethiopia-processed data: {pkl_data_cleaned.shape}")
    print(f"📅 Date range: {pkl_data_cleaned['datetime_local'].min()} to {pkl_data_cleaned['datetime_local'].max()}")

    # Count columns whose names carry one of the correction markers.
    correction_markers = ('corrected', 'manual', 'optimized')
    ethiopia_cols = [
        col for col in pkl_data_cleaned.columns
        if any(marker in col for marker in correction_markers)
    ]
    print(f"🔧 Ethiopia correction columns found: {len(ethiopia_cols)}")

    # Store in setup for easy access
    setup.datasets['ethiopia_processed'] = pkl_data_cleaned

In [None]:
# %%
# Create 9am-to-9am daily averages for FTIR merging
print("📊 Creating 9am-to-9am daily averaged aethalometer data for FTIR merge...")

# Ensure datetime_local is a datetime dtype (this updates pkl_data_cleaned in place).
pkl_data_cleaned['datetime_local'] = pd.to_datetime(pkl_data_cleaned['datetime_local'])

# Set datetime_local as index for resampling
df_indexed = pkl_data_cleaned.set_index('datetime_local')

# Normalise the index to the site's timezone.
target_tz = 'Africa/Addis_Ababa'
# Capture the timezone BEFORE converting so the message reports the real source zone.
source_tz = df_indexed.index.tz

if source_tz is None:
    # Naive timestamps are taken to already be local wall-clock time.
    df_indexed.index = df_indexed.index.tz_localize(target_tz)
    print("🌍 Localized timezone to Africa/Addis_Ababa")
elif str(source_tz) != target_tz:
    # str(tz) yields the zone key for both pytz and zoneinfo objects, whereas
    # the `.zone` attribute exists only on pytz timezones.
    df_indexed.index = df_indexed.index.tz_convert(target_tz)
    print(f"🌍 Converted timezone from {source_tz} to Africa/Addis_Ababa")
# Define the 9am-to-9am resampling function
def resample_9am_to_9am(df, min_coverage=0.5, expected_points=24):
    """
    Average data over windows running from 9am to 9am the next day.

    Args:
        df (pd.DataFrame): DataFrame with a datetime index.
        min_coverage (float): Minimum fraction of ``expected_points`` that must
            be non-null within a window for a column's mean to be kept
            (0.5 = 50%). Windows below this fraction are set to NaN for that
            column.
        expected_points (int | float): Number of samples a complete 24-hour
            window contains. The default of 24 corresponds to hourly data;
            pass e.g. 1440 for 1-minute data.

    Returns:
        pd.DataFrame: One row per window, indexed by the window's 9am start
        timestamp, holding the per-column means.

    Raises:
        ValueError: If ``expected_points`` is not positive.
    """
    if expected_points <= 0:
        raise ValueError("expected_points must be a positive number")

    window_start_offset = pd.Timedelta(hours=9)

    # Shift time back by 9 hours so that 9am becomes midnight; calendar-day
    # bins on the shifted index are then exactly the 9am-to-9am windows.
    df_shifted = df.copy()
    df_shifted.index = df_shifted.index - window_start_offset

    daily = df_shifted.resample('D')
    daily_counts = daily.count()  # non-null samples per window and column
    daily_means = daily.mean()

    # Keep a mean only where enough samples contributed to it. The counts are
    # restricted to the columns that survived the mean aggregation.
    coverage = daily_counts[daily_means.columns] / expected_points
    daily_means = daily_means.where(coverage >= min_coverage)

    # Shift the index forward again so each row is stamped with its 9am start.
    daily_means.index = daily_means.index + window_start_offset

    return daily_means

# Run the 9am-to-9am averaging on the timezone-normalised frame.
print("🕐 Applying 9am-to-9am resampling...")
pkl_data_cleaned_daily = resample_9am_to_9am(df_indexed, min_coverage=0.5)

# Label the index so it keeps the datetime_local name.
pkl_data_cleaned_daily.index.name = 'datetime_local'

first_day = pkl_data_cleaned_daily.index.min()
last_day = pkl_data_cleaned_daily.index.max()
sample_columns = pkl_data_cleaned_daily.columns[:10].tolist()

print(f"✅ Daily averaged data: {pkl_data_cleaned_daily.shape}")
print(f"📅 Date range: {first_day} to {last_day}")
print(f"📊 Sample of available columns: {sample_columns}")

# Coverage report: days on which the corrected IR BCc mean survived masking.
total_days = len(pkl_data_cleaned_daily)
if 'IR BCc_corrected' in pkl_data_cleaned_daily.columns:
    valid_days = pkl_data_cleaned_daily['IR BCc_corrected'].notna().sum()
else:
    valid_days = 0

# Skip the percentage entirely when there are no days to divide by.
if total_days > 0:
    print(f"📈 Data coverage: {valid_days}/{total_days} days with valid data ({valid_days/total_days*100:.1f}%)")

In [None]:
# %%
# NEW: Merge 9am-to-9am aethalometer data with FTIR data
print("🔗 Merging 9am-to-9am aethalometer data with FTIR data...")

# Register the daily frame under the name the merge call refers to.
setup.datasets['daily_aethalometer_9am'] = pkl_data_cleaned_daily

# Automatic merge with FTIR data
try:
    merged_data = setup.merge_with_ftir(
        aethalometer_dataset='daily_aethalometer_9am',
        site_code='ETAD',  # Will automatically handle site name variations
        store_as='aethalometer_ftir_merged_9am'
    )

    daily_points = len(pkl_data_cleaned_daily)
    merged_points = len(merged_data)

    print(f"\n🎉 Merge successful! Final dataset: {merged_data.shape}")
    print("\n📊 Merge statistics:")
    print(f"  🕐 Aethalometer daily points (9am-9am): {daily_points}")
    print(f"  🧪 Merged points with FTIR: {merged_points}")
    # Guard the ratio: with an empty daily frame the division would raise and
    # be reported below as a merge failure even though the merge itself worked.
    if daily_points:
        print(f"  📈 Match rate: {merged_points/daily_points*100:.1f}%")
    else:
        print("  📈 Match rate: n/a (no daily aethalometer points)")

    print("\n📊 Available columns:")
    ftir_cols = [col for col in merged_data.columns if 'ftir' in col.lower()]
    aeth_cols = [col for col in merged_data.columns if any(x in col for x in ['BCc', 'ATN', 'corrected'])]
    print(f"  🔧 Aethalometer columns: {len(aeth_cols)} (including Ethiopia corrections)")
    print(f"  🧪 FTIR columns: {len(ftir_cols)} ({ftir_cols})")

    # Show date overlap
    print("\n📅 Merged data date range:")
    print(f"  From: {merged_data.index.min()}")
    print(f"  To: {merged_data.index.max()}")
    print(f"  Total days: {merged_points}")

except Exception as e:
    # Notebook-level boundary: report the failure and dump diagnostics rather
    # than aborting the cell with a traceback.
    print(f"❌ Error during merge: {e}")
    print("\n🔍 Debugging information:")
    print(f"  Daily data shape: {pkl_data_cleaned_daily.shape}")
    print(f"  Daily data index type: {type(pkl_data_cleaned_daily.index)}")
    print(f"  Has timezone? {pkl_data_cleaned_daily.index.tz is not None}")

    # Try manual exploration
    print("\n🔍 Exploring FTIR data for troubleshooting...")
    setup.explore_ftir_data()

In [None]:
# %%
# Analysis: Compare Ethiopia-corrected aethalometer with FTIR (9am-to-9am aligned)
print("📊 Analysis: Ethiopia-corrected aethalometer vs FTIR comparison (9am-to-9am)")

import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

if 'merged_data' in locals() and 'EC_ftir' in merged_data.columns and 'IR BCc_corrected' in merged_data.columns:
    # Pair the two series: keep only rows where both values are present.
    x = merged_data['IR BCc_corrected'].dropna()
    y = merged_data.loc[x.index, 'EC_ftir'].dropna()

    common_idx = x.index.intersection(y.index)
    x_clean = x.loc[common_idx]
    y_clean = y.loc[common_idx]

    if len(x_clean) > 5:
        # Ordinary least-squares fit of FTIR EC (y) on corrected BCc (x).
        slope, intercept, r_value, p_value, std_err = stats.linregress(x_clean, y_clean)

        # Graph 1: Scatter plot WITHOUT regression line
        fig1, ax1 = plt.subplots(1, 1, figsize=(10, 8))
        ax1.scatter(x_clean, y_clean, alpha=0.7, color='blue', s=50)
        ax1.set_xlabel('Ethiopia-Corrected IR BCc (ng/m³)', fontsize=12)
        ax1.set_ylabel('FTIR EC (µg/m³)', fontsize=12)
        # "\n" must be a real newline escape: a doubled backslash would be
        # drawn on the figure as the two characters backslash + n.
        ax1.set_title(f'Aethalometer vs FTIR (Raw Data)\nSite: ETAD (9am-9am), n={len(x_clean)}', fontsize=14)
        ax1.grid(True, alpha=0.3)

        # Add statistics text without regression line
        stats_text = f'R² = {r_value**2:.3f}\np-value = {p_value:.3e}'
        ax1.text(0.05, 0.95, stats_text, transform=ax1.transAxes,
                 bbox=dict(boxstyle='round', facecolor='white', alpha=0.8),
                 verticalalignment='top', fontsize=11)

        plt.tight_layout()
        plt.show()

        # Graph 2: Scatter plot WITH regression line
        fig2, ax2 = plt.subplots(1, 1, figsize=(10, 8))
        ax2.scatter(x_clean, y_clean, alpha=0.7, color='blue', s=50)

        # Draw the fitted line across the observed x range.
        line_x = np.linspace(x_clean.min(), x_clean.max(), 100)
        line_y = slope * line_x + intercept
        ax2.plot(line_x, line_y, 'r--', alpha=0.8, linewidth=2, label=f'y = {slope:.3f}x + {intercept:.3f}')

        ax2.set_xlabel('Ethiopia-Corrected IR BCc (ng/m³)', fontsize=12)
        ax2.set_ylabel('FTIR EC (µg/m³)', fontsize=12)
        ax2.set_title(f'Aethalometer vs FTIR (With Regression)\nSite: ETAD (9am-9am), n={len(x_clean)}', fontsize=14)

        # Add statistics text with regression info
        stats_text = f'R² = {r_value**2:.3f}\nSlope = {slope:.3f}\nIntercept = {intercept:.3f}\np-value = {p_value:.3e}'
        ax2.text(0.05, 0.95, stats_text, transform=ax2.transAxes,
                 bbox=dict(boxstyle='round', facecolor='white', alpha=0.8),
                 verticalalignment='top', fontsize=11)

        ax2.grid(True, alpha=0.3)
        ax2.legend()

        plt.tight_layout()
        plt.show()

        # Graph 3: Time series comparison
        # NOTE(review): the axis labels above give BCc in ng/m³ and EC in µg/m³;
        # if that holds for the merged frame, the two lines share one axis at
        # scales differing by 1000 — confirm units before interpreting this plot.
        fig3, ax3 = plt.subplots(1, 1, figsize=(14, 6))
        ax3.plot(x_clean.index, x_clean.values, 'b-', label='Ethiopia-Corrected IR BCc', alpha=0.7, linewidth=2)
        ax3.plot(y_clean.index, y_clean.values, 'r-', label='FTIR EC', alpha=0.7, linewidth=2)
        ax3.set_xlabel('Date', fontsize=12)
        ax3.set_ylabel('Concentration', fontsize=12)
        ax3.set_title('Time Series Comparison (9am-9am aligned)', fontsize=14)
        ax3.legend()
        ax3.grid(True, alpha=0.3)
        ax3.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()

        print("\n📈 Correlation Results:")
        print(f"  R² = {r_value**2:.3f}")
        print(f"  Slope = {slope:.3f}")
        print(f"  Intercept = {intercept:.3f}")
        print(f"  p-value = {p_value:.3e}")
        print(f"  Data points = {len(x_clean)}")

        # Qualitative verdict based on fixed R² thresholds.
        if r_value**2 > 0.5:
            print("\n🎉 Excellent correlation! Ethiopia fix is working well.")
        elif r_value**2 > 0.3:
            print("\n👍 Good correlation! Ethiopia fix shows improvement.")
        else:
            print("\n📊 Moderate correlation. Consider data quality checks.")

        # Compare original BCc vs corrected BCc if available
        if 'IR BCc' in merged_data.columns:
            x_orig = merged_data['IR BCc'].dropna()
            # y is already limited to rows with a corrected BCc value, so both
            # fits are evaluated against FTIR on comparable rows.
            common_orig = x_orig.index.intersection(y.index)
            if len(common_orig) > 5:
                x_orig_clean = x_orig.loc[common_orig]
                y_orig_clean = y.loc[common_orig]
                _, _, r_orig, _, _ = stats.linregress(x_orig_clean, y_orig_clean)

                r2_orig = r_orig**2
                r2_gain = r_value**2 - r2_orig

                print("\n🔄 Improvement from Ethiopia Fix:")
                print(f"  Original BCc R² = {r2_orig:.3f}")
                print(f"  Corrected BCc R² = {r_value**2:.3f}")
                # A relative change is undefined when the original R² is 0.
                if r2_orig > 0:
                    print(f"  Improvement = {r2_gain:.3f} ({(r2_gain/r2_orig)*100:+.1f}%)")
                else:
                    print(f"  Improvement = {r2_gain:.3f}")

    else:
        print(f"⚠️ Insufficient overlapping data points ({len(x_clean)}) for analysis")
else:
    print("⚠️ Required data or columns not found for analysis")
    if 'merged_data' not in locals():
        print("  merged_data not created - check merge step")
    else:
        print(f"  Available columns: {list(merged_data.columns)}")

In [None]:
# %%
# Apply Linear Regression Correction based on FTIR relationship
# Imported here so the cell does not depend on another cell having imported them.
import matplotlib.pyplot as plt
from scipy import stats

# Coefficients of the linear correction (corrected = slope * BCc_ug + intercept),
# defined once so the computation and the printed formula cannot drift apart.
REGRESSION_SLOPE = 0.85
REGRESSION_INTERCEPT = -0.17
regression_formula = (
    f"{REGRESSION_SLOPE:g} "
    f"{'-' if REGRESSION_INTERCEPT < 0 else '+'} "
    f"{abs(REGRESSION_INTERCEPT):g}"
)

print(f"🔧 Applying Linear Regression Correction: IR BCc 2 * {regression_formula}")

if 'merged_data' in locals() and 'IR BCc_corrected' in merged_data.columns:
    # Convert from ng/m³ to μg/m³ first (divide by 1000)
    merged_data['IR BCc_corrected_ug'] = merged_data['IR BCc_corrected'] / 1000

    # Apply linear regression correction: slope * x + intercept
    merged_data['IR BCc_regression_corrected'] = (
        merged_data['IR BCc_corrected_ug'] * REGRESSION_SLOPE + REGRESSION_INTERCEPT
    )

    print("✅ Applied corrections:")
    print("  1. Unit conversion: ng/m³ → μg/m³ (÷ 1000)")
    print(f"  2. Linear regression: BCc * {regression_formula}")

    # Compare before and after regression correction
    if 'EC_ftir' in merged_data.columns:
        # Get clean data for comparison
        x_before = merged_data['IR BCc_corrected_ug'].dropna()
        x_after = merged_data['IR BCc_regression_corrected'].dropna()
        y = merged_data['EC_ftir'].dropna()

        # Find common indices
        common_before = x_before.index.intersection(y.index)
        common_after = x_after.index.intersection(y.index)

        if len(common_before) > 5 and len(common_after) > 5:
            # NOTE: R² is unchanged by a linear transform of x with non-zero
            # slope, so the before/after values below are expected to match;
            # the correction shifts scale and offset (compare with the 1:1
            # line), not the strength of the correlation.

            # Before regression correction
            x_before_clean = x_before.loc[common_before]
            y_before_clean = y.loc[common_before]
            _, _, r_before, _, _ = stats.linregress(x_before_clean, y_before_clean)

            # After regression correction
            x_after_clean = x_after.loc[common_after]
            y_after_clean = y.loc[common_after]
            _, _, r_after, _, _ = stats.linregress(x_after_clean, y_after_clean)

            print("\n📊 Regression Correction Impact:")
            print(f"  Before correction R² = {r_before**2:.3f}")
            print(f"  After correction R² = {r_after**2:.3f}")
            print(f"  Improvement = {(r_after**2 - r_before**2):.3f}")

            # Graph 4: Comparison plot showing regression correction effect
            fig4, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

            # Before correction
            ax1.scatter(x_before_clean, y_before_clean, alpha=0.7, color='blue', s=50)
            ax1.set_xlabel('Ethiopia-Corrected IR BCc (μg/m³)', fontsize=12)
            ax1.set_ylabel('FTIR EC (μg/m³)', fontsize=12)
            # "\n" must be a real newline escape: a doubled backslash would be
            # drawn on the figure as the two characters backslash + n.
            ax1.set_title(f'Before Regression Correction\nR² = {r_before**2:.3f}', fontsize=14)
            ax1.grid(True, alpha=0.3)

            # After correction
            ax2.scatter(x_after_clean, y_after_clean, alpha=0.7, color='red', s=50)
            ax2.set_xlabel('Regression-Corrected IR BCc (μg/m³)', fontsize=12)
            ax2.set_ylabel('FTIR EC (μg/m³)', fontsize=12)
            ax2.set_title(f'After Regression Correction\nR² = {r_after**2:.3f}', fontsize=14)
            ax2.grid(True, alpha=0.3)

            # Add 1:1 line to after correction plot
            min_val = min(x_after_clean.min(), y_after_clean.min())
            max_val = max(x_after_clean.max(), y_after_clean.max())
            ax2.plot([min_val, max_val], [min_val, max_val], 'k--', alpha=0.5, label='1:1 line')
            ax2.legend()

            plt.tight_layout()
            plt.show()

            # Graph 5: Time series with regression correction
            fig5, ax5 = plt.subplots(1, 1, figsize=(14, 6))
            ax5.plot(x_before_clean.index, x_before_clean.values, 'b-',
                     label='Ethiopia-Corrected BCc (μg/m³)', alpha=0.7, linewidth=2)
            ax5.plot(x_after_clean.index, x_after_clean.values, 'g-',
                     label='Regression-Corrected BCc (μg/m³)', alpha=0.7, linewidth=2)
            ax5.plot(y_after_clean.index, y_after_clean.values, 'r-',
                     label='FTIR EC (μg/m³)', alpha=0.7, linewidth=2)
            ax5.set_xlabel('Date', fontsize=12)
            ax5.set_ylabel('Concentration (μg/m³)', fontsize=12)
            ax5.set_title('Time Series: Before vs After Regression Correction', fontsize=14)
            ax5.legend()
            ax5.grid(True, alpha=0.3)
            ax5.tick_params(axis='x', rotation=45)

            plt.tight_layout()
            plt.show()

    # List every column produced by a correction step (matched by name).
    print("\n💾 Available corrected columns:")
    correction_cols = [col for col in merged_data.columns if 'corrected' in col or 'regression' in col]
    for col in correction_cols:
        print(f"  📊 {col}")

else:
    print("❌ Merged data not available. Run previous cells first.")

In [None]:
# %%
# Summary and save merged data
print("📋 Complete Pipeline Summary:")
print("=" * 50)

# Fixed checklist of the pipeline stages, printed verbatim.
print("\n🔄 Processing Steps:")
for step_line in (
    "1. ✅ Loaded pre-processed Ethiopia aethalometer data",
    "2. ✅ Created 9am-to-9am daily averages",
    "3. ✅ Loaded FTIR CSV data",
    "4. ✅ Merged with time-aligned FTIR data",
    "5. ✅ Analyzed correlation between corrected BCc and FTIR EC",
):
    print(step_line)

# Shapes of whichever intermediate frames exist in this session.
print("\n📊 Data Summary:")
if 'pkl_data_cleaned' in locals():
    print(f"  Original Ethiopia data: {pkl_data_cleaned.shape}")
if 'pkl_data_cleaned_daily' in locals():
    print(f"  Daily averaged (9am-9am): {pkl_data_cleaned_daily.shape}")
if 'merged_data' in locals():
    print(f"  Final merged data: {merged_data.shape}")

    # Persist the merged frame as a pickle in the working directory.
    output_path = 'aethalometer_ftir_merged_9am_ethiopia.pkl'
    merged_data.to_pickle(output_path)
    print(f"\n💾 Saved merged data to: {output_path}")

# Every dataset currently registered on the setup object, with its shape.
print("\n💾 All available datasets in setup:")
for dataset_name, dataset in setup.datasets.items():
    print(f"  📊 {dataset_name}: {dataset.shape}")

# FTIR merge summary
setup.get_ftir_summary()

# Closing list of pipeline features, printed verbatim.
print("\n🎯 Key Benefits of This Pipeline:")
for benefit_line in (
    "✅ Ethiopia pneumatic pump fix applied",
    "✅ Proper 9am-to-9am time alignment",
    "✅ FTIR data integration from CSV",
    "✅ Automated site configuration",
    "✅ Unit conversion (ng/m³ → µg/m³)",
    "✅ Timezone handling",
    "✅ Validation against independent FTIR measurements",
):
    print(benefit_line)

In [None]:
# %%
# Summary of the complete pipeline
print("📋 Complete Pipeline Summary:")
print("=" * 50)

# Delegate the detailed report to the setup object.
setup.print_enhanced_summary()

# Closing list of pipeline features, printed verbatim.
print("\n🎯 Pipeline Benefits:")
for benefit_line in (
    "✅ Ethiopia pneumatic pump fix applied",
    "✅ FTIR data integration from CSV",
    "✅ Automated site configuration",
    "✅ Unit conversion (ng/m³ → µg/m³)",
    "✅ Timezone handling",
    "✅ Quality control and cleaning",
):
    print(benefit_line)

# Every dataset currently registered on the setup object, with its shape.
print("\n💾 Available datasets:")
for dataset_name, dataset in setup.datasets.items():
    print(f"  📊 {dataset_name}: {dataset.shape}")