In [None]:
# Setup and imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')
from typing import Optional

# Announce start-up before any input location is declared
print("🚀 Setting up SPARTAN Speciation Analysis Environment...")

# Input locations -- edit these three constants for your machine
FTIR_CSV_PATH = "/Users/ahzs645/Github/aethmodular-clean/Four_Sites_FTIR_data.v2.csv"
SPARTAN_SPECIATION_PATH = "/Users/ahzs645/Library/CloudStorage/GoogleDrive-ahzs645@gmail.com/My Drive/University/Research/Grad/UC Davis Ann/NASA MAIA/Data/EC-HIPS-Aeth Comparison/Data/Downloaded Data/SPARTAN/Addis Ababa/FilterBased_ChemSpecPM25_ETAD.csv"
ETHIOPIA_PKL_PATH = "pkl_data_cleaned_ethiopia.pkl"  # processed aethalometer data (pickle)

print("✅ Setup complete!")

# For each input, report whether the file currently exists on disk (True/False)
for _label, _location in (
    ("📁 FTIR CSV", FTIR_CSV_PATH),
    ("🧪 SPARTAN Speciation", SPARTAN_SPECIATION_PATH),
    ("🔧 Ethiopia PKL", ETHIOPIA_PKL_PATH),
):
    print(f"{_label}: {Path(_location).exists()}")

In [None]:
# Data Loading and Processing Modules
class FTIRCSVLoader:
    """Reader for a long-format FTIR CSV (one row per site/date/parameter).

    The file is expected to provide the columns ``Site``, ``SampleDate``,
    ``Parameter`` and ``Concentration_ug_m3``; these are the only columns
    this class touches.
    """

    def __init__(self, csv_path: str):
        """Remember the CSV location.

        Raises:
            FileNotFoundError: if ``csv_path`` does not exist.
        """
        self.csv_path = Path(csv_path)
        if self.csv_path.exists():
            return
        raise FileNotFoundError(f"FTIR CSV file not found: {self.csv_path}")

    def load_site_data(self, site_code: str, parameters=None) -> pd.DataFrame:
        """Return one site's measurements as a date-indexed wide table.

        Args:
            site_code: value matched exactly against the ``Site`` column.
            parameters: optional collection of ``Parameter`` values to keep;
                a falsy value (``None``, empty list) keeps every parameter.

        Returns:
            DataFrame indexed by sample date (index named ``datetime_local``)
            with one column per parameter. Several measurements of the same
            parameter on the same date are averaged.

        Raises:
            ValueError: if the file has no rows for ``site_code``.
        """
        print(f"📊 Loading FTIR data for site {site_code}...")

        # The whole file is read on every call; site filtering happens in memory
        full_table = pd.read_csv(self.csv_path)
        site_rows = full_table.loc[full_table['Site'] == site_code].copy()

        if site_rows.empty:
            known_sites = list(full_table['Site'].unique())
            raise ValueError(f"No data found for site '{site_code}'. Available sites: {known_sites}")

        site_rows['SampleDate'] = pd.to_datetime(site_rows['SampleDate'])

        if parameters:
            wanted = site_rows['Parameter'].isin(parameters)
            site_rows = site_rows[wanted]

        # Long -> wide: dates down the rows, parameters across the columns
        wide = site_rows.pivot_table(
            values='Concentration_ug_m3',
            index='SampleDate',
            columns='Parameter',
            aggfunc='mean',
        )
        wide = wide.reset_index().set_index('SampleDate')
        wide.index.name = 'datetime_local'

        first_day, last_day = wide.index.min(), wide.index.max()
        print(f"✅ Loaded {len(wide)} FTIR measurements")
        print(f"📅 Date range: {first_day} to {last_day}")
        print(f"🧪 Parameters: {list(wide.columns)}")

        return wide


def load_spartan_speciation_data(csv_path: str) -> Optional[pd.DataFrame]:
    """Load SPARTAN filter speciation data for site ETAD in wide format.

    The CSV is read with its first three lines skipped, restricted to rows
    whose ``Site_Code`` is ``'ETAD'``, given a ``Start_Date`` timestamp built
    from the ``Start_*_local`` columns, and pivoted so that each
    (``Filter_ID``, ``Start_Date``) pair becomes one row with one column per
    recognised species.

    Species listed in ``ng_to_ug_species`` are divided by 1000 on the
    assumption (taken from the inline unit notes below, not read from the
    file) that they are reported in ng/m³.

    Args:
        csv_path: location of the SPARTAN ``FilterBased_ChemSpecPM25`` CSV.

    Returns:
        The wide DataFrame (columns ``Filter_ID``, ``Start_Date`` and one per
        species), or ``None`` if anything goes wrong while loading; the
        error is printed rather than raised.
    """

    print("🧪 Loading SPARTAN speciation data...")

    # Parameter code -> clean species name (trailing comment: unit noted by the author)
    parameter_mapping = {
        28101: 'PM25_mass',          # μg/m³
        28202: 'BC_PM25',            # μg/m³
        28401: 'Sulfate_Ion',        # μg/m³
        28402: 'Nitrate_Ion',        # μg/m³
        28403: 'Phosphate_Ion',      # μg/m³
        28404: 'Nitrite_Ion',        # μg/m³
        28801: 'Sodium_Ion',         # μg/m³
        28802: 'Ammonium_Ion',       # μg/m³
        28803: 'Potassium_Ion',      # μg/m³
        28804: 'Magnesium_Ion',      # ng/m³ (note different unit!)
        28805: 'Calcium_Ion',        # μg/m³
        28902: 'Aluminum',           # ng/m³
        28904: 'Titanium',           # ng/m³
        28905: 'Vanadium',           # ng/m³
        28906: 'Chromium',           # ng/m³
        28907: 'Manganese',          # ng/m³
        28908: 'Iron',               # ng/m³
        28909: 'Cobalt',             # ng/m³
        28910: 'Nickel',             # ng/m³
        28911: 'Copper',             # ng/m³
        28912: 'Zinc',               # ng/m³
        28913: 'Arsenic',            # ng/m³
        28914: 'Selenium',           # ng/m³
        28916: 'Cadmium',            # ng/m³
        28917: 'Antimony',           # ng/m³
        28919: 'Cerium',             # ng/m³
        28920: 'Lead',               # ng/m³
        28921: 'Rubidium',           # ng/m³
        28922: 'Strontium',          # ng/m³
        28923: 'Silicon',            # ng/m³
        28924: 'Sulfur',             # ng/m³
        28925: 'Chlorine',           # ng/m³
        28926: 'Tin'                 # ng/m³
    }

    # Species converted from ng/m³ to μg/m³ so every column shares one unit
    ng_to_ug_species = [
        'Magnesium_Ion', 'Aluminum', 'Titanium', 'Vanadium', 'Chromium',
        'Manganese', 'Iron', 'Cobalt', 'Nickel', 'Copper', 'Zinc',
        'Arsenic', 'Selenium', 'Cadmium', 'Antimony', 'Cerium', 'Lead',
        'Rubidium', 'Strontium', 'Silicon', 'Sulfur', 'Chlorine', 'Tin'
    ]

    date_part_cols = [
        'Start_Year_local', 'Start_Month_local',
        'Start_Day_local', 'Start_hour_local',
    ]
    required_cols = ['Site_Code', 'Filter_ID', 'Parameter_Code', 'Value'] + date_part_cols

    def create_timestamp(year, month, day, hour):
        """Build one Timestamp; unparseable or impossible dates become NaT."""
        try:
            return pd.Timestamp(
                year=int(year), month=int(month), day=int(day), hour=int(hour)
            )
        except (ValueError, TypeError, OverflowError):
            # Only data problems are tolerated here (NaN, text, 31 Feb, ...);
            # anything else, e.g. KeyboardInterrupt, must propagate.
            return pd.NaT

    try:
        # Load CSV (skip first 3 lines to get to header)
        df = pd.read_csv(csv_path, skiprows=3)
        print(f"   Raw data: {len(df)} rows, {len(df.columns)} columns")

        # Fail loudly on a wrong layout instead of quietly dropping every row
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"missing required columns: {missing_cols}")

        # Filter for ETAD
        df = df[df['Site_Code'] == 'ETAD'].copy()
        print(f"   ETAD data: {len(df)} rows")

        # Build timestamps column-wise; the explicit dtype keeps Start_Date
        # datetime-typed even when there are no rows at all
        stamps = [
            create_timestamp(year, month, day, hour)
            for year, month, day, hour in zip(*(df[col] for col in date_part_cols))
        ]
        df['Start_Date'] = pd.Series(stamps, index=df.index, dtype='datetime64[ns]')
        df = df[df['Start_Date'].notna()].copy()

        print(f"   ✅ Created datetime for {len(df)} rows")

        # Map parameter codes
        df['Parameter_Name_Clean'] = df['Parameter_Code'].map(parameter_mapping)
        mapped_count = df['Parameter_Name_Clean'].notna().sum()
        print(f"   📊 Mapped {mapped_count} parameter measurements")

        # Keep only mapped parameters
        df_mapped = df[df['Parameter_Name_Clean'].notna()].copy()

        # Pivot to wide format; 'first' keeps the first value when a filter
        # reports the same species more than once
        speciation_wide = df_mapped.pivot_table(
            index=['Filter_ID', 'Start_Date'],
            columns='Parameter_Name_Clean',
            values='Value',
            aggfunc='first'
        ).reset_index()

        speciation_wide.columns.name = None

        # Convert ng/m³ to μg/m³ for consistency
        for species in ng_to_ug_species:
            if species in speciation_wide.columns:
                speciation_wide[species] = speciation_wide[species] / 1000  # ng/m³ → μg/m³

        species_cols = [
            col for col in speciation_wide.columns
            if col not in ('Filter_ID', 'Start_Date')
        ]
        print(f"   ✅ Created wide format: {len(speciation_wide)} unique samples")
        print("   📏 All species now in μg/m³ units")
        print(f"   🧪 Available species: {species_cols}")

        return speciation_wide

    except Exception as e:
        # Deliberate best-effort boundary: callers test the result for None
        print(f"❌ Error loading speciation data: {e}")
        return None


def resample_9am_to_9am(df: pd.DataFrame, datetime_col: str = 'datetime_local', 
                       timezone: str = 'Africa/Addis_Ababa', min_hours: int = 4) -> pd.DataFrame:
    """Average numeric columns over local 09:00-to-09:00 windows.

    Args:
        df: input data. Timestamps are taken from the ``datetime_col`` column
            or, if no such column exists, from the index when it is a
            ``DatetimeIndex`` or is named ``datetime_col``.
        datetime_col: name of the timestamp column (or index).
        timezone: zone in which "9am" is defined. Naive timestamps are
            localized to it; tz-aware timestamps are converted to it.
        min_hours: minimum number of non-null samples a column needs within
            a window to keep its mean. This is a sample count, so it equals
            hours only when the input is hourly.

    Returns:
        DataFrame of window means for the numeric columns, indexed by the
        tz-aware window start (09:00 local); the index is always named
        ``'datetime_local'``. Cells with too few samples are NaN.

    Raises:
        KeyError: if no timestamps can be found under ``datetime_col``.
    """
    df_work = df.copy()

    # Locate the timestamps: a column first, then fall back to the index
    if datetime_col in df_work.columns:
        df_work[datetime_col] = pd.to_datetime(df_work[datetime_col])
        df_work = df_work.set_index(datetime_col)
    elif isinstance(df_work.index, pd.DatetimeIndex) or df_work.index.name == datetime_col:
        df_work.index = pd.to_datetime(df_work.index)
    else:
        raise KeyError(datetime_col)

    # The window boundary must be 9am *local* time, whatever zone the input used
    if df_work.index.tz is None:
        df_work.index = df_work.index.tz_localize(timezone)
        print(f"🌍 Localized to {timezone}")
    else:
        df_work.index = df_work.index.tz_convert(timezone)

    # Shift back 9 hours so each 09:00-09:00 window lines up with a calendar day
    window_offset = pd.Timedelta(hours=9)
    df_work.index = df_work.index - window_offset

    # Only numeric columns can be averaged
    daily = df_work.select_dtypes(include=[np.number]).resample('D')
    daily_means = daily.mean()
    daily_counts = daily.count()

    # Blank out means that rest on too few samples
    daily_means = daily_means.where(daily_counts >= min_hours)

    # Shift forward again so each row is labelled with its 09:00 start
    daily_means.index = daily_means.index + window_offset
    daily_means.index.name = 'datetime_local'

    return daily_means


# Cell-completion marker: printed once the loader class and helper functions of this cell are defined
print("✅ Data loading modules ready!")

In [None]:
# Load all datasets with enhanced debugging
#
# Produces three notebook-level variables used by the later cells:
#   ftir_data          - wide FTIR table for site ETAD
#   speciation_data    - wide SPARTAN speciation table, or None on failure
#   daily_aethalometer - 9am-to-9am daily aethalometer means, or None
print("📊 Loading all datasets...")

# 1. Load FTIR data
ftir_loader = FTIRCSVLoader(FTIR_CSV_PATH)
ftir_data = ftir_loader.load_site_data('ETAD')

# 2. Load SPARTAN speciation data with detailed debugging
print(f"\n🔍 Debugging SPARTAN data loading...")
print(f"📁 File path: {SPARTAN_SPECIATION_PATH}")
print(f"📋 File exists: {Path(SPARTAN_SPECIATION_PATH).exists()}")

if Path(SPARTAN_SPECIATION_PATH).exists():
    # Read first few lines to understand file structure
    print(f"\n📖 First 10 lines of the file:")
    # errors='replace' keeps this preview from failing on an odd byte
    with open(SPARTAN_SPECIATION_PATH, 'r', encoding='utf-8', errors='replace') as f:
        for i, line in enumerate(f):
            if i >= 10:
                break
            print(f"   Line {i}: {line.strip()[:100]}...")

    # Try to load with different skip settings
    try:
        # Try skiprows=3 first
        df_test = pd.read_csv(SPARTAN_SPECIATION_PATH, skiprows=3, nrows=5)
        print(f"\n✅ Successfully read with skiprows=3")
        print(f"📊 Columns: {list(df_test.columns)[:10]}...")
        print(f"📋 Sample data shape: {df_test.shape}")

        # Check for Site_Code column
        if 'Site_Code' in df_test.columns:
            print(f"✅ Found Site_Code column")
            # Only the one column is needed to list the sites
            site_codes = pd.read_csv(SPARTAN_SPECIATION_PATH, skiprows=3, usecols=['Site_Code'])
            unique_sites = site_codes['Site_Code'].unique()[:10]
            print(f"🏢 Available sites: {unique_sites}")
        else:
            print(f"⚠️ Site_Code column not found. Available columns: {list(df_test.columns)}")

    except Exception as e:
        print(f"❌ Error with skiprows=3: {e}")

        # Try without skipping rows
        try:
            df_test = pd.read_csv(SPARTAN_SPECIATION_PATH, nrows=5)
            print(f"\n✅ Successfully read without skipping rows")
            print(f"📊 Columns: {list(df_test.columns)[:10]}...")
        except Exception as e2:
            print(f"❌ Error reading file: {e2}")

speciation_data = load_spartan_speciation_data(SPARTAN_SPECIATION_PATH)

# 3. Load processed Ethiopia aethalometer data
print(f"\n📁 Loading processed Ethiopia aethalometer data...")
# Defined up front so later cells see None (not a NameError) if this step fails
daily_aethalometer = None
try:
    # NOTE(review): unpickling executes arbitrary code - only load pickles you created yourself
    aethalometer_data = pd.read_pickle(ETHIOPIA_PKL_PATH)
    print(f"✅ Loaded Ethiopia data: {aethalometer_data.shape}")

    # Apply 9am-to-9am resampling
    print("🕐 Applying 9am-to-9am resampling...")
    daily_aethalometer = resample_9am_to_9am(aethalometer_data, timezone='Africa/Addis_Ababa')
    print(f"✅ Daily resampling complete: {daily_aethalometer.shape}")

except FileNotFoundError:
    print(f"❌ Processed file not found: {ETHIOPIA_PKL_PATH}")
    print("Please ensure you have the processed Ethiopia aethalometer data")

print(f"\n📋 Data Loading Summary:")
print(f"  🧪 FTIR data: {len(ftir_data) if ftir_data is not None else 0} samples")
print(f"  🔬 Speciation data: {len(speciation_data) if speciation_data is not None else 0} samples")
print(f"  🔧 Daily aethalometer: {len(daily_aethalometer) if daily_aethalometer is not None else 0} days")

In [None]:
# Merge datasets
#
# Aligns FTIR, speciation and daily aethalometer data on a shared index of
# 09:00 Africa/Addis_Ababa timestamps and inner-joins them. The result is
# stored in `final_merged` (None when FTIR or aethalometer data is missing).
print("🔗 Merging datasets...")

if ftir_data is not None and daily_aethalometer is not None:
    # Prepare FTIR data for merging
    ftir_data_for_merge = ftir_data.copy()
    
    # Set timestamps to 9am on sample dates with timezone
    ftir_timestamps = (pd.to_datetime(ftir_data_for_merge.index.date).normalize() + 
                      pd.Timedelta(hours=9))
    ftir_timestamps = ftir_timestamps.tz_localize('Africa/Addis_Ababa')
    
    ftir_data_for_merge.index = ftir_timestamps
    ftir_data_for_merge.index.name = 'datetime_local'
    
    # Merge aethalometer with FTIR
    merged_aeth_ftir = pd.merge(
        daily_aethalometer,
        ftir_data_for_merge,
        left_index=True,
        right_index=True,
        how='inner'
    )
    
    print(f"🎉 Aethalometer + FTIR merge: {len(merged_aeth_ftir)} samples")
    
    # If we have speciation data, try to merge it too
    if speciation_data is not None:
        print("🧪 Attempting to merge speciation data...")
        
        # Prepare speciation data
        spec_for_merge = speciation_data.copy()
        
        # Convert Start_Date to 9am timestamps with timezone.
        # Work on a DatetimeIndex: a Series has no .normalize(), and
        # Series.tz_localize would act on the row labels, not the values.
        spec_timestamps = (pd.DatetimeIndex(spec_for_merge['Start_Date']).normalize() + 
                          pd.Timedelta(hours=9))
        spec_timestamps = spec_timestamps.tz_localize('Africa/Addis_Ababa')
        
        spec_for_merge.index = spec_timestamps
        spec_for_merge.index.name = 'datetime_local'
        
        # Remove non-numeric columns for merging
        spec_numeric = spec_for_merge.select_dtypes(include=[np.number])
        
        # Merge all three datasets
        merged_all = pd.merge(
            merged_aeth_ftir,
            spec_numeric,
            left_index=True,
            right_index=True,
            how='inner'
        )
        
        print(f"🎉 Triple merge (Aeth + FTIR + Speciation): {len(merged_all)} samples")
        
        # Add Ethiopian seasons
        merged_all['month'] = merged_all.index.month
        def get_ethiopian_season(month):
            """Map a calendar month number to the season label used in this notebook."""
            if month in [10, 11, 12, 1, 2]:
                return 'Dry Season (Bega)'
            elif month in [3, 4, 5]:
                return 'Belg Rainy Season'
            else:
                return 'Kiremt Rainy Season'
        
        merged_all['season'] = merged_all['month'].apply(get_ethiopian_season)
        
        final_merged = merged_all
        
    else:
        print("⚠️ No speciation data to merge")
        final_merged = merged_aeth_ftir
        
    print(f"\n📊 Final merged dataset: {final_merged.shape}")
    
    # Show available columns by category
    all_cols = list(final_merged.columns)
    
    # FTIR columns are exactly those contributed by the FTIR table; a substring
    # test such as 'ec' would also match every '...corrected' BC column
    ftir_cols = [col for col in all_cols if col in ftir_data_for_merge.columns]
    bc_corrected_cols = [col for col in all_cols if 'BCc' in col and 'corrected' in col]
    spec_cols = [col for col in all_cols if any(x in col for x in ['Iron', 'Aluminum', 'Potassium', 'Sulfate'])]
    
    print(f"\n📊 Available data types:")
    print(f"  🧪 FTIR columns: {ftir_cols}")
    print(f"  🔧 BC corrected columns: {len(bc_corrected_cols)} (IR, Red, Blue, etc.)")
    print(f"  🔬 Speciation columns: {len(spec_cols)} - {spec_cols[:5]}..." if spec_cols else "  🔬 Speciation columns: None")
    
else:
    print("❌ Cannot merge - missing required datasets")
    final_merged = None

In [None]:
# Apply linear regression corrections
#
# For the IR and Red "BCc ... corrected" columns of `final_merged`, adds a
# copy divided by 1000 (<band>_BC_ug) and a linearly corrected version of it,
# value * 0.85 - 0.17 (<band>_BC_regression_corrected).
print("🔧 Applying Linear Regression Corrections...")

if final_merged is None:
    print("❌ No merged data available for regression correction")
else:
    bc_corrected_cols = [col for col in final_merged.columns if 'BCc' in col and 'corrected' in col]

    # When several columns qualify, the last one wins; a name containing both
    # 'IR' and 'Red' counts as IR only
    ir_candidates = [name for name in bc_corrected_cols if 'IR' in name]
    red_candidates = [name for name in bc_corrected_cols if 'IR' not in name and 'Red' in name]
    ir_bc_col = ir_candidates[-1] if ir_candidates else None
    red_bc_col = red_candidates[-1] if red_candidates else None

    print(f"📊 Found BC columns:")
    print(f"  🔴 IR BC: {ir_bc_col}")
    print(f"  🟥 Red BC: {red_bc_col}")

    # Same two derived columns for each wavelength that was found
    for band, source_col, ug_name, fitted_name in (
        ('IR', ir_bc_col, 'IR_BC_ug', 'IR_BC_regression_corrected'),
        ('Red', red_bc_col, 'Red_BC_ug', 'Red_BC_regression_corrected'),
    ):
        if not source_col:
            continue
        final_merged[ug_name] = final_merged[source_col] / 1000  # ng/m³ → μg/m³
        final_merged[fitted_name] = final_merged[ug_name] * 0.85 - 0.17
        print(f"✅ Applied {band} regression correction: BC × 0.85 - 0.17")

    # Report how much Iron data the later colour-coded plots can draw on
    if 'Iron' not in final_merged.columns:
        print(f"⚠️ No Iron data available for color-coding")
    else:
        iron_values = final_merged['Iron']
        iron_available = iron_values.notna().sum()
        print(f"🔬 Iron data available for {iron_available}/{len(final_merged)} samples")
        print(f"   Iron range: {iron_values.min():.3f} - {iron_values.max():.3f} μg/m³")

    print(f"\n💾 Regression correction columns added:")
    regression_cols = [col for col in final_merged.columns if 'regression' in col or col.endswith('_ug')]
    for col in regression_cols:
        print(f"  📊 {col}")

In [None]:
# IR Wavelength Scatter Plots with Fe Color-coding
#
# Draws up to two scatter plots of IR-wavelength aethalometer BC (x) against
# FTIR EC (y), each with a least-squares fit and a 1:1 reference line:
#   1. IR_BC_ug                    (unit-converted BC)
#   2. IR_BC_regression_corrected  (BC × 0.85 - 0.17)
# Points are coloured by Iron concentration when enough Iron data exists.
print("📊 Creating IR Wavelength Scatter Plots with Fe Color-coding")

if final_merged is not None and 'EC_ftir' in final_merged.columns:
    
    # Iron colouring is only used when more than 5 samples carry an Iron value
    use_fe_coloring = 'Iron' in final_merged.columns and final_merged['Iron'].notna().sum() > 5
    
    if use_fe_coloring:
        print("🔬 Using Iron concentration for color-coding")
        colormap = 'viridis'
        color_label = 'Iron (μg/m³)'
    else:
        print("⚠️ No Iron data - using default coloring")
        colormap = None
        color_label = None
    
    # (BC column, x-axis label, title note, marker colour without Iron data)
    ir_plot_specs = [
        ('IR_BC_ug', 'Ethiopia-Corrected IR BC (μg/m³)',
         'Ethiopia-corrected', 'blue'),
        ('IR_BC_regression_corrected', 'Regression-Corrected IR BC (μg/m³)',
         'Regression-corrected: BC × 0.85 - 0.17', 'red'),
    ]
    
    for bc_col, x_label, title_note, fallback_color in ir_plot_specs:
        if bc_col not in final_merged.columns:
            continue
        
        # Keep only rows where every plotted quantity is present. Doing this in
        # one step means the sample-count check below sees exactly the points
        # that reach the regression, and duplicate timestamps stay row-aligned.
        needed_cols = [bc_col, 'EC_ftir']
        if use_fe_coloring:
            needed_cols.append('Iron')
        plot_df = final_merged[needed_cols].dropna()
        
        # linregress needs several points and at least two distinct x values
        if len(plot_df) <= 3 or plot_df[bc_col].nunique() < 2:
            print(f"⚠️ Skipping {bc_col}: not enough complete samples for a regression (n={len(plot_df)})")
            continue
        
        x_data = plot_df[bc_col]
        y_data = plot_df['EC_ftir']
        color_data = plot_df['Iron'] if use_fe_coloring else None
        
        # Calculate regression
        slope, intercept, r_value, p_value, std_err = stats.linregress(x_data, y_data)
        
        # Create plot
        fig, ax = plt.subplots(figsize=(10, 8))
        
        if color_data is not None:
            scatter = ax.scatter(x_data, y_data, c=color_data, cmap=colormap, 
                               alpha=0.7, s=60, edgecolor='black', linewidth=0.5)
            cbar = plt.colorbar(scatter, ax=ax)
            cbar.set_label(color_label, fontsize=12)
        else:
            ax.scatter(x_data, y_data, alpha=0.7, color=fallback_color, s=60, 
                      edgecolor='black', linewidth=0.5)
        
        # Add regression line
        line_x = np.linspace(x_data.min(), x_data.max(), 100)
        line_y = slope * line_x + intercept
        ax.plot(line_x, line_y, 'r--', alpha=0.8, linewidth=2, 
               label=f'y = {slope:.3f}x + {intercept:.3f}')
        
        # Add 1:1 line spanning the combined range of both axes
        max_val = max(x_data.max(), y_data.max())
        min_val = min(x_data.min(), y_data.min())
        ax.plot([min_val, max_val], [min_val, max_val], 'k:', alpha=0.5, label='1:1 line')
        
        ax.set_xlabel(x_label, fontsize=12)
        ax.set_ylabel('FTIR EC (μg/m³)', fontsize=12)
        ax.set_title(f'IR Wavelength: Aethalometer vs FTIR-EC\n({title_note}, n={len(x_data)})', fontsize=14)
        
        # Statistics text
        stats_text = (f'R² = {r_value**2:.3f}\n'
                     f'Slope = {slope:.3f}\n'
                     f'Intercept = {intercept:.3f}\n'
                     f'p-value = {p_value:.2e}')
        ax.text(0.05, 0.95, stats_text, transform=ax.transAxes,
               bbox=dict(boxstyle='round', facecolor='white', alpha=0.8),
               verticalalignment='top', fontsize=10)
        
        ax.legend()
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()
            
else:
    print("❌ Cannot create IR plots - missing required data")

In [None]:
# Red Wavelength Scatter Plots with Fe Color-coding
#
# Draws up to two scatter plots of Red-wavelength aethalometer BC (x) against
# FTIR EC (y), each with a least-squares fit and a 1:1 reference line:
#   1. Red_BC_ug                    (unit-converted BC)
#   2. Red_BC_regression_corrected  (BC × 0.85 - 0.17)
# Points are coloured by Iron concentration when enough Iron data exists.
print("📊 Creating Red Wavelength Scatter Plots with Fe Color-coding")

if final_merged is not None and 'EC_ftir' in final_merged.columns:
    
    # Iron colouring is only used when more than 5 samples carry an Iron value
    use_fe_coloring = 'Iron' in final_merged.columns and final_merged['Iron'].notna().sum() > 5
    
    if use_fe_coloring:
        print("🔬 Using Iron concentration for color-coding")
        colormap = 'plasma'
        color_label = 'Iron (μg/m³)'
    else:
        print("⚠️ No Iron data - using default coloring")
        colormap = None
        color_label = None
    
    # (BC column, x-axis label, title note, marker colour without Iron data)
    red_plot_specs = [
        ('Red_BC_ug', 'Ethiopia-Corrected Red BC (μg/m³)',
         'Ethiopia-corrected', 'red'),
        ('Red_BC_regression_corrected', 'Regression-Corrected Red BC (μg/m³)',
         'Regression-corrected: BC × 0.85 - 0.17', 'darkred'),
    ]
    
    for bc_col, x_label, title_note, fallback_color in red_plot_specs:
        if bc_col not in final_merged.columns:
            continue
        
        # Keep only rows where every plotted quantity is present. Doing this in
        # one step means the sample-count check below sees exactly the points
        # that reach the regression, and duplicate timestamps stay row-aligned.
        needed_cols = [bc_col, 'EC_ftir']
        if use_fe_coloring:
            needed_cols.append('Iron')
        plot_df = final_merged[needed_cols].dropna()
        
        # linregress needs several points and at least two distinct x values
        if len(plot_df) <= 3 or plot_df[bc_col].nunique() < 2:
            print(f"⚠️ Skipping {bc_col}: not enough complete samples for a regression (n={len(plot_df)})")
            continue
        
        x_data = plot_df[bc_col]
        y_data = plot_df['EC_ftir']
        color_data = plot_df['Iron'] if use_fe_coloring else None
        
        # Calculate regression
        slope, intercept, r_value, p_value, std_err = stats.linregress(x_data, y_data)
        
        # Create plot
        fig, ax = plt.subplots(figsize=(10, 8))
        
        if color_data is not None:
            scatter = ax.scatter(x_data, y_data, c=color_data, cmap=colormap, 
                               alpha=0.7, s=60, edgecolor='black', linewidth=0.5)
            cbar = plt.colorbar(scatter, ax=ax)
            cbar.set_label(color_label, fontsize=12)
        else:
            ax.scatter(x_data, y_data, alpha=0.7, color=fallback_color, s=60, 
                      edgecolor='black', linewidth=0.5)
        
        # Add regression line
        line_x = np.linspace(x_data.min(), x_data.max(), 100)
        line_y = slope * line_x + intercept
        ax.plot(line_x, line_y, 'r--', alpha=0.8, linewidth=2, 
               label=f'y = {slope:.3f}x + {intercept:.3f}')
        
        # Add 1:1 line spanning the combined range of both axes
        max_val = max(x_data.max(), y_data.max())
        min_val = min(x_data.min(), y_data.min())
        ax.plot([min_val, max_val], [min_val, max_val], 'k:', alpha=0.5, label='1:1 line')
        
        ax.set_xlabel(x_label, fontsize=12)
        ax.set_ylabel('FTIR EC (μg/m³)', fontsize=12)
        ax.set_title(f'Red Wavelength: Aethalometer vs FTIR-EC\n({title_note}, n={len(x_data)})', fontsize=14)
        
        # Statistics text
        stats_text = (f'R² = {r_value**2:.3f}\n'
                     f'Slope = {slope:.3f}\n'
                     f'Intercept = {intercept:.3f}\n'
                     f'p-value = {p_value:.2e}')
        ax.text(0.05, 0.95, stats_text, transform=ax.transAxes,
               bbox=dict(boxstyle='round', facecolor='white', alpha=0.8),
               verticalalignment='top', fontsize=10)
        
        ax.legend()
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()
            
else:
    print("❌ Cannot create Red plots - missing required data")

In [None]:
# Aethalometer vs HIPS Comparison Plots
#
# Plots two aethalometer IR BC columns of `final_merged` against its BC_PM25
# column, each with a linear fit, a 1:1 reference line and (when enough Iron
# values exist) Iron-based colour-coding of the points.
print("📊 Creating Aethalometer vs HIPS Comparison Plots")

# Fewest paired samples for which a regression is fitted and plotted
# (same threshold as a `len(common_idx) > 3` check).
_MIN_HIPS_PAIRS = 4


def _plot_bc_vs_hips(merged, hips_bc, bc_column, xlabel, title_detail,
                     fallback_color, use_fe_coloring, colormap, color_label):
    """Scatter one aethalometer BC column against HIPS BC with a linear fit.

    Args:
        merged: DataFrame holding ``bc_column`` and, when colouring, 'Iron'.
        hips_bc: NaN-free Series of HIPS BC values taken from ``merged``.
        bc_column: Name of the aethalometer BC column used for the x-axis.
        xlabel: X-axis label.
        title_detail: Text placed in the parenthesised second title line.
        fallback_color: Marker colour used when Iron colouring is not applied.
        use_fe_coloring: Whether to attempt colour-coding by the 'Iron' column.
        colormap: Colormap name handed to ``Axes.scatter`` when colouring.
        color_label: Colourbar label used when colouring.

    Nothing is drawn when ``bc_column`` is missing, when fewer than
    ``_MIN_HIPS_PAIRS`` paired samples exist, or when the x values have no
    variation (a line cannot be fitted in that case).
    """
    if bc_column not in merged.columns:
        return

    aeth_bc = merged[bc_column].dropna()
    common_idx = aeth_bc.index.intersection(hips_bc.index)
    if len(common_idx) < _MIN_HIPS_PAIRS:
        return

    x_data = aeth_bc.loc[common_idx]
    y_data = hips_bc.loc[common_idx]

    # Colour by Iron only if enough of the *paired* samples carry an Iron
    # value. Checking after the pairing (rather than on the whole frame)
    # keeps the regression from being run on a handful of leftover points.
    color_data = None
    if use_fe_coloring:
        iron = merged.loc[common_idx, 'Iron']
        has_iron = iron.notna()
        n_iron = int(has_iron.sum())
        if n_iron >= _MIN_HIPS_PAIRS:
            x_data = x_data[has_iron]
            y_data = y_data[has_iron]
            color_data = iron[has_iron]
        else:
            print(f"⚠️ Only {n_iron} of {len(common_idx)} paired samples have Iron "
                  f"for {bc_column} - using default coloring")

    # linregress cannot fit a line when every x value is the same.
    if x_data.nunique() < 2:
        print(f"⚠️ Skipping {bc_column} vs HIPS plot - x values are all identical")
        return

    # Calculate regression
    slope, intercept, r_value, p_value, std_err = stats.linregress(x_data, y_data)

    # Create plot
    fig, ax = plt.subplots(figsize=(10, 8))

    if color_data is not None:
        scatter = ax.scatter(x_data, y_data, c=color_data, cmap=colormap,
                             alpha=0.7, s=60, edgecolor='black', linewidth=0.5)
        cbar = plt.colorbar(scatter, ax=ax)
        cbar.set_label(color_label, fontsize=12)
    else:
        ax.scatter(x_data, y_data, alpha=0.7, color=fallback_color, s=60,
                   edgecolor='black', linewidth=0.5)

    # Add regression line
    line_x = np.linspace(x_data.min(), x_data.max(), 100)
    line_y = slope * line_x + intercept
    ax.plot(line_x, line_y, 'r--', alpha=0.8, linewidth=2,
            label=f'y = {slope:.3f}x + {intercept:.3f}')

    # Add 1:1 line spanning the combined range of both axes
    max_val = max(x_data.max(), y_data.max())
    min_val = min(x_data.min(), y_data.min())
    ax.plot([min_val, max_val], [min_val, max_val], 'k:', alpha=0.5, label='1:1 line')

    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel('HIPS BC (μg/m³)', fontsize=12)
    ax.set_title(f'Aethalometer vs HIPS BC Comparison\n({title_detail}, n={len(x_data)})',
                 fontsize=14)

    # Statistics text
    stats_text = (f'R² = {r_value**2:.3f}\n'
                  f'Slope = {slope:.3f}\n'
                  f'Intercept = {intercept:.3f}\n'
                  f'p-value = {p_value:.2e}')
    ax.text(0.05, 0.95, stats_text, transform=ax.transAxes,
            bbox=dict(boxstyle='round', facecolor='white', alpha=0.8),
            verticalalignment='top', fontsize=10)

    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()


# Note: HIPS data would typically be BC_PM25 from speciation data
if final_merged is not None and 'BC_PM25' in final_merged.columns:

    hips_bc = final_merged['BC_PM25'].dropna()
    print(f"🔬 HIPS BC data available: {len(hips_bc)} samples")

    # Check if we have Iron data for color-coding
    use_fe_coloring = 'Iron' in final_merged.columns and final_merged['Iron'].notna().sum() > 5

    if use_fe_coloring:
        print("🔬 Using Iron concentration for color-coding")
        colormap = 'coolwarm'
        color_label = 'Iron (μg/m³)'
    else:
        print("⚠️ No Iron data - using default coloring")
        colormap = None
        color_label = None

    # Plot 1: Ethiopia-corrected IR BC vs HIPS BC
    _plot_bc_vs_hips(final_merged, hips_bc, 'IR_BC_ug',
                     xlabel='Ethiopia-Corrected IR BC (μg/m³)',
                     title_detail='Ethiopia-corrected',
                     fallback_color='green',
                     use_fe_coloring=use_fe_coloring,
                     colormap=colormap, color_label=color_label)

    # Plot 2: Regression-corrected IR BC vs HIPS BC
    _plot_bc_vs_hips(final_merged, hips_bc, 'IR_BC_regression_corrected',
                     xlabel='Regression-Corrected IR BC (μg/m³)',
                     title_detail='Regression-corrected: BC × 0.85 - 0.17',
                     fallback_color='darkgreen',
                     use_fe_coloring=use_fe_coloring,
                     colormap=colormap, color_label=color_label)

else:
    print("❌ Cannot create HIPS comparison plots - BC_PM25 data not available")
    if final_merged is not None:
        # Help the user spot a differently named BC column.
        available_bc_cols = [col for col in final_merged.columns if 'BC' in col]
        print(f"Available BC columns: {available_bc_cols}")

In [None]:
# Summary and save results
#
# Prints an overview of `final_merged` (shape, index range, per-column
# non-null counts, optional season breakdown) and writes the frame to a
# pickle and a CSV in the current working directory.
print("📋 SPARTAN SPECIATION ANALYSIS SUMMARY")
print("=" * 50)

if final_merged is None:
    print("❌ Analysis could not be completed - check data loading steps")
else:
    merged_columns = final_merged.columns

    print("🎉 Analysis completed successfully!")
    print()
    print(f"📊 Final dataset: {final_merged.shape}")
    print(f"📅 Date range: {final_merged.index.min()} to {final_merged.index.max()}")

    # Non-null counts for the three key columns; an absent column counts as 0.
    ftir_count, speciation_count, bc_pm25_count = (
        final_merged[name].notna().sum() if name in merged_columns else 0
        for name in ('EC_ftir', 'Iron', 'BC_PM25')
    )

    print()
    print("📈 Data Availability:")
    print(f"  🧪 FTIR EC: {ftir_count} samples")
    print(f"  🔬 Iron (for color-coding): {speciation_count} samples")
    print(f"  🧬 HIPS BC_PM25: {bc_pm25_count} samples")

    # Season breakdown is only reported when a 'season' column exists.
    if 'season' in merged_columns:
        season_counts = final_merged['season'].value_counts()
        print()
        print("🌍 Ethiopian Seasonal Distribution:")
        for season, count in season_counts.items():
            print(f"  {season}: {count} samples")

    # Persist the merged frame: pickle first, then a CSV for easy viewing.
    output_file = "SPARTAN_Speciation_Merged_Analysis.pkl"
    final_merged.to_pickle(output_file)
    print(f"\n💾 Saved final dataset to: {output_file}")

    csv_file = "SPARTAN_Speciation_Merged_Analysis.csv"
    final_merged.to_csv(csv_file)
    print(f"💾 Saved CSV version to: {csv_file}")

    # Fixed checklist; it is printed as-is and not derived from the data.
    print()
    print("🎯 Analysis Features Completed:")
    for feature_line in (
        "  ✅ SPARTAN speciation data integration",
        "  ✅ 9AM-to-9AM daily averaging",
        "  ✅ Ethiopia loading effect corrections",
        "  ✅ Linear regression corrections (BC × 0.85 - 0.17)",
        "  ✅ Iron-concentration color-coding",
        "  ✅ IR wavelength scatter plots",
        "  ✅ Red wavelength scatter plots",
        "  ✅ HIPS comparison plots",
        "  ✅ Ethiopian seasonal classification",
    ):
        print(feature_line)

    print()
    print("📚 Available Chemical Species:")
    # A column is listed when its name contains any of these substrings.
    species_keywords = ('Ion', 'Iron', 'Aluminum', 'Potassium', 'Sulfate', 'Nitrate')
    spec_cols = [
        col for col in merged_columns
        if any(keyword in col for keyword in species_keywords)
    ]
    # Long lists are truncated to the first eight names.
    if len(spec_cols) > 8:
        print(f"  🧪 {len(spec_cols)} species: {spec_cols[:8]}...")
    else:
        print(f"  🧪 {spec_cols}")

print("\n🏁 SPARTAN Speciation Analysis Complete!")