In [2]:
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(os.getcwd()), 'src'))

# Import enhanced setup with PKL cleaning capabilities
from notebook_utils.pkl_cleaning_integration import create_enhanced_setup
from config.notebook_config import NotebookConfig
from data.qc.pkl_cleaning import PKLDataCleaner

# Load the optional calibration module directly from its file path.
# It is looked up in a few candidate locations instead of being imported as an
# installed package.
import importlib.util


def load_calibration_module(candidate_paths, module_name="calibration"):
    """Import the first existing file in ``candidate_paths`` as ``module_name``.

    Parameters
    ----------
    candidate_paths : iterable of str
        File paths to try, in priority order.
    module_name : str, default "calibration"
        Name under which the module is registered in ``sys.modules``.

    Returns
    -------
    module or None
        The loaded module, or ``None`` when none of the candidate files exist.

    Raises
    ------
    ImportError
        If no import spec/loader can be created for the selected file.
    Exception
        Anything raised while the module body executes. ``sys.modules`` is
        restored to its previous state before the exception propagates, so a
        half-initialised module is never left registered.
    """
    module_path = next(
        (path for path in candidate_paths if os.path.exists(path)), None
    )
    if module_path is None:
        return None

    spec = importlib.util.spec_from_file_location(module_name, module_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot create an import spec for {module_path}")

    module = importlib.util.module_from_spec(spec)
    previous = sys.modules.get(module_name)
    # Register before executing (the documented importlib recipe) ...
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # ... but undo the registration if execution fails, otherwise a later
        # `import calibration` would silently pick up the broken module.
        if previous is None:
            sys.modules.pop(module_name, None)
        else:
            sys.modules[module_name] = previous
        raise
    return module


_parent_dir = os.path.dirname(os.getcwd())
# Same search order as before: project src/external first, then fallbacks.
CALIBRATION_CANDIDATE_PATHS = [
    os.path.join(_parent_dir, "src", "external", "calibration.py"),
    os.path.join(os.getcwd(), "calibration.py"),
    os.path.join(_parent_dir, "calibration.py"),
    "calibration.py",
]

# Always bind both names so later cells can test them without a NameError.
calibration = None
HAS_CALIBRATION = False
try:
    calibration = load_calibration_module(CALIBRATION_CANDIDATE_PATHS)
    if calibration is not None:
        print("✅ Calibration module loaded successfully")
        HAS_CALIBRATION = True
    else:
        print("⚠️ Calibration module not found, will use alternative preprocessing")
except Exception as e:
    print(f"⚠️ Could not import calibration module: {e}")

# Pipeline configuration for this run
config = NotebookConfig(
    site_code='ETAD',
    wavelength='Red',
    quality_threshold=10,
    output_format='jpl',
    min_samples_for_analysis=30,
    confidence_level=0.95,
    outlier_threshold=3.0,
    figure_size=(12, 8),
    font_size=10,
    dpi=300
)

# Root of the data tree. The default is the original machine-specific location;
# set the AETHALOMETER_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DEFAULT_BASE_DATA_PATH = "/Users/ahzs645/Library/CloudStorage/GoogleDrive-ahzs645@gmail.com/My Drive/University/Research/Grad/UC Davis Ann/NASA MAIA/Data"
base_data_path = os.environ.get("AETHALOMETER_DATA_DIR", DEFAULT_BASE_DATA_PATH)

# Aethalometer inputs, keyed by the dataset names used when loading
# (e.g. setup.get_dataset('pkl_data')).
config.aethalometer_files = {
    'pkl_data': os.path.join(
        base_data_path,
        "Aethelometry Data/Kyan Data/Mergedcleaned and uncleaned MA350 data20250707030704",
        "df_uncleaned_Jacros_API_and_OG.pkl"
    ),
    'csv_data': os.path.join(
        base_data_path,
        "Aethelometry Data/Raw",
        "Jacros_MA350_1-min_2022-2024_Cleaned.csv"
    )
}

# FTIR/HIPS database file
config.ftir_db_path = os.path.join(
    base_data_path,
    "EC-HIPS-Aeth Comparison/Data/Original Data/Combined Database",
    "spartan_ftir_hips.db"
)

# Create enhanced setup with PKL cleaning capabilities
setup = create_enhanced_setup(config)

✅ Calibration module loaded successfully
✅ Advanced plotting style configured
🚀 Aethalometer-FTIR/HIPS Pipeline with Simplified Setup
📊 Configuration Summary:
   Site: ETAD
   Wavelength: Red
   Output format: jpl
   Quality threshold: 10 minutes
   Output directory: outputs

📁 File paths:
   pkl_data: ✅ df_uncleaned_Jacros_API_and_OG.pkl
   csv_data: ✅ Jacros_MA350_1-min_2022-2024_Cleaned.csv
   FTIR DB: ✅ spartan_ftir_hips.db
🧹 Enhanced setup with PKL cleaning capabilities loaded


In [3]:
print("📁 Loading datasets...")
datasets = setup.load_all_data()

# Get the PKL dataset and make sure its timestamp is available as a regular
# 'datetime_local' column (the loader may have moved it into the index).
pkl_data_original = setup.get_dataset('pkl_data')

if 'datetime_local' not in pkl_data_original.columns:
    if pkl_data_original.index.name == 'datetime_local':
        print("✅ Converting datetime_local from index to column...")
        pkl_data_original = pkl_data_original.reset_index()
    elif hasattr(pkl_data_original.index, 'tz'):
        print("✅ Creating datetime_local column from datetime index...")
        datetime_index = pkl_data_original.index
        # reset_index() returns a new frame; adding the column afterwards avoids
        # modifying the frame returned by setup.get_dataset() in place, in case
        # `setup` still holds a reference to it.
        pkl_data_original = pkl_data_original.reset_index(drop=True)
        pkl_data_original['datetime_local'] = datetime_index
    else:
        # Fail here with an explanation instead of a bare KeyError below.
        raise KeyError(
            "pkl_data has no 'datetime_local' column and its index "
            f"(name={pkl_data_original.index.name!r}, "
            f"type={type(pkl_data_original.index).__name__}) is not a datetime index"
        )

print(f"📊 PKL data ready: {pkl_data_original.shape}")
print(f"📅 Date range: {pkl_data_original['datetime_local'].min()} to {pkl_data_original['datetime_local'].max()}")

📁 Loading datasets...
📦 Setting up modular system...
✅ Aethalometer loaders imported
✅ Database loader imported
✅ Plotting utilities imported
✅ Plotting style configured
✅ Successfully imported 5 modular components

📁 LOADING DATASETS
📁 Loading all datasets...

📊 Loading pkl_data
📁 Loading pkl_data: df_uncleaned_Jacros_API_and_OG.pkl
Detected format: standard
Set 'datetime_local' as DatetimeIndex for time series operations
Converted 17 columns to JPL format
✅ Modular load: 1,665,156 rows × 238 columns
📊 Method: modular
📊 Format: jpl
📊 Memory: 7443.05 MB
🧮 BC columns: 30
📈 ATN columns: 25
📅 Time range: 2021-01-09 16:38:00 to 2025-06-26 23:18:00
✅ pkl_data loaded successfully

📊 Loading csv_data
📁 Loading csv_data: Jacros_MA350_1-min_2022-2024_Cleaned.csv
Set 'Time (Local)' as DatetimeIndex for time series operations
Converted 5 columns to JPL format
✅ Modular load: 1,095,086 rows × 77 columns
📊 Method: modular
📊 Format: jpl
📊 Memory: 884.83 MB
🧮 BC columns: 15
📈 ATN columns: 10
📅 Time r

In [4]:
def comprehensive_preprocessing(df, serial_number="MA350-0238", min_year=2022,
                                timezone='Africa/Addis_Ababa', calibration_module=None):
    """Prepare a raw aethalometer frame for the cleaning pipeline.

    Works on a copy of ``df`` and applies, in order: datetime conversion,
    column renaming (dot notation -> space notation), numeric conversion,
    Session ID creation, delta columns, serial-number assignment and a
    year filter.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    serial_number : str or None, default "MA350-0238"
        Value written to the 'Serial number' column (overwrites any existing
        values). ``None`` leaves the column untouched.
    min_year : int or None, default 2022
        Rows whose 'datetime_local' year is below this are dropped.
        ``None`` disables the filter.
    timezone : str, default 'Africa/Addis_Ababa'
        Timezone applied when 'datetime_local' has to be parsed from a
        non-datetime column.
    calibration_module : module or None
        Object providing ``convert_to_float(df)`` and ``add_deltas(df)``.
        When ``None``, the notebook-level ``calibration`` module is used if
        ``HAS_CALIBRATION`` is truthy; otherwise the built-in fallbacks run.

    Returns
    -------
    pandas.DataFrame
        The preprocessed frame.
    """
    print("🔧 Comprehensive Preprocessing Pipeline")
    print("=" * 60)

    # Resolve the calibration helpers: explicit argument first, then the
    # notebook globals set by the setup cell (missing globals mean "fallback").
    if calibration_module is None and globals().get('HAS_CALIBRATION', False):
        calibration_module = globals().get('calibration')

    df_processed = df.copy()
    original_size = len(df_processed)
    wavelength_names = ['IR', 'Blue', 'Green', 'Red', 'UV']

    # Step 1: Fix datetime column
    print("Step 1: Processing datetime...")
    if 'datetime_local' in df_processed.columns:
        if not pd.api.types.is_datetime64_any_dtype(df_processed['datetime_local']):
            df_processed['datetime_local'] = pd.to_datetime(
                df_processed['datetime_local'], utc=True
            ).dt.tz_convert(timezone)
            print("✅ Converted datetime_local to proper timezone")

    # Step 2: Column renaming
    print("\nStep 2: Fixing column names...")
    column_mapping = {}

    def plan_rename(source, target):
        """Queue source -> target unless that would duplicate an existing column."""
        if source not in df_processed.columns:
            return False
        if target in df_processed.columns:
            print(f"  ⚠️ Skipped {source} -> {target}: target column already exists")
            return False
        column_mapping[source] = target
        return True

    # Map BC columns (handle both BC1->BCc conversion and dot notation)
    for wl in wavelength_names:
        if f'{wl}.BCc' in df_processed.columns:
            # First priority: use .BCc if available
            plan_rename(f'{wl}.BCc', f'{wl} BCc')
        elif f'{wl} BC1' in df_processed.columns and f'{wl} BCc' not in df_processed.columns:
            # Second priority: rename BC1 to BCc, but only when no BCc column
            # exists yet; otherwise the rename would create duplicate
            # '<wl> BCc' columns and lose BC1.
            column_mapping[f'{wl} BC1'] = f'{wl} BCc'
            print(f"  Renamed {wl} BC1 -> {wl} BCc")

    # Map ATN columns (dots to spaces)
    for wl in wavelength_names:
        for spot in [1, 2]:
            plan_rename(f'{wl}.ATN{spot}', f'{wl} ATN{spot}')

    # Map flow columns
    plan_rename('Flow.total.mL.min', 'Flow total (mL/min)')

    # Apply column renaming
    if column_mapping:
        df_processed = df_processed.rename(columns=column_mapping)
        print(f"✅ Renamed {len(column_mapping)} columns")

    # Step 3: Data type conversion
    print("\nStep 3: Converting data types...")
    if calibration_module is not None:
        df_processed = calibration_module.convert_to_float(df_processed)
        print("✅ Applied calibration.convert_to_float()")
    else:
        # Manual conversion of text-typed measurement columns
        numeric_cols = []
        for col in df_processed.columns:
            if not isinstance(col, str):
                continue
            if not any(tag in col for tag in ['ATN', 'BC', 'Flow', 'temp', 'Temp']):
                continue
            col_dtype = df_processed[col].dtype
            if pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
                try:
                    df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')
                    numeric_cols.append(col)
                except Exception as exc:
                    # Best effort: report and keep the column as it was
                    print(f"  ⚠️ Could not convert {col} to numeric: {exc}")
        print(f"✅ Converted {len(numeric_cols)} columns to numeric")

    # Step 4: Add Session ID
    print("\nStep 4: Adding Session ID...")
    if 'Session ID' not in df_processed.columns and 'Tape position' in df_processed.columns:
        # A new session starts whenever the tape position differs from the previous row
        position_change = df_processed['Tape position'] != df_processed['Tape position'].shift()
        df_processed['Session ID'] = position_change.cumsum()
        print("✅ Added Session ID based on tape position changes")

    # Step 5: Add delta calculations
    print("\nStep 5: Adding delta calculations...")
    if calibration_module is not None:
        df_processed = calibration_module.add_deltas(df_processed)
        print("✅ Applied calibration.add_deltas()")
    else:
        print("⚠️ Manual delta calculation (limited functionality)")
        # Only plain '<wl> ATN<n>' columns (exactly one space in the name)
        attn_cols = [
            col for col in df_processed.columns
            if isinstance(col, str) and 'ATN' in col and col.count(' ') == 1
        ]
        group_keys = ['Serial number', 'Session ID']
        use_groups = all(key in df_processed.columns for key in group_keys)
        added = 0
        for col in attn_cols:
            try:
                if use_groups:
                    df_processed[f'delta {col}'] = df_processed.groupby(group_keys)[col].diff()
                else:
                    df_processed[f'delta {col}'] = df_processed[col].diff()
                added += 1
            except Exception as exc:
                # Best effort: report the column that failed and carry on
                print(f"  ⚠️ Could not compute delta for {col}: {exc}")
        print(f"✅ Added basic delta calculations for {added} ATN columns")

    # Step 6: Set serial number and filter by year
    print("\nStep 6: Final adjustments...")
    if serial_number is not None:
        df_processed['Serial number'] = serial_number

    if min_year is not None and 'datetime_local' in df_processed.columns:
        df_processed = df_processed.loc[df_processed['datetime_local'].dt.year >= min_year]
        print(f"✅ Filtered to {min_year}+: {original_size:,} -> {len(df_processed):,} rows")

    return df_processed

# Run the preprocessing pipeline on the loaded PKL frame.
# The function works on a copy, so pkl_data_original is left unchanged.
pkl_data_preprocessed = comprehensive_preprocessing(pkl_data_original)


🔧 Comprehensive Preprocessing Pipeline
Step 1: Processing datetime...

Step 2: Fixing column names...
✅ Renamed 16 columns

Step 3: Converting data types...
Converted IR ATN1 to float.
Converted UV ATN1 to float.
Converted Blue ATN1 to float.
Converted Green ATN1 to float.
Converted Red ATN1 to float.
✅ Applied calibration.convert_to_float()

Step 4: Adding Session ID...

Step 5: Adding delta calculations...
✅ Applied calibration.add_deltas()

Step 6: Final adjustments...
✅ Filtered to 2022+: 1,665,156 -> 1,627,058 rows


In [5]:
def apply_dema_smoothing_working(df, wavelengths=('IR', 'Blue')):
    """Add DEMA-smoothed copies of the BC columns for the given wavelengths.

    For every column whose name contains the wavelength and 'BC' (and not
    'smoothed'), a new column '<name> smoothed' is added. Smoothing is done
    separately per group of 'Serial number' (+ 'Session ID' and
    'Tape position' when present), using DEMA = 2*EMA - EMA(EMA) with
    span = min(10, n_valid // 2), at least 2.

    Groups with fewer than 3 rows, or fewer than 2 non-null values, keep their
    original values. Rows where the source value is null stay NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    wavelengths : iterable of str, default ('IR', 'Blue')
        Wavelength name fragments to look for in the column names. A single
        string is treated as one wavelength.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the smoothed columns added. Columns that fail to
        smooth are reported and skipped.
    """
    print("🔄 Applying DEMA Smoothing...")
    print("=" * 40)

    if isinstance(wavelengths, str):
        wavelengths = [wavelengths]

    df_smoothed = df.copy()

    def dema(values):
        """Double exponential moving average of a Series without nulls."""
        span = max(2, min(10, len(values) // 2))  # Adaptive span
        ema1 = values.ewm(span=span, adjust=False).mean()
        ema2 = ema1.ewm(span=span, adjust=False).mean()
        return 2 * ema1 - ema2

    for wl in wavelengths:
        print(f"\nProcessing {wl} wavelength...")

        # Check what BC columns we have
        bc_cols = [
            col for col in df_smoothed.columns
            if isinstance(col, str) and wl in col and 'BC' in col and 'smoothed' not in col
        ]
        print(f"  Available BC columns: {bc_cols}")

        if not bc_cols:
            print(f"  ⚠️ No BC columns found for {wl}")
            continue

        for bc_col in bc_cols:
            try:
                # Group by measurement sessions for proper smoothing
                groupby_cols = ['Serial number'] + [
                    name for name in ('Session ID', 'Tape position')
                    if name in df_smoothed.columns
                ]

                # Work positionally (0..n-1) so results land on the right rows
                # even when the frame's index labels are not unique, and so each
                # group only carries one column instead of the whole frame.
                keys = [df_smoothed[name].reset_index(drop=True) for name in groupby_cols]
                source = df_smoothed[bc_col].reset_index(drop=True)
                smoothed = np.full(len(source), np.nan)

                for _, group in source.groupby(keys):
                    values = group.dropna()
                    # Need at least 3 rows and 2 valid points to smooth;
                    # otherwise the original values are kept.
                    if len(group) > 2 and len(values) > 1:
                        values = dema(values)
                    smoothed[values.index.to_numpy()] = values.to_numpy()

                # One array assignment instead of one .loc write per row
                smoothed_col = f'{bc_col} smoothed'
                df_smoothed[smoothed_col] = smoothed

                print(f"  ✅ Created {smoothed_col}")

            except Exception as e:
                print(f"  ⚠️ Failed to smooth {bc_col}: {e}")

    return df_smoothed

# Add '<column> smoothed' DEMA columns for the IR and Blue BC columns.
# The function copies its input, so pkl_data_preprocessed is left unchanged.
pkl_data_with_smoothing = apply_dema_smoothing_working(pkl_data_preprocessed, ['IR', 'Blue'])


🔄 Applying DEMA Smoothing...

Processing IR wavelength...
  Available BC columns: ['IR BC1', 'IR BC2', 'IR BCc']
  ✅ Created IR BC1 smoothed
  ✅ Created IR BC2 smoothed
  ✅ Created IR BCc smoothed

Processing Blue wavelength...
  Available BC columns: ['Blue BC1', 'Blue BC2', 'Blue BCc']
  ✅ Created Blue BC1 smoothed
  ✅ Created Blue BC2 smoothed
  ✅ Created Blue BCc smoothed


In [6]:
print("\n🧹 Final Cleaning Pipeline")
print("=" * 60)

# Records whether pkl_data_cleaned really went through the cleaner, so later
# cells can tell cleaned data from the fallback.
cleaning_succeeded = False

# Only the cleaning itself is guarded: an error while printing the summary
# must not discard a successful cleaning result.
try:
    # Initialize cleaner
    cleaner = PKLDataCleaner(wavelengths_to_filter=['IR', 'Blue'], verbose=True)

    # Apply cleaning pipeline, skipping preprocessing since we did it comprehensively
    pkl_data_cleaned = cleaner.clean_pipeline(pkl_data_with_smoothing, skip_preprocessing=True)
    cleaning_succeeded = True
except Exception as e:
    print(f"❌ Final cleaning failed: {e}")
    print("Using preprocessed and smoothed data as fallback")
    pkl_data_cleaned = pkl_data_with_smoothing

if cleaning_succeeded:
    print("\n📊 Cleaning Results Summary:")
    print("=" * 60)
    print(f"Original data points: {len(pkl_data_original):,}")
    print(f"After preprocessing: {len(pkl_data_preprocessed):,}")
    print(f"After smoothing: {len(pkl_data_with_smoothing):,}")
    print(f"Final cleaned: {len(pkl_data_cleaned):,}")

    original_count = len(pkl_data_original)
    total_removed = original_count - len(pkl_data_cleaned)
    # Guard against an empty input frame
    removal_pct = (total_removed / original_count * 100) if original_count else 0.0
    print(f"Total removed: {total_removed:,} ({removal_pct:.2f}%)")

    print("\n✅ PKL data cleaning completed successfully!")

    # Quick verification
    print(f"\n📊 Final data verification:")
    print(f"Shape: {pkl_data_cleaned.shape}")
    if 'datetime_local' in pkl_data_cleaned.columns:
        print(f"Date range: {pkl_data_cleaned['datetime_local'].min()} to {pkl_data_cleaned['datetime_local'].max()}")

    # Check for key columns
    key_cols = ['IR ATN1', 'IR BCc', 'Blue ATN1', 'Blue BCc', 'Flow total (mL/min)']
    for col in key_cols:
        status = "✅" if col in pkl_data_cleaned.columns else "❌"
        print(f"  {status} {col}")

    # Check smoothed columns
    smoothed_cols = [col for col in pkl_data_cleaned.columns if 'smoothed' in col]
    print(f"  ✅ Smoothed columns: {len(smoothed_cols)}")


🧹 Final Cleaning Pipeline
Starting PKL data cleaning pipeline...
🔍 Data Structure Diagnosis:
------------------------------
DataFrame shape: (1627058, 284)
Date range: 2022-04-12 09:12:00 to 2025-06-26 23:18:00
BC columns: 15 (e.g., ['Blue BC1', 'Blue BC2', 'Blue BCc'])
BC smoothed columns: 6 (e.g., ['IR BC1 smoothed', 'IR BC2 smoothed', 'IR BCc smoothed'])
ATN columns: 40 (e.g., ['Blue ATN1', 'Blue ATN2', 'Green ATN1'])
Flow columns: 4 (e.g., ['Flow setpoint (mL/min)', 'Flow total (mL/min)', 'Flow1 (mL/min)'])

Targeted wavelengths: ['IR', 'Blue']
  IR: ✅ BC | ✅ BC smoothed | ✅ ATN
  Blue: ✅ BC | ✅ BC smoothed | ✅ ATN
------------------------------

🧹 Starting cleaning steps...
1919 datapoints removed due to Start up or Tape advance status
Statuses of concern, count by device and status:

MA350-0238 Flow unstable 750
MA350-0238 Optical saturation 0
MA350-0238 Sample timing error 0
Number of datapoints with invalid optics values
AFTER dropping data with 'Optical saturation' status val

In [7]:
# Export the cleaned frame as CSV and pickle into the current working directory.
if 'pkl_data_cleaned' in locals() and len(pkl_data_cleaned) > 0:
    # Export cleaned data
    output_csv = 'pkl_data_cleaned_working.csv'
    output_pkl = 'pkl_data_cleaned_working.pkl'

    pkl_data_cleaned.to_csv(output_csv, index=False)
    pkl_data_cleaned.to_pickle(output_pkl)

    print(f"\n💾 Cleaned data exported:")
    print(f"  📄 CSV: {output_csv}")
    print(f"  📦 Pickle: {output_pkl}")

    print(f"\n🎉 PKL Data Cleaning Complete!")
    print(f"📊 Final shape: {pkl_data_cleaned.shape}")
    # Same guard as the verification in the cleaning cell: a missing column
    # must not raise after the files have already been written.
    if 'datetime_local' in pkl_data_cleaned.columns:
        print(f"📅 Date range: {pkl_data_cleaned['datetime_local'].min()} to {pkl_data_cleaned['datetime_local'].max()}")
    print("🚀 Ready for further analysis!")

else:
    print("❌ No cleaned data available to export")
    print("Please check the error messages above")


💾 Cleaned data exported:
  📄 CSV: pkl_data_cleaned_working.csv
  📦 Pickle: pkl_data_cleaned_working.pkl

🎉 PKL Data Cleaning Complete!
📊 Final shape: (1477783, 293)
📅 Date range: 2022-04-12 09:54:00 to 2025-06-26 23:18:00
🚀 Ready for further analysis!
