In [2]:
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(os.getcwd()), 'src'))

# Import enhanced setup with PKL cleaning capabilities
from notebook_utils.pkl_cleaning_integration import create_enhanced_setup
from config.notebook_config import NotebookConfig
from data.qc.pkl_cleaning import PKLDataCleaner

# Your existing configuration
config = NotebookConfig(
    site_code='ETAD',
    wavelength='Red',
    quality_threshold=10,
    output_format='jpl',
    min_samples_for_analysis=30,
    confidence_level=0.95,
    outlier_threshold=3.0,
    figure_size=(12, 8),
    font_size=10,
    dpi=300
)

# Set your data paths (same as before)
base_data_path = "/Users/ahzs645/Library/CloudStorage/GoogleDrive-ahzs645@gmail.com/My Drive/University/Research/Grad/UC Davis Ann/NASA MAIA/Data"

config.aethalometer_files = {
    'pkl_data': os.path.join(
        base_data_path,
        "Aethelometry Data/Kyan Data/Mergedcleaned and uncleaned MA350 data20250707030704",
        "df_uncleaned_Jacros_API_and_OG.pkl"
    ),
    'csv_data': os.path.join(
        base_data_path,
        "Aethelometry Data/Raw",
        "Jacros_MA350_1-min_2022-2024_Cleaned.csv"
    )
}

config.ftir_db_path = os.path.join(
    base_data_path,
    "EC-HIPS-Aeth Comparison/Data/Original Data/Combined Database",
    "spartan_ftir_hips.db"
)

# Create enhanced setup with PKL cleaning capabilities
setup = create_enhanced_setup(config)

✅ Advanced plotting style configured
🚀 Aethalometer-FTIR/HIPS Pipeline with Simplified Setup
📊 Configuration Summary:
   Site: ETAD
   Wavelength: Red
   Output format: jpl
   Quality threshold: 10 minutes
   Output directory: outputs

📁 File paths:
   pkl_data: ✅ df_uncleaned_Jacros_API_and_OG.pkl
   csv_data: ✅ Jacros_MA350_1-min_2022-2024_Cleaned.csv
   FTIR DB: ✅ spartan_ftir_hips.db
🧹 Enhanced setup with PKL cleaning capabilities loaded


In [3]:
# Load all datasets
print("📁 Loading datasets...")
datasets = setup.load_all_data()

# Get PKL data and fix datetime_local issue
pkl_data_original = setup.get_dataset('pkl_data')

# Quick fix for datetime_local issue
if 'datetime_local' not in pkl_data_original.columns:
    if pkl_data_original.index.name == 'datetime_local':
        print("✅ Converting datetime_local from index to column...")
        pkl_data_original = pkl_data_original.reset_index()
    elif hasattr(pkl_data_original.index, 'tz'):
        print("✅ Creating datetime_local column from datetime index...")
        pkl_data_original['datetime_local'] = pkl_data_original.index
        pkl_data_original = pkl_data_original.reset_index(drop=True)

print(f"📊 PKL data ready: {pkl_data_original.shape}")
print(f"📅 Date range: {pkl_data_original['datetime_local'].min()} to {pkl_data_original['datetime_local'].max()}")

📁 Loading datasets...
📦 Setting up modular system...
✅ Aethalometer loaders imported
✅ Database loader imported
✅ Plotting utilities imported
✅ Plotting style configured
✅ Successfully imported 5 modular components

📁 LOADING DATASETS
📁 Loading all datasets...

📊 Loading pkl_data
📁 Loading pkl_data: df_uncleaned_Jacros_API_and_OG.pkl
Detected format: standard
Set 'datetime_local' as DatetimeIndex for time series operations
Converted 17 columns to JPL format
✅ Modular load: 1,665,156 rows × 238 columns
📊 Method: modular
📊 Format: jpl
📊 Memory: 7443.05 MB
🧮 BC columns: 30
📈 ATN columns: 25
📅 Time range: 2021-01-09 16:38:00 to 2025-06-26 23:18:00
✅ pkl_data loaded successfully

📊 Loading csv_data
📁 Loading csv_data: Jacros_MA350_1-min_2022-2024_Cleaned.csv
Set 'Time (Local)' as DatetimeIndex for time series operations
Converted 5 columns to JPL format
✅ Modular load: 1,095,086 rows × 77 columns
📊 Method: modular
📊 Format: jpl
📊 Memory: 884.83 MB
🧮 BC columns: 15
📈 ATN columns: 10
📅 Time r

In [4]:
# Simple Troubleshooting Cell - Run this if you get errors

print("🔧 PKL Data Cleaning Troubleshooting")
print("=" * 50)

# Check data structure
if 'pkl_data_original' in locals():
    print(f"✅ Data loaded: {pkl_data_original.shape}")
    
    # Check datetime column
    if 'datetime_local' in pkl_data_original.columns:
        print("✅ datetime_local found in columns")
    elif pkl_data_original.index.name == 'datetime_local':
        print("⚠️ datetime_local is in index, converting...")
        pkl_data_original = pkl_data_original.reset_index()
        print("✅ Converted to column")
    else:
        print("❌ No datetime_local found")
        time_cols = [col for col in pkl_data_original.columns if 'time' in col.lower()]
        print(f"Available time columns: {time_cols}")
    
    # Check for required columns
    print("\n📊 Column availability check:")
    required_patterns = ['BC', 'ATN', 'Flow', 'Serial number', 'Tape position']
    for pattern in required_patterns:
        matching_cols = [col for col in pkl_data_original.columns if pattern in col]
        print(f"  {pattern}: {len(matching_cols)} columns found")
        if matching_cols and len(matching_cols) <= 3:
            print(f"    Examples: {matching_cols}")
    
    # Create cleaner with diagnostics
    print("\n🧹 Creating cleaner with diagnostics...")
    cleaner_test = PKLDataCleaner(wavelengths_to_filter=['IR', 'Blue'], verbose=True)
    
    # Test cleaner initialization
    try:
        cleaner_test.diagnose_data_structure(pkl_data_original)
        print("✅ Cleaner diagnostics completed")
        print("\n💡 You should now be able to run the cleaning pipeline")
    except Exception as e:
        print(f"❌ Diagnostics failed: {e}")
        print("Please check your data structure manually")

else:
    print("❌ pkl_data_original not found")
    print("Please run the data loading cells first")

print("\n" + "=" * 50)

🔧 PKL Data Cleaning Troubleshooting
✅ Data loaded: (1665156, 239)
✅ datetime_local found in columns

📊 Column availability check:
  BC: 15 columns found
  ATN: 10 columns found
  Flow: 4 columns found
  Serial number: 1 columns found
    Examples: ['Serial number']
  Tape position: 1 columns found
    Examples: ['Tape position']

🧹 Creating cleaner with diagnostics...
🔍 Data Structure Diagnosis:
------------------------------
DataFrame shape: (1665156, 239)
Date range: 2021-01-09 16:38:00 to 2025-06-26 23:18:00
BC columns: 10 (e.g., ['Blue BC1', 'Blue BC2', 'Green BC1'])
BC smoothed columns: 0 (e.g., None)
ATN columns: 0 (e.g., None)
Flow columns: 4 (e.g., ['Flow setpoint (mL/min)', 'Flow.total.mL.min', 'Flow1 (mL/min)'])

Targeted wavelengths: ['IR', 'Blue']
  IR: ❌ BC | ❌ BC smoothed | ❌ ATN
  Blue: ❌ BC | ❌ BC smoothed | ❌ ATN
------------------------------
✅ Cleaner diagnostics completed

💡 You should now be able to run the cleaning pipeline



In [5]:
# Initialize cleaner with verbose diagnostics
cleaner = PKLDataCleaner(wavelengths_to_filter=['IR', 'Blue'], verbose=True)

print("\n🧹 Starting PKL data cleaning with full diagnostics...")
print("=" * 60)

try:
    # Apply cleaning pipeline (now includes DEMA smoothing automatically)
    pkl_data_cleaned = cleaner.clean_pipeline(pkl_data_original.copy())
    
    print("\n📊 Cleaning Results Summary:")
    print("=" * 60)
    print(f"Original data points: {len(pkl_data_original):,}")
    print(f"Cleaned data points: {len(pkl_data_cleaned):,}")
    print(f"Total removed: {len(pkl_data_original) - len(pkl_data_cleaned):,}")
    print(f"Removal percentage: {((len(pkl_data_original) - len(pkl_data_cleaned)) / len(pkl_data_original) * 100):.2f}%")
    
    # Store cleaned data in setup
    setup.datasets['pkl_data_cleaned'] = pkl_data_cleaned
    
    print("✅ PKL data cleaning completed successfully!")
    
except Exception as e:
    print(f"❌ Cleaning failed: {e}")
    print("Please run the troubleshooting cell above and check the error details")



🧹 Starting PKL data cleaning with full diagnostics...
Starting PKL data cleaning pipeline...
🔄 Applying DEMA smoothing...
⚠️ Column Blue BCc not found, skipping
⚠️ Error in DEMA smoothing for Blue: index 0 is out of bounds for axis 0 with size 0
✅ DEMA smoothing applied for Blue
⚠️ Column Green BCc not found, skipping
⚠️ Error in DEMA smoothing for Green: index 0 is out of bounds for axis 0 with size 0
⚠️ Column Red BCc not found, skipping
⚠️ Error in DEMA smoothing for Red: index 0 is out of bounds for axis 0 with size 0
⚠️ Column UV BCc not found, skipping
⚠️ Error in DEMA smoothing for UV: index 0 is out of bounds for axis 0 with size 0
⚠️ Column IR BCc not found, skipping
⚠️ Error in DEMA smoothing for IR: index 0 is out of bounds for axis 0 with size 0
✅ DEMA smoothing applied for IR
🔍 Data Structure Diagnosis:
------------------------------
DataFrame shape: (1665156, 239)
Date range: 2021-01-09 16:38:00 to 2025-06-26 23:18:00
BC columns: 10 (e.g., ['Blue BC1', 'Blue BC2', 'Green

In [6]:
# Column Mapping and Debug Tool
# Run this to understand and fix column naming issues

def analyze_column_patterns(df):
    """
    Analyze column patterns in the loaded data to understand the naming convention.
    
    Args:
        df (pd.DataFrame): Loaded DataFrame
    """
    print("🔍 Column Pattern Analysis")
    print("=" * 50)
    print(f"Total columns: {len(df.columns)}")
    
    # Group columns by pattern
    patterns = {
        'BC_related': [],
        'ATN_related': [],
        'Flow_related': [],
        'Temperature_related': [],
        'Time_related': [],
        'Status_related': [],
        'Other': []
    }
    
    for col in df.columns:
        col_lower = col.lower()
        if 'bc' in col_lower:
            patterns['BC_related'].append(col)
        elif 'atn' in col_lower:
            patterns['ATN_related'].append(col)
        elif 'flow' in col_lower:
            patterns['Flow_related'].append(col)
        elif 'temp' in col_lower or 'temperature' in col_lower:
            patterns['Temperature_related'].append(col)
        elif 'time' in col_lower or 'date' in col_lower:
            patterns['Time_related'].append(col)
        elif 'status' in col_lower or 'error' in col_lower:
            patterns['Status_related'].append(col)
        else:
            patterns['Other'].append(col)
    
    for pattern_name, cols in patterns.items():
        if cols:
            print(f"\n{pattern_name}: {len(cols)} columns")
            # Show first 5 columns as examples
            for i, col in enumerate(cols[:5]):
                print(f"  {col}")
            if len(cols) > 5:
                print(f"  ... and {len(cols) - 5} more")
    
    return patterns

def create_column_mapping(df):
    """
    Create a mapping from expected column names to actual column names.
    
    Args:
        df (pd.DataFrame): Loaded DataFrame
        
    Returns:
        dict: Mapping of expected -> actual column names
    """
    print("\n🗺️ Creating Column Mapping")
    print("=" * 50)
    
    mapping = {}
    reverse_mapping = {}  # actual -> expected
    
    # Wavelengths we're interested in
    wavelengths = ['IR', 'Blue', 'Green', 'Red', 'UV']
    
    # Look for BC columns
    print("BC Columns:")
    for wl in wavelengths:
        # Look for different possible patterns
        possible_patterns = [
            f'{wl} BCc',      # Expected format
            f'{wl} BC1',      # Alternative 1
            f'{wl} BC2',      # Alternative 2
            f'{wl}.BCc',      # With dot separator
            f'{wl}.BC1',      # With dot separator
            f'{wl}.BC2',      # With dot separator
        ]
        
        for pattern in possible_patterns:
            if pattern in df.columns:
                if f'{wl} BCc' not in mapping:  # Prefer BCc if available
                    mapping[f'{wl} BCc'] = pattern
                    reverse_mapping[pattern] = f'{wl} BCc'
                    print(f"  {wl} BCc -> {pattern}")
                    break
    
    # Look for ATN columns
    print("\nATN Columns:")
    for wl in wavelengths:
        for spot in [1, 2]:
            expected = f'{wl} ATN{spot}'
            possible_patterns = [
                f'{wl} ATN{spot}',
                f'{wl}.ATN{spot}',
                f'{wl}_ATN{spot}',
            ]
            
            for pattern in possible_patterns:
                if pattern in df.columns:
                    mapping[expected] = pattern
                    reverse_mapping[pattern] = expected
                    print(f"  {expected} -> {pattern}")
                    break
    
    # Look for Flow columns
    print("\nFlow Columns:")
    flow_mappings = {
        'Flow total (mL/min)': ['Flow.total.mL.min', 'Flow total (mL/min)', 'Flow_total_mL_min'],
        'Flow1 (mL/min)': ['Flow1 (mL/min)', 'Flow1.mL.min', 'Flow1_mL_min'],
        'Flow2 (mL/min)': ['Flow2 (mL/min)', 'Flow2.mL.min', 'Flow2_mL_min'],
    }
    
    for expected, possibles in flow_mappings.items():
        for possible in possibles:
            if possible in df.columns:
                mapping[expected] = possible
                reverse_mapping[possible] = expected
                print(f"  {expected} -> {possible}")
                break
    
    # Look for Temperature columns
    print("\nTemperature Columns:")
    temp_mappings = {
        'Sample temp (C)': ['Sample temp (C)', 'Sample.temp.C', 'Sample_temp_C', 'Temperature'],
        'delta Sample temp (C)': ['delta Sample temp (C)', 'delta.Sample.temp.C', 'delta_Sample_temp_C']
    }
    
    for expected, possibles in temp_mappings.items():
        for possible in possibles:
            if possible in df.columns:
                mapping[expected] = possible
                reverse_mapping[possible] = expected
                print(f"  {expected} -> {possible}")
                break
    
    print(f"\nTotal mappings found: {len(mapping)}")
    return mapping, reverse_mapping

def rename_columns_for_cleaning(df, mapping):
    """
    Rename columns in DataFrame to match expected naming convention.
    
    Args:
        df (pd.DataFrame): DataFrame to rename
        mapping (dict): Column mapping (expected -> actual)
        
    Returns:
        pd.DataFrame: DataFrame with renamed columns
    """
    print("\n🔄 Renaming columns for cleaning compatibility...")
    
    # Create reverse mapping (actual -> expected)
    reverse_mapping = {v: k for k, v in mapping.items()}
    
    # Rename columns
    df_renamed = df.rename(columns=reverse_mapping)
    
    # Report what was renamed
    renamed_count = 0
    for actual, expected in reverse_mapping.items():
        if actual in df.columns:
            print(f"  {actual} -> {expected}")
            renamed_count += 1
    
    print(f"Renamed {renamed_count} columns")
    return df_renamed

# Run the analysis
if 'pkl_data_original' in locals():
    print("🔍 Analyzing your PKL data column structure...")
    patterns = analyze_column_patterns(pkl_data_original)
    
    print("\n" + "="*60)
    mapping, reverse_mapping = create_column_mapping(pkl_data_original)
    
    # Create a version with renamed columns
    print("\n" + "="*60)
    pkl_data_renamed = rename_columns_for_cleaning(pkl_data_original, mapping)
    
    print(f"\n✅ Column analysis complete!")
    print(f"Original shape: {pkl_data_original.shape}")
    print(f"Renamed shape: {pkl_data_renamed.shape}")
    print(f"Found mappings for {len(mapping)} expected columns")
    
    # Quick test to see what we have now
    print(f"\n📊 After renaming - checking for key columns:")
    for wl in ['IR', 'Blue']:
        bc_col = f'{wl} BCc'
        atn1_col = f'{wl} ATN1'
        atn2_col = f'{wl} ATN2'
        
        bc_status = "✅" if bc_col in pkl_data_renamed.columns else "❌"
        atn1_status = "✅" if atn1_col in pkl_data_renamed.columns else "❌"
        atn2_status = "✅" if atn2_col in pkl_data_renamed.columns else "❌"
        
        print(f"  {wl}: {bc_status} BCc | {atn1_status} ATN1 | {atn2_status} ATN2")
    
    flow_col = 'Flow total (mL/min)'
    flow_status = "✅" if flow_col in pkl_data_renamed.columns else "❌"
    print(f"  Flow: {flow_status} Flow total (mL/min)")
    
    print("\n💡 Use 'pkl_data_renamed' for cleaning if the mappings look good!")
    
else:
    print("❌ pkl_data_original not found. Please load your data first.")

🔍 Analyzing your PKL data column structure...
🔍 Column Pattern Analysis
Total columns: 239

BC_related: 30 columns
  Blue BC1
  Blue BC2
  Blue.BCc
  Green BC1
  Green BC2
  ... and 25 more

ATN_related: 25 columns
  Blue.ATN1
  Blue.ATN2
  Green.ATN1
  Green.ATN2
  IR.ATN1
  ... and 20 more

Flow_related: 8 columns
  Flow setpoint (mL/min)
  Flow.total.mL.min
  Flow1 (mL/min)
  Flow2 (mL/min)
  ma.flow.if1
  ... and 3 more

Temperature_related: 6 columns
  Internal temp (C)
  Sample temp (C)
  Sample temp (C) - scd
  co2.temperature
  ma.env.iinternal.temp
  ... and 1 more

Time_related: 12 columns
  datetime_local
  Timebase.s
  Timezone offset (mins)
  dateTime
  datetime
  ... and 7 more

Status_related: 5 columns
  Readable status
  Readable status - fw format
  Status
  lastStatus
  statusId

Other: 153 columns
  index
  Unnamed: 0
  Accel X
  Accel Y
  Accel Z
  ... and 148 more


🗺️ Creating Column Mapping
BC Columns:
  IR BCc -> IR BC1
  Blue BCc -> Blue BC1
  Green BCc -> Gre

In [7]:
# Working Solution Using Renamed Columns
# Use this after running the column mapping debug tool

print("🚀 Using renamed columns for PKL data cleaning")
print("=" * 60)

# First, let's verify we have the renamed data
if 'pkl_data_renamed' not in locals():
    print("❌ pkl_data_renamed not found. Please run the column mapping debug tool first.")
else:
    print(f"✅ Using pkl_data_renamed with shape: {pkl_data_renamed.shape}")
    
    # Let's also add any missing processing that might be needed
    df_for_cleaning = pkl_data_renamed.copy()
    
    # Ensure we have proper data types and any missing processing
    print("\n🔧 Applying additional preprocessing...")
    
    # Add Session ID if missing (from original script)
    if 'Session ID' not in df_for_cleaning.columns and 'Tape position' in df_for_cleaning.columns:
        position_change = df_for_cleaning['Tape position'] != df_for_cleaning['Tape position'].shift()
        df_for_cleaning['Session ID'] = position_change.cumsum()
        print("✅ Added Session ID")
    
    # Set serial number (from original script)
    df_for_cleaning['Serial number'] = "MA350-0238"
    
    # Filter to 2022 and later (from original script)
    if 'datetime_local' in df_for_cleaning.columns:
        df_for_cleaning = df_for_cleaning.loc[df_for_cleaning['datetime_local'].dt.year >= 2022]
        print(f"✅ Filtered to 2022+, shape now: {df_for_cleaning.shape}")
    
    # Now try cleaning with the properly named columns
    print("\n🧹 Initializing cleaner with renamed data...")
    
    try:
        # Initialize cleaner
        cleaner = PKLDataCleaner(wavelengths_to_filter=['IR', 'Blue'], verbose=True)
        
        # Apply cleaning pipeline
        print("\n🧹 Starting cleaning pipeline...")
        pkl_data_cleaned = cleaner.clean_pipeline(df_for_cleaning)
        
        print("\n📊 Cleaning Results Summary:")
        print("=" * 60)
        print(f"Original data points: {len(df_for_cleaning):,}")
        print(f"Cleaned data points: {len(pkl_data_cleaned):,}")
        print(f"Total removed: {len(df_for_cleaning) - len(pkl_data_cleaned):,}")
        removal_pct = ((len(df_for_cleaning) - len(pkl_data_cleaned)) / len(df_for_cleaning) * 100)
        print(f"Removal percentage: {removal_pct:.2f}%")
        
        print("\n✅ PKL data cleaning completed successfully!")
        print("🎉 Your cleaned data is now available as 'pkl_data_cleaned'")
        
        # Quick verification of cleaned data
        print(f"\n📊 Cleaned data verification:")
        print(f"Shape: {pkl_data_cleaned.shape}")
        if 'datetime_local' in pkl_data_cleaned.columns:
            print(f"Date range: {pkl_data_cleaned['datetime_local'].min()} to {pkl_data_cleaned['datetime_local'].max()}")
        
        # Check for smoothed columns
        smoothed_cols = [col for col in pkl_data_cleaned.columns if 'smoothed' in col]
        print(f"Smoothed columns created: {len(smoothed_cols)}")
        if smoothed_cols:
            print(f"Examples: {smoothed_cols[:3]}")
        
    except Exception as e:
        print(f"❌ Cleaning failed: {e}")
        print(f"Error type: {type(e).__name__}")
        
        # Let's try a more targeted approach
        print("\n🔧 Trying alternative approach...")
        
        # Check what specific error we're getting
        import traceback
        print("Full error traceback:")
        traceback.print_exc()
        
        # Try just the DEMA smoothing step to see if that works
        print("\n🔧 Testing DEMA smoothing only...")
        try:
            test_df = df_for_cleaning.copy()
            
            # Try DEMA smoothing for just IR first
            cleaner_test = PKLDataCleaner(wavelengths_to_filter=['IR'], verbose=True)
            test_df_smoothed = cleaner_test.dema_bc_and_atn(test_df, DEMA_span_min=10, wl='IR')
            
            print(f"✅ DEMA test successful, shape: {test_df_smoothed.shape}")
            
            # Check if smoothed columns were created
            ir_smoothed_cols = [col for col in test_df_smoothed.columns if 'IR' in col and 'smoothed' in col]
            print(f"IR smoothed columns created: {ir_smoothed_cols}")
            
        except Exception as e2:
            print(f"❌ DEMA test also failed: {e2}")
            
            # If DEMA fails, we might need to use the external calibration module differently
            print("\n💡 The issue might be with the calibration module import.")
            print("Let's check if the calibration module is properly accessible...")

print("\n" + "=" * 60)

🚀 Using renamed columns for PKL data cleaning
✅ Using pkl_data_renamed with shape: (1665156, 239)

🔧 Applying additional preprocessing...
✅ Filtered to 2022+, shape now: (1627058, 239)

🧹 Initializing cleaner with renamed data...

🧹 Starting cleaning pipeline...
Starting PKL data cleaning pipeline...
🔄 Applying DEMA smoothing...
⚠️ Column Blue BC1 not found, skipping
⚠️ Error in DEMA smoothing for Blue: index 0 is out of bounds for axis 0 with size 0
✅ DEMA smoothing applied for Blue
⚠️ Column Green BC1 not found, skipping
⚠️ Error in DEMA smoothing for Green: index 0 is out of bounds for axis 0 with size 0
⚠️ Column Red BC1 not found, skipping
⚠️ Error in DEMA smoothing for Red: index 0 is out of bounds for axis 0 with size 0
⚠️ Column UV BC1 not found, skipping
⚠️ Error in DEMA smoothing for UV: index 0 is out of bounds for axis 0 with size 0
⚠️ Column IR BC1 not found, skipping
⚠️ Error in DEMA smoothing for IR: index 0 is out of bounds for axis 0 with size 0
✅ DEMA smoothing applie

Traceback (most recent call last):
  File "/Users/ahzs645/Github/aethmodular/.venv/lib/python3.13/site-packages/pandas/core/ops/array_ops.py", line 218, in _na_arithmetic_op
    result = func(left, right)
  File "/Users/ahzs645/Github/aethmodular/.venv/lib/python3.13/site-packages/pandas/core/computation/expressions.py", line 242, in evaluate
    return _evaluate(op, op_str, a, b)  # type: ignore[misc]
  File "/Users/ahzs645/Github/aethmodular/.venv/lib/python3.13/site-packages/pandas/core/computation/expressions.py", line 73, in _evaluate_standard
    return op(a, b)
TypeError: can't multiply sequence by non-int of type 'float'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/var/folders/5y/nymdg1zj29dglz6b4sf4v02c0000gn/T/ipykernel_19051/1510792257.py", line 42, in <module>
    pkl_data_cleaned = cleaner.clean_pipeline(df_for_cleaning)
  File "/Users/ahzs645/Github/aethmodular/src/data/qc/pkl_cleaning.py", line 474, in 

⚠️ Column IR BC1 not found, skipping
⚠️ Error in DEMA smoothing for IR: index 0 is out of bounds for axis 0 with size 0
✅ DEMA test successful, shape: (1627058, 239)
IR smoothed columns created: []



In [8]:
if 'pkl_data_cleaned' in locals() and len(pkl_data_cleaned) > 0:
    # Use the enhanced setup's built-in cleaning method for comparison/validation
    print("\n🔄 Using enhanced setup integration...")
    
    try:
        # This method provides additional integration features
        pkl_alternative = setup.clean_pkl_dataset(
            dataset_name='pkl_data',
            wavelengths_to_filter=['IR', 'Blue'],
            store_as='pkl_data_cleaned_setup',
            run_quality_assessment=True
        )
        
        # Print enhanced summary
        setup.print_enhanced_summary()
        
        # Access your cleaned data
        print(f"\n📊 Available datasets: {list(setup.datasets.keys())}")
        
        # Get BC data for specific wavelength
        red_bc_cleaned = setup.get_bc_data_for_wavelength('pkl_data_cleaned', 'Red')
        
        # Get excellent quality periods
        excellent_periods = setup.get_excellent_periods('pkl_data_cleaned')
        
        # Access FTIR data for merging
        ftir_data = setup.get_ftir_data()
        
        print("\n✅ Ready for analysis with cleaned PKL data!")
        print(f"🔬 Red BC data shape: {red_bc_cleaned.shape if red_bc_cleaned is not None else 'Not available'}")
        print(f"⭐ Excellent periods: {len(excellent_periods) if excellent_periods is not None else 'Not available'}")
        print(f"🔬 FTIR data shape: {ftir_data.shape if ftir_data is not None else 'Not available'}")
        
    except Exception as e:
        print(f"⚠️ Enhanced setup integration had issues: {e}")
        print("But your direct cleaning method worked, so you can continue with pkl_data_cleaned")

In [9]:
if 'pkl_data_cleaned' in locals() and len(pkl_data_cleaned) > 0:
    # Export cleaned data
    pkl_data_cleaned.to_csv('pkl_data_cleaned_simple.csv', index=False)
    pkl_data_cleaned.to_pickle('pkl_data_cleaned_simple.pkl')
    
    print("\n💾 Cleaned data exported:")
    print("- pkl_data_cleaned_simple.csv")
    print("- pkl_data_cleaned_simple.pkl")
    
    print(f"\n🎉 PKL Data Cleaning Complete!")
    print(f"📊 Final shape: {pkl_data_cleaned.shape}")
    print(f"📅 Date range: {pkl_data_cleaned['datetime_local'].min()} to {pkl_data_cleaned['datetime_local'].max()}")
    print("🚀 Ready for further analysis!")
    
else:
    print("❌ No cleaned data available to export")
    print("Please check the error messages above and run the troubleshooting cell")

❌ No cleaned data available to export
Please check the error messages above and run the troubleshooting cell


In [10]:
# For detailed analysis, you can run individual cleaning steps
print("\n🔍 Step-by-step cleaning analysis...")
print("=" * 60)

# Start with original data
df_step = pkl_data_original.copy()
print(f"Starting with: {len(df_step):,} data points")

# Step 1: Status cleaning
df_step = cleaner.clean_by_status(df_step)

# Step 2: Extreme BCc cleaning  
df_step = cleaner.clean_extreme_bcc(df_step)

# Step 3: Flow range cleaning
df_step = cleaner.clean_flow_range(df_step)

# Step 4: Flow ratio cleaning
df_step = cleaner.clean_flow_ratio(df_step)

# Step 5: Leak ratio cleaning
df_step = cleaner.clean_leak_ratio(df_step)

# Step 6: BCc denominator cleaning
df_step = cleaner.clean_bcc_denominator(df_step)

# Step 7: Temperature change cleaning
df_step = cleaner.clean_temperature_change(df_step)

# Step 8: Roughness-based cleaning
df_step = cleaner.add_roughness_columns(df_step)
df_step, roughness_stds = cleaner.flag_high_roughness_periods(df_step)

print(f"Final cleaned data: {len(df_step):,} data points")


🔍 Step-by-step cleaning analysis...
Starting with: 1,665,156 data points
1934 datapoints removed due to Start up or Tape advance status
Statuses of concern, count by device and status:

MA350-0238 Flow unstable 1
MA350-0238 Optical saturation 0
MA350-0238 Sample timing error 0
WF0013 Flow unstable 749
WF0013 Optical saturation 0
WF0013 Sample timing error 0
Number of datapoints with invalid optics values
AFTER dropping data with 'Optical saturation' status values: 823
Removed 71954 datapoints for optics
Status cleaning: Removed 74638 rows (4.48%)
⚠️ Missing columns for IR extreme BCc cleaning: IR BCc smoothed or IR ATN1
⚠️ Missing columns for Blue extreme BCc cleaning: Blue BCc smoothed or Blue ATN1
Extreme BCc cleaning: Removed 0 rows (0.00%)


KeyError: 'Flow total (mL/min)'

In [None]:
# Visualize the cleaning results
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Plot 1: Data availability over time
pkl_data_cleaned.set_index('datetime_local').resample('D').size().plot(ax=axes[0,0])
axes[0,0].set_title('Daily Data Availability After Cleaning')
axes[0,0].set_ylabel('Data Points per Day')
axes[0,0].grid(True, alpha=0.3)

# Plot 2: BC values for selected wavelength
wavelength = config.wavelength
if f'{wavelength} BCc smoothed' in pkl_data_cleaned.columns:
    pkl_data_cleaned[f'{wavelength} BCc smoothed'].hist(bins=50, alpha=0.7, ax=axes[0,1])
    axes[0,1].set_title(f'{wavelength} BCc Values (Cleaned)')
    axes[0,1].set_xlabel('BCc (µg/m³)')
    axes[0,1].set_ylabel('Frequency')
    axes[0,1].grid(True, alpha=0.3)

# Plot 3: Temperature data
if 'Sample temp (C)' in pkl_data_cleaned.columns:
    pkl_data_cleaned['Sample temp (C)'].plot(ax=axes[1,0], alpha=0.7)
    axes[1,0].set_title('Sample Temperature Over Time')
    axes[1,0].set_ylabel('Temperature (°C)')
    axes[1,0].grid(True, alpha=0.3)

# Plot 4: Flow data
if 'Flow total (mL/min)' in pkl_data_cleaned.columns:
    pkl_data_cleaned['Flow total (mL/min)'].hist(bins=50, alpha=0.7, ax=axes[1,1])
    axes[1,1].set_title('Flow Rate Distribution')
    axes[1,1].set_xlabel('Flow Rate (mL/min)')
    axes[1,1].set_ylabel('Frequency')
    axes[1,1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Save the cleaned data
output_path = "pkl_data_cleaned.csv"
pkl_data_cleaned.to_csv(output_path, index=False)
print(f"✅ Cleaned data exported to: {output_path}")

# Also save as pickle for faster loading
pickle_output_path = "pkl_data_cleaned.pkl"
pkl_data_cleaned.to_pickle(pickle_output_path)
print(f"✅ Cleaned data exported to: {pickle_output_path}")

In [None]:
# Use the cleaned data with the existing quality assessment system
print("\n🔍 Assessing quality of cleaned data...")
setup.datasets['pkl_data_cleaned'] = pkl_data_cleaned

# Run quality assessment on cleaned data
quality_results = setup.assess_data_quality()

# Get excellent quality periods from cleaned data
excellent_periods = setup.get_excellent_periods('pkl_data_cleaned')
print(f"Excellent quality periods found: {len(excellent_periods)}")

# Get BC data for the configured wavelength from cleaned data
wavelength_bc = setup.get_bc_data_for_wavelength('pkl_data_cleaned', config.wavelength)
if wavelength_bc is not None:
    print(f"{config.wavelength} BC data shape: {wavelength_bc.shape}")
else:
    print(f"No {config.wavelength} BC data available in cleaned dataset")

# Print final summary
print("\n🎉 PKL Data Cleaning Complete!")
print("=" * 60)
print(f"📊 Original data: {len(pkl_data_original):,} points")
print(f"🧹 Cleaned data: {len(pkl_data_cleaned):,} points")
print(f"🗑️ Removed: {len(pkl_data_original) - len(pkl_data_cleaned):,} points ({((len(pkl_data_original) - len(pkl_data_cleaned)) / len(pkl_data_original) * 100):.2f}%)")
print(f"📅 Date range: {pkl_data_cleaned['datetime_local'].min()} to {pkl_data_cleaned['datetime_local'].max()}")
print(f"🌈 Wavelengths processed: {cleaner.wls_to_filter}")
print(f"💾 Cleaned data saved to: {output_path} and {pickle_output_path}")

In [None]:
# Example of using custom cleaning parameters
print("\n🛠️ Advanced Usage: Custom Cleaning Parameters")
print("=" * 60)

# Initialize cleaner with custom parameters
custom_cleaner = PKLDataCleaner(
    wavelengths_to_filter=['IR', 'Blue', 'Red'],  # More wavelengths
    verbose=True
)

# Apply individual cleaning steps with custom parameters
df_custom = pkl_data_original.copy()

# Custom flow range cleaning with tighter tolerance
df_custom = custom_cleaner.clean_flow_range(df_custom, flow_threshold=0.05, setpoint=100)

# Custom temperature change cleaning (this would need the method to accept parameters)
# df_custom = custom_cleaner.clean_temperature_change(df_custom)

# Custom roughness cleaning with different z-threshold
df_custom = custom_cleaner.add_roughness_columns(df_custom)
df_custom, _ = custom_cleaner.flag_high_roughness_periods(
    df_custom, 
    z_threshold=1.5,  # More sensitive
    min_len=5,        # Shorter periods
    min_frac_high=0.5 # Lower fraction threshold
)

print(f"Custom cleaning result: {len(df_custom):,} data points remaining")

In [None]:
# Alternative approach: Load and clean data in one step using the convenience function
print("\n🚀 Alternative: Load and Clean in One Step")
print("=" * 60)

# If you want to load directly from directory and clean
# (Adjust the directory path to your JPL_aeth directory)
try:
    # This function loads from directory and applies all cleaning steps
    df_loaded_and_cleaned = load_and_clean_pkl_data(
        directory_path="../JPL_aeth/",  # Adjust path as needed
        wavelengths_to_filter=['IR', 'Blue'],
        verbose=True,
        summary=True
    )
    print(f"Loaded and cleaned data shape: {df_loaded_and_cleaned.shape}")
    
except FileNotFoundError:
    print("Directory not found. Using already loaded data instead.")
    df_loaded_and_cleaned = pkl_data_cleaned
    
except Exception as e:
    print(f"Error in load_and_clean_pkl_data: {e}")
    print("Using already cleaned data instead.")
    df_loaded_and_cleaned = pkl_data_cleaned

print("\n✅ PKL Data Cleaning Pipeline Complete!")
print("Your cleaned data is now available in the following variables:")
print("- pkl_data_cleaned: Cleaned version of your original pkl data")
print("- df_loaded_and_cleaned: Alternative cleaned dataset")
print("- Use these datasets for further analysis, merging with FTIR data, etc.")