# Flow Fix Period Explorer

This notebook helps identify before/after flow fix time periods for each site by visualizing:
- HIPS measurements over time
- FTIR EC measurements over time  
- Aethalometer BC measurements over time

Use these plots to identify dates where data quality or instrument behavior changed.

---

## Setup and Imports

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from scipy import stats

# Add the scripts folder to the import path.
# Notebooks do not define __file__, so the folder is resolved relative to the
# kernel's current working directory (normally the notebook's own folder).
notebook_dir = os.getcwd()
scripts_path = os.path.join(notebook_dir, 'scripts')
if scripts_path not in sys.path:
    # Prepend so the project modules win over same-named installed packages.
    sys.path.insert(0, scripts_path)

# Core imports from modular scripts
from config import SITES, MAC_VALUE, FLOW_FIX_PERIODS
from data_matching import (
    load_aethalometer_data, 
    load_filter_data,
    add_base_filter_id,
    match_all_parameters
)

# Configure matplotlib
plt.style.use('seaborn-v0_8-whitegrid')
%matplotlib inline
plt.rcParams['figure.figsize'] = (14, 10)
plt.rcParams['font.size'] = 10

# Site colors
SITE_COLORS = {
    'Beijing': '#1f77b4', 
    'Delhi': '#ff7f0e', 
    'JPL': '#2ca02c', 
    'Addis_Ababa': '#d62728'
}

print("Imports successful!")
print(f"Sites: {list(SITES.keys())}")

## Load Data

In [None]:
# Aethalometer data: a mapping whose keys are printed below
aethalometer_data = load_aethalometer_data()
print(f"Loaded aethalometer data for: {list(aethalometer_data.keys())}")

# Filter data: load and attach the base filter ID in a single step
filter_data = add_base_filter_id(load_filter_data())
print(f"Filter data: {len(filter_data)} records")

In [None]:
# Build one matched, date-sorted table per site (sites without aethalometer data are skipped)
all_params_data = {}

for site_name in SITES:
    if site_name in aethalometer_data:
        config = SITES[site_name]
        df_aeth = aethalometer_data[site_name]

        matched = match_all_parameters(
            site_name, config['code'], df_aeth, filter_data
        )

        # Fewer than 3 matched samples (or no result at all) is not kept
        if matched is None or len(matched) < 3:
            print(f"{site_name}: Insufficient data")
            continue

        matched['date'] = pd.to_datetime(matched['date'])
        matched = matched.sort_values('date')
        all_params_data[site_name] = matched

        print(f"{site_name}: {len(matched)} samples")
        print(f"  Date range: {matched['date'].min().strftime('%Y-%m-%d')} to {matched['date'].max().strftime('%Y-%m-%d')}")

## Current Flow Fix Period Configuration

Show what is currently defined in `scripts/config.py`:

In [None]:
print("\n" + "="*70)
print("CURRENT FLOW_FIX_PERIODS CONFIGURATION")
print("="*70)

# One report per site; a missing or empty entry is reported as undefined
for site_name in SITES:
    config = FLOW_FIX_PERIODS.get(site_name, {})
    if not config:
        print(f"\n{site_name}: No flow fix periods defined")
        continue

    print(f"\n{site_name}:")
    print(f"  Description: {config.get('description', 'N/A')}")
    print(f"  Before end: {config.get('before_end', 'NOT SET')}")
    print(f"  After start: {config.get('after_start', 'NOT SET')}")
    print(f"  Notes: {config.get('notes', 'N/A')}")

---

## Stacked Time Series Plots - All Sites

Each site gets a figure with 3 stacked subplots:
1. **HIPS Fabs** (converted to BC equivalent)
2. **FTIR EC**
3. **Aethalometer IR BC**

Look for:
- Sudden changes in data patterns
- Gaps in data collection
- Changes in variability or baseline
- Dates where correlations might have changed

In [None]:
def _draw_flow_fix_markers(ax, before_end, after_start):
    """Mark the flow-fix boundaries on one axis; missing (None/NaT) dates are skipped."""
    has_before = pd.notna(before_end)
    has_after = pd.notna(after_start)

    if has_before:
        ax.axvline(before_end, color='red', linestyle='--', linewidth=1.5, alpha=0.8,
                   label=f'Before end ({before_end:%Y-%m-%d})')
    if has_after:
        ax.axvline(after_start, color='green', linestyle='--', linewidth=1.5, alpha=0.8,
                   label=f'After start ({after_start:%Y-%m-%d})')
    # Shade the excluded gap between the two periods when both ends are known.
    if has_before and has_after and before_end < after_start:
        ax.axvspan(before_end, after_start, color='gray', alpha=0.15)


def plot_stacked_timeseries(site_name, df, flow_config=None):
    """
    Create stacked time series plot for a site showing HIPS, FTIR EC, and Aethalometer.

    Each available column is drawn as a scatter plot with a 30-day rolling mean
    (only when more than 10 valid points exist) and a mean/median annotation.
    When `flow_config` carries parseable dates, the flow-fix boundaries are
    drawn on every subplot.

    Parameters:
    -----------
    site_name : str
        Used in the figure title.
    df : DataFrame with date, hips_fabs, ftir_ec, ir_bcc columns
        Missing measurement columns are tolerated and shown as "not available".
    flow_config : dict with before_end and after_start dates (optional)
        Values that are absent or cannot be parsed as dates (for example a
        'YYYY-MM-DD' placeholder) are ignored.

    Returns:
    --------
    The matplotlib Figure (also displayed via plt.show()).
    """
    # Unparseable or missing dates become NaT/None and are skipped when drawing.
    flow_config = flow_config or {}
    before_end = pd.to_datetime(flow_config.get('before_end'), errors='coerce')
    after_start = pd.to_datetime(flow_config.get('after_start'), errors='coerce')

    fig, axes = plt.subplots(3, 1, figsize=(16, 12), sharex=True)

    # Data columns and labels
    # NOTE(review): the first label says "HIPS Fabs / MAC" but this function plots
    # `hips_fabs` as-is; presumably the MAC conversion happens upstream — confirm.
    plot_configs = [
        ('hips_fabs', 'HIPS Fabs / MAC (µg/m³)', '#9467bd'),  # Purple
        ('ftir_ec', 'FTIR EC (µg/m³)', '#d62728'),            # Red
        ('ir_bcc', 'Aethalometer IR BCc (µg/m³)', '#1f77b4')  # Blue
    ]

    dates = df['date']

    for ax, (col, ylabel, plot_color) in zip(axes, plot_configs):
        if col in df.columns:
            valid_mask = df[col].notna()
            valid_dates = dates[valid_mask]
            valid_data = df.loc[valid_mask, col]

            # Scatter plot
            ax.scatter(valid_dates, valid_data, c=plot_color, alpha=0.7, s=50,
                       edgecolors='black', linewidth=0.3, label=f'n={len(valid_data)}')

            # Add rolling mean (30-day window); needs a sorted datetime index
            if len(valid_data) > 10:
                df_temp = pd.DataFrame({'date': valid_dates, 'value': valid_data}).set_index('date').sort_index()
                rolling_mean = df_temp['value'].rolling(window='30D', min_periods=3).mean()
                ax.plot(rolling_mean.index, rolling_mean.values, color='black',
                        linewidth=2, alpha=0.7, label='30-day rolling mean')

            # Add statistics text
            stats_text = f"Mean: {valid_data.mean():.2f}, Median: {valid_data.median():.2f}"
            ax.text(0.02, 0.95, stats_text, transform=ax.transAxes, fontsize=9,
                    verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
        else:
            ax.text(0.5, 0.5, f'{col} not available', transform=ax.transAxes,
                    ha='center', va='center', fontsize=12, color='gray')

        # Draw the configured boundaries before the legend so they are listed in it.
        _draw_flow_fix_markers(ax, before_end, after_start)

        ax.set_ylabel(ylabel, fontsize=11)
        # Only build a legend when something labelled was drawn.
        handles, _ = ax.get_legend_handles_labels()
        if handles:
            ax.legend(loc='upper right', fontsize=9)

        ax.grid(True, alpha=0.3)
        ax.set_ylim(bottom=0)

    # Format x-axis (shared, so only the bottom axis needs tick formatting)
    axes[-1].set_xlabel('Date', fontsize=11)
    axes[-1].xaxis.set_major_locator(mdates.MonthLocator(interval=2))
    axes[-1].xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    plt.setp(axes[-1].xaxis.get_majorticklabels(), rotation=45, ha='right')

    # Title
    date_range = f"{dates.min().strftime('%Y-%m-%d')} to {dates.max().strftime('%Y-%m-%d')}"
    fig.suptitle(f'{site_name}: Time Series Overview\n{date_range}',
                 fontsize=14, fontweight='bold', y=1.02)

    plt.tight_layout()
    plt.show()

    return fig

In [None]:
# Stacked overview for every site that has matched data
for site_name in SITES:
    if site_name not in all_params_data:
        continue

    print(f"\n{'='*70}")
    print(f"{site_name}")
    print(f"{'='*70}")

    # Sites without a configured flow-fix entry get an empty dict
    flow_config = FLOW_FIX_PERIODS.get(site_name, {})
    plot_stacked_timeseries(site_name, all_params_data[site_name], flow_config)

---

## Rolling Correlation Analysis

Calculate rolling correlation between measurements to identify when relationships changed.
A sudden change in correlation might indicate instrument issues or fixes.

In [None]:
def plot_rolling_correlation(site_name, df, window_days=60):
    """
    Plot rolling correlation between measurement types over time.

    Three stacked panels are drawn (Aeth BC vs FTIR EC, HIPS vs FTIR EC,
    HIPS vs Aeth BC). A panel is only computed when both columns exist and
    more than `window_days // 2` rows have both values; otherwise it shows
    an explanatory message.

    Parameters:
    -----------
    site_name : str
        Used in the figure title.
    df : DataFrame with a `date` column plus the measurement columns
    window_days : int
        Length of the time-based rolling window, in days.

    Returns:
    --------
    The matplotlib Figure (also displayed via plt.show()).
    """
    fig, axes = plt.subplots(3, 1, figsize=(16, 10), sharex=True)

    # Time-based rolling windows need a sorted datetime index.
    df = df.copy().set_index('date').sort_index()

    correlations = [
        ('ir_bcc', 'ftir_ec', 'Aeth BC vs FTIR EC', '#1f77b4'),
        ('hips_fabs', 'ftir_ec', 'HIPS vs FTIR EC', '#9467bd'),
        ('hips_fabs', 'ir_bcc', 'HIPS vs Aeth BC', '#2ca02c')
    ]

    for ax, (col1, col2, label, color) in zip(axes, correlations):
        # Default lower limit; widened below if the data goes more negative.
        y_lower = -0.2

        if col1 in df.columns and col2 in df.columns:
            # Rows where both measurements are present
            valid_mask = df[col1].notna() & df[col2].notna()

            if valid_mask.sum() > window_days // 2:
                rolling_corr = df[col1].rolling(window=f'{window_days}D', min_periods=5).corr(df[col2])

                ax.plot(rolling_corr.index, rolling_corr.values, color=color, linewidth=2, alpha=0.8)
                ax.fill_between(rolling_corr.index, rolling_corr.values, alpha=0.3, color=color)

                # Add reference lines
                ax.axhline(y=1.0, color='green', linestyle='--', alpha=0.5, label='Perfect correlation')
                ax.axhline(y=0.8, color='orange', linestyle=':', alpha=0.5, label='R=0.8')
                ax.axhline(y=0.5, color='red', linestyle=':', alpha=0.5, label='R=0.5')

                # Overall correlation
                overall_corr = df.loc[valid_mask, col1].corr(df.loc[valid_mask, col2])
                ax.text(0.02, 0.95, f'Overall R: {overall_corr:.3f}', transform=ax.transAxes,
                        fontsize=10, va='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

                # Keep strongly negative correlations visible instead of
                # clipping them at the fixed lower limit.
                lowest = rolling_corr.min()
                if pd.notna(lowest) and lowest < y_lower:
                    y_lower = max(lowest - 0.05, -1.05)
            else:
                ax.text(0.5, 0.5, 'Insufficient data for rolling correlation',
                        transform=ax.transAxes, ha='center', va='center')
        else:
            ax.text(0.5, 0.5, f'Missing columns: {col1} or {col2}',
                    transform=ax.transAxes, ha='center', va='center')

        ax.set_ylabel(f'{label}\nCorrelation (R)', fontsize=10)
        ax.set_ylim(y_lower, 1.1)
        ax.grid(True, alpha=0.3)
        # Panels that only show a message have nothing to put in a legend.
        handles, _ = ax.get_legend_handles_labels()
        if handles:
            ax.legend(loc='lower right', fontsize=8)

    axes[-1].set_xlabel('Date', fontsize=11)
    axes[-1].xaxis.set_major_locator(mdates.MonthLocator(interval=2))
    axes[-1].xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    plt.setp(axes[-1].xaxis.get_majorticklabels(), rotation=45, ha='right')

    fig.suptitle(f'{site_name}: Rolling {window_days}-Day Correlation',
                 fontsize=14, fontweight='bold', y=1.02)

    plt.tight_layout()
    plt.show()

    return fig

In [None]:
# Rolling-correlation figure for every site that has matched data
for site_name in SITES:
    if site_name not in all_params_data:
        continue

    print(f"\n{'='*70}")
    print(f"{site_name}: Rolling Correlation")
    print(f"{'='*70}")

    plot_rolling_correlation(site_name, all_params_data[site_name], window_days=60)

---

## Ratio Time Series (Aeth/FTIR and HIPS/FTIR)

Plot the ratio of measurements over time. Changes in the ratio might indicate calibration issues or flow problems.

In [None]:
def plot_ratio_timeseries(site_name, df):
    """
    Plot measurement ratios over time to identify systematic changes.

    Two stacked panels are drawn: Aeth BC / FTIR EC and HIPS / FTIR EC.
    Rows are used only when both values are present and the denominator
    exceeds 0.01. Extreme ratios are hidden from the scatter, but the rolling
    mean and the mean/median annotation use every valid ratio.

    Parameters:
    -----------
    site_name : str
        Used in the figure title.
    df : DataFrame with a `date` column plus the measurement columns
        The frame is not modified.

    Returns:
    --------
    The matplotlib Figure (also displayed via plt.show()).
    """
    fig, axes = plt.subplots(2, 1, figsize=(16, 8), sharex=True)

    dates = df['date']

    ratios = [
        ('ir_bcc', 'ftir_ec', 'Aeth BC / FTIR EC', '#1f77b4'),
        ('hips_fabs', 'ftir_ec', 'HIPS / FTIR EC', '#9467bd')
    ]

    for ax, (num_col, denom_col, label, color) in zip(axes, ratios):
        if num_col in df.columns and denom_col in df.columns:
            # Calculate ratio (avoid division by zero)
            valid_mask = (df[num_col].notna() & df[denom_col].notna() &
                          (df[denom_col] > 0.01))  # Minimum threshold

            ratio = df.loc[valid_mask, num_col] / df.loc[valid_mask, denom_col]
            valid_dates = dates[valid_mask]

            # Remove extreme outliers for visualization: keep points between
            # half the 5th percentile and twice the 95th percentile.
            q1, q3 = ratio.quantile(0.05), ratio.quantile(0.95)
            display_mask = (ratio >= q1 * 0.5) & (ratio <= q3 * 2)

            ax.scatter(valid_dates[display_mask], ratio[display_mask],
                       c=color, alpha=0.6, s=50, edgecolors='black', linewidth=0.3)

            # Rolling mean (time-based window needs a sorted datetime index)
            df_temp = pd.DataFrame({'date': valid_dates, 'ratio': ratio}).set_index('date').sort_index()
            rolling_mean = df_temp['ratio'].rolling(window='30D', min_periods=3).mean()
            ax.plot(rolling_mean.index, rolling_mean.values, color='black',
                    linewidth=2, alpha=0.8, label='30-day rolling mean')

            # Reference line at 1.0
            ax.axhline(y=1.0, color='green', linestyle='--', linewidth=2, alpha=0.7, label='Ratio = 1.0')

            # Stats
            mean_ratio = ratio.mean()
            median_ratio = ratio.median()
            ax.text(0.02, 0.95, f'Mean: {mean_ratio:.2f}, Median: {median_ratio:.2f}, n={len(ratio)}',
                    transform=ax.transAxes, fontsize=10, va='top',
                    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
        else:
            # Name the columns so the reader knows what to look for upstream.
            ax.text(0.5, 0.5, f'Missing columns: {num_col} or {denom_col}',
                    transform=ax.transAxes, ha='center', va='center')

        ax.set_ylabel(label, fontsize=11)
        ax.grid(True, alpha=0.3)
        # Panels that only show a message have nothing to put in a legend.
        handles, _ = ax.get_legend_handles_labels()
        if handles:
            ax.legend(loc='upper right', fontsize=9)
        ax.set_ylim(0, 3)  # Reasonable range for ratios

    axes[-1].set_xlabel('Date', fontsize=11)
    axes[-1].xaxis.set_major_locator(mdates.MonthLocator(interval=2))
    axes[-1].xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    plt.setp(axes[-1].xaxis.get_majorticklabels(), rotation=45, ha='right')

    fig.suptitle(f'{site_name}: Measurement Ratios Over Time',
                 fontsize=14, fontweight='bold', y=1.02)

    plt.tight_layout()
    plt.show()

    return fig

In [None]:
# Ratio figure for every site that has matched data
for site_name in SITES:
    if site_name not in all_params_data:
        continue

    print(f"\n{'='*70}")
    print(f"{site_name}: Measurement Ratios")
    print(f"{'='*70}")

    plot_ratio_timeseries(site_name, all_params_data[site_name])

---

## Interactive Date Range Explorer

Use the cells below to test different before/after dates for each site and see the impact on the R² of each comparison.

In [None]:
def _period_r2(period_df, col1, col2, min_samples=3):
    """Return (R², n_pairs) for two columns over one period; R² is NaN below min_samples pairs."""
    paired = period_df[[col1, col2]].dropna()
    n_pairs = len(paired)
    if n_pairs < min_samples:
        return np.nan, n_pairs
    r = paired[col1].corr(paired[col2])
    return r ** 2, n_pairs


def test_flow_fix_dates(site_name, df, before_end, after_start):
    """
    Test specific before/after dates and calculate statistics for each period.

    Splits the data into a "before" period (date <= before_end), an "after"
    period (date >= after_start) and the gap in between, prints the sample
    counts, and prints R² per period for each comparison.

    Parameters:
    -----------
    site_name : str
        Used in the printed header only.
    df : DataFrame
        Must contain a `date` column; the input frame is not modified.
    before_end : str, date string (e.g., '2023-06-15')
    after_start : str, date string (e.g., '2023-07-01')

    Returns:
    --------
    dict keyed by comparison label, each value holding `before_r2`,
    `after_r2`, `delta` (after minus before), `n_before` and `n_after`.
    R² (and therefore delta) is NaN for a period with fewer than 3 paired
    samples. Comparisons whose columns are missing are reported in the
    printed table but not included in the dict.
    """
    df = df.copy()
    df['date'] = pd.to_datetime(df['date'])

    before_dt = pd.to_datetime(before_end)
    after_dt = pd.to_datetime(after_start)

    before_data = df[df['date'] <= before_dt]
    after_data = df[df['date'] >= after_dt]
    gap_data = df[(df['date'] > before_dt) & (df['date'] < after_dt)]

    print(f"\n{'='*70}")
    print(f"{site_name}: Testing Flow Fix Dates")
    print(f"  Before end: {before_end}")
    print(f"  After start: {after_start}")
    print(f"{'='*70}")

    # Both bounds are inclusive, so equal or inverted dates put the same
    # samples in both periods; warn rather than fail during exploration.
    if after_dt <= before_dt:
        print("  WARNING: after_start is not later than before_end; "
              "the periods overlap and shared samples are counted in both.")

    print(f"\nSample counts:")
    print(f"  Before: {len(before_data)}")
    print(f"  Gap: {len(gap_data)}")
    print(f"  After: {len(after_data)}")

    # Calculate correlations for each period
    comparisons = [
        ('ir_bcc', 'ftir_ec', 'Aeth BC vs FTIR EC'),
        ('hips_fabs', 'ftir_ec', 'HIPS vs FTIR EC'),
    ]

    print(f"\n{'Comparison':<25s} {'Before R²':<12s} {'After R²':<12s} {'ΔR²':<12s}")
    print("-" * 65)

    results = {}

    for col1, col2, label in comparisons:
        missing = [col for col in (col1, col2) if col not in df.columns]
        if missing:
            # Tell the user why the row is absent instead of dropping it silently.
            print(f"{label:<25s} skipped (missing column: {', '.join(missing)})")
            continue

        r2_before, n_before = _period_r2(before_data, col1, col2)
        r2_after, n_after = _period_r2(after_data, col1, col2)

        # NaN propagates, so delta is NaN whenever either period is undefined.
        delta = r2_after - r2_before

        print(f"{label:<25s} {r2_before:<12.3f} {r2_after:<12.3f} {delta:<+12.3f}")

        results[label] = {
            'before_r2': r2_before,
            'after_r2': r2_after,
            'delta': delta,
            'n_before': n_before,
            'n_after': n_after
        }

    return results


# The name starts with "test_" but this is an analysis helper, not a unit test;
# opt out of pytest collection in case the notebook is exported as a module.
test_flow_fix_dates.__test__ = False

In [None]:
# ============================================================
# TEST YOUR DATES HERE
# ============================================================
# Pick candidate dates from the plots above, edit the two date
# arguments below, then re-run the cell to compare the periods.

# Example: Test dates for Beijing
if 'Beijing' in all_params_data:
    test_flow_fix_dates(
        site_name='Beijing',
        df=all_params_data['Beijing'],
        before_end='2023-06-01',    # <-- MODIFY THIS
        after_start='2023-07-01',   # <-- MODIFY THIS
    )

In [None]:
# Example: Test dates for JPL (skipped when JPL has no matched data)
if 'JPL' in all_params_data:
    test_flow_fix_dates(
        site_name='JPL',
        df=all_params_data['JPL'],
        before_end='2023-06-01',    # <-- MODIFY THIS
        after_start='2023-07-01',   # <-- MODIFY THIS
    )

In [None]:
# Example: Test dates for Delhi (skipped when Delhi has no matched data)
if 'Delhi' in all_params_data:
    test_flow_fix_dates(
        site_name='Delhi',
        df=all_params_data['Delhi'],
        before_end='2023-06-01',    # <-- MODIFY THIS
        after_start='2023-07-01',   # <-- MODIFY THIS
    )

In [None]:
# Example: Test dates for Addis_Ababa (skipped when the site has no matched data)
if 'Addis_Ababa' in all_params_data:
    test_flow_fix_dates(
        site_name='Addis_Ababa',
        df=all_params_data['Addis_Ababa'],
        before_end='2023-06-01',    # <-- MODIFY THIS
        after_start='2023-07-01',   # <-- MODIFY THIS
    )

---

## Summary: Data Availability by Site

In [None]:
print("\n" + "="*80)
print("DATA AVAILABILITY SUMMARY")
print("="*80)

print(f"\n{'Site':<15s} {'Date Range':<30s} {'n Total':<10s} {'HIPS':<8s} {'FTIR':<8s} {'Aeth':<8s}")
print("-" * 85)

# One table row per site with matched data; a missing column counts as 0
for site_name in SITES:
    if site_name not in all_params_data:
        continue

    df = all_params_data[site_name]
    date_range = f"{df['date'].min().strftime('%Y-%m-%d')} to {df['date'].max().strftime('%Y-%m-%d')}"

    # Non-null counts in the same order as the HIPS / FTIR / Aeth header columns
    n_hips, n_ftir, n_aeth = (
        df[col].notna().sum() if col in df.columns else 0
        for col in ('hips_fabs', 'ftir_ec', 'ir_bcc')
    )

    print(f"{site_name:<15s} {date_range:<30s} {len(df):<10d} {n_hips:<8d} {n_ftir:<8d} {n_aeth:<8d}")

---

## Recommended Flow Fix Dates

Based on your analysis, update the `FLOW_FIX_PERIODS` dictionary in `scripts/config.py`:

```python
FLOW_FIX_PERIODS = {
    'Beijing': {
        'description': 'Flow rate correction',
        'before_end': 'YYYY-MM-DD',    # <-- Fill in
        'after_start': 'YYYY-MM-DD',   # <-- Fill in
        'notes': 'Your notes here'
    },
    'JPL': {
        'description': 'Flow rate correction',
        'before_end': 'YYYY-MM-DD',    # <-- Fill in
        'after_start': 'YYYY-MM-DD',   # <-- Fill in
        'notes': 'Your notes here'
    },
    'Delhi': {
        'description': 'Flow rate correction',
        'before_end': 'YYYY-MM-DD',    # <-- Fill in (if applicable)
        'after_start': 'YYYY-MM-DD',   # <-- Fill in (if applicable)
        'notes': 'Your notes here'
    },
    'Addis_Ababa': {
        'description': 'Flow rate correction',
        'before_end': 'YYYY-MM-DD',    # <-- Fill in (if applicable)
        'after_start': 'YYYY-MM-DD',   # <-- Fill in (if applicable)
        'notes': 'Your notes here'
    }
}
```

In [None]:
# Final summary cell - fill in your recommended dates
print("\n" + "="*80)
print("RECOMMENDED FLOW FIX DATES (fill in based on your analysis)")
print("="*80)

recommended_dates = {
    'Beijing': {
        'before_end': 'YYYY-MM-DD',  # <-- FILL IN
        'after_start': 'YYYY-MM-DD'  # <-- FILL IN
    },
    'JPL': {
        'before_end': 'YYYY-MM-DD',  # <-- FILL IN
        'after_start': 'YYYY-MM-DD'  # <-- FILL IN
    },
    'Delhi': {
        'before_end': None,  # Set to None if no flow fix period
        'after_start': None
    },
    'Addis_Ababa': {
        'before_end': None,  # Set to None if no flow fix period
        'after_start': None
    }
}

for site_name, dates in recommended_dates.items():
    print(f"\n{site_name}:")
    print(f"  before_end: {dates['before_end']}")
    print(f"  after_start: {dates['after_start']}")