
# Filter Data Availability Strip Chart

This notebook builds the site-level strip chart and supporting counts requested in the MAIA data availability review.



**Goals**

- Visualize sample-day availability for FTIR-EC, HIPS Fabs, and Aethalometer data across Addis Ababa (ETAD), Pasadena/JPL (USPA), Beijing (CHTS), and Delhi (INDH).
- Summarize the number of days with paired FTIR+HIPS measurements, and where available, days with all three data sets.
- Verify that JPL (USPA) has no FTIR/HIPS sample dates after December 2023 so we avoid analysis-date mix-ups.


In [None]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from pathlib import Path
import warnings

# Silence every warning for the rest of the session so cell output stays compact.
# NOTE(review): this also hides deprecation and dtype warnings from pandas/matplotlib.
warnings.filterwarnings('ignore')

# Shared look for every figure: seaborn 'whitegrid' style at the 'talk' context.
sns.set_theme(style='whitegrid', context='talk')

# Matplotlib overrides applied after the seaborn theme so they take precedence.
rc_overrides = {
    'figure.figsize': (12, 4),
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.facecolor': 'white',
}
plt.rcParams.update(rc_overrides)

print('✅ Libraries loaded')


In [None]:

# Paths and site configuration
# Every path is resolved relative to the directory the kernel was started in.
BASE_DIR = Path.cwd()
FILTER_DB_PATH = (BASE_DIR / "../FTIR_HIPS_Chem/Filter Data/unified_filter_dataset.pkl").resolve()

# Aethalometer exports keyed by site code; None marks a site with no local PKL committed yet.
AETH_PATHS = dict(
    ETAD=(BASE_DIR / "../FTIR_HIPS_Chem/df_Jacros_9am_resampled.pkl").resolve(),
    USPA=None,
    CHTS=None,
    INDH=None,
)

# Site code -> human-readable name used in titles and tables.
SITE_DISPLAY_NAMES = dict(
    ETAD='Addis Ababa',
    USPA='Pasadena / JPL',
    CHTS='Beijing',
    INDH='Delhi',
)

# Measurement rows in plotting order, their y-axis positions, and their marker colours.
MEASUREMENT_ORDER = ['FTIR-EC', 'HIPS Fabs', 'Aethalometer']
MEASUREMENT_POSITIONS = dict(zip(MEASUREMENT_ORDER, range(len(MEASUREMENT_ORDER))))
PALETTE = dict(zip(MEASUREMENT_ORDER, ('#1f77b4', '#ff7f0e', '#2ca02c')))

# Echo the resolved configuration so a wrong working directory is obvious immediately.
print(f"📂 Working directory: {BASE_DIR}")
print(f"📄 Filter DB path: {FILTER_DB_PATH}")
for site_code, aeth_path in AETH_PATHS.items():
    print(f"   Aeth path {site_code}: {aeth_path or '—'}")


In [None]:

# Load the unified filter dataset, keeping only rows with a parseable SampleDate
# that belong to one of the configured sites.
sites_of_interest = list(SITE_DISPLAY_NAMES.keys())

filter_df = (
    pd.read_pickle(FILTER_DB_PATH)
    # Unparseable dates become NaT here and are dropped in the next step.
    .assign(SampleDate=lambda frame: pd.to_datetime(frame['SampleDate'], errors='coerce'))
    .dropna(subset=['SampleDate'])
    .assign(
        # Truncate to midnight so every comparison downstream is per calendar day.
        SampleDate=lambda frame: frame['SampleDate'].dt.normalize(),
        Site=lambda frame: frame['Site'].astype(str),
    )
    .loc[lambda frame: frame['Site'].isin(sites_of_interest)]
    .copy()
)

print('🔢 Total filter records kept:', len(filter_df))
filter_df[['Site', 'DataSource', 'Parameter', 'SampleDate']].head()


In [None]:

# Helper functions to assemble availability records

def build_filter_measurement(df, data_source, parameter, label):
    """Return the distinct (Site, SampleDate) pairs for one filter measurement.

    Args:
        df: Frame with 'DataSource', 'Parameter', 'Site' and 'SampleDate' columns.
        data_source: Value that 'DataSource' must equal.
        parameter: Value that 'Parameter' must equal.
        label: Value written to the 'MeasurementType' column of the result.

    Returns:
        A new DataFrame with columns 'Site', 'SampleDate' and 'MeasurementType';
        duplicate site/date pairs are collapsed and ``df`` is left unmodified.
    """
    matches = (df['DataSource'] == data_source) & (df['Parameter'] == parameter)
    unique_days = df.loc[matches, ['Site', 'SampleDate']].drop_duplicates()
    return unique_days.assign(MeasurementType=label)

def _aethalometer_sample_dates(aeth_df):
    """Return the unique calendar days present in an aethalometer frame.

    Timestamps are read from the 'datetime_local' column when it exists and
    from the frame's index otherwise. Values that cannot be parsed are dropped,
    timezone information is removed, and each timestamp is truncated to midnight.

    Args:
        aeth_df: DataFrame loaded from an aethalometer PKL export.

    Returns:
        A Series of unique, timezone-naive, midnight-normalized timestamps
        (empty when no timestamp could be parsed).
    """
    if 'datetime_local' in aeth_df.columns:
        raw_times = aeth_df['datetime_local']
    else:
        raw_times = aeth_df.index
    # pd.to_datetime returns a DatetimeIndex for index input, and an Index has no
    # `.dt` accessor; wrapping in a Series lets both inputs share one code path.
    stamps = pd.Series(pd.to_datetime(raw_times, errors='coerce')).dropna()
    if stamps.dt.tz is not None:
        stamps = stamps.dt.tz_localize(None)
    return stamps.dt.normalize().drop_duplicates().reset_index(drop=True)


# Filter-based measurements always contribute one frame each (possibly empty).
availability_frames = [
    build_filter_measurement(filter_df, 'FTIR', 'EC_ftir', 'FTIR-EC'),
    build_filter_measurement(filter_df, 'HIPS', 'HIPS_Fabs', 'HIPS Fabs')
]

# Sites whose aethalometer export is unconfigured, missing on disk, or has no usable dates.
missing_aeth_sites = []
for site_code, path in AETH_PATHS.items():
    if not (path and path.exists()):
        missing_aeth_sites.append(site_code)
        continue

    # NOTE: unpickling can execute arbitrary code; only load trusted project exports.
    aeth_df = pd.read_pickle(path)
    dates = _aethalometer_sample_dates(aeth_df)
    if dates.empty:
        missing_aeth_sites.append(site_code)
        continue

    availability_frames.append(pd.DataFrame({
        'Site': site_code,
        'SampleDate': dates.to_numpy(),
        'MeasurementType': 'Aethalometer'
    }))

# availability_frames always holds at least the two filter frames, so concat is safe.
chart_data = pd.concat(availability_frames, ignore_index=True)

chart_data['SiteName'] = chart_data['Site'].map(SITE_DISPLAY_NAMES)
chart_data['MeasurementType'] = pd.Categorical(chart_data['MeasurementType'], categories=MEASUREMENT_ORDER, ordered=True)
chart_data = chart_data.dropna(subset=['SampleDate']).drop_duplicates()
# Numeric y-position of each measurement row in the strip charts.
chart_data['MeasurementPos'] = chart_data['MeasurementType'].map(MEASUREMENT_POSITIONS).astype(float)

print('📈 Availability records:', len(chart_data))
if missing_aeth_sites:
    print('⚠️ Aethalometer PKL not found for sites:', ', '.join(sorted(missing_aeth_sites)))

chart_data.head()


In [None]:

# Per-site day counts for each measurement, plus paired (FTIR+HIPS) and triple overlaps
count_rows = []
for site in SITE_DISPLAY_NAMES:
    site_records = chart_data[chart_data['Site'] == site]

    # Distinct sample days per measurement type at this site.
    days_by_type = {
        measurement: set(
            site_records.loc[site_records['MeasurementType'] == measurement, 'SampleDate']
        )
        for measurement in MEASUREMENT_ORDER
    }
    ftir_days = days_by_type['FTIR-EC']
    hips_days = days_by_type['HIPS Fabs']
    aeth_days = days_by_type['Aethalometer']

    paired_days = ftir_days & hips_days
    # Intersecting with an empty aethalometer set yields an empty set, i.e. a count of 0.
    triple_days = paired_days & aeth_days

    count_rows.append({
        'Site': SITE_DISPLAY_NAMES[site],
        'FTIR Days': len(ftir_days),
        'HIPS Days': len(hips_days),
        'Aeth Days': len(aeth_days),
        'FTIR + HIPS Days': len(paired_days),
        'FTIR + HIPS + Aeth Days': len(triple_days)
    })

counts_df = pd.DataFrame(count_rows)
counts_df


In [None]:

# Sanity check: USPA (Pasadena/JPL) must have no FTIR/HIPS sample dates on or after 2024-01-01
is_uspa = chart_data['Site'] == 'USPA'
is_filter_measurement = chart_data['MeasurementType'].isin(['FTIR-EC', 'HIPS Fabs'])
is_2024_or_later = chart_data['SampleDate'] >= '2024-01-01'
jpl_late = chart_data[is_uspa & is_filter_measurement & is_2024_or_later]

if jpl_late.empty:
    print('✅ JPL (USPA) shows no FTIR/HIPS sample dates after 2023-12-31.')
else:
    # Show the offending rows before failing so the cause is visible in the output.
    display(jpl_late.sort_values('SampleDate'))
    raise ValueError('Unexpected USPA sample dates detected in 2024 – investigate sample vs analysis dates.')


In [None]:

# Create individual strip charts per site for better readability.
# One figure is drawn per configured site, with one row of markers per measurement type.
if chart_data.empty:
    print('No availability data to plot.')
else:
    # Proxy artists give the legend one entry per measurement type, even for
    # measurement types that have no points at the site carrying the legend.
    legend_handles = [
        Line2D([0], [0], marker='o', color='w', markerfacecolor=PALETTE[m],
               markeredgecolor='black', markersize=10, label=m) for m in MEASUREMENT_ORDER
    ]
    # The legend goes on the first site that has data, not blindly on the first site,
    # so it is not lost when the first configured site is empty.
    legend_drawn = False

    for site_code, site_label in SITE_DISPLAY_NAMES.items():
        site_df = chart_data[chart_data['Site'] == site_code]
        fig, ax = plt.subplots(figsize=(12, 3.2))
        # Title every figure, including empty ones, so each panel identifies its site.
        ax.set_title(f'{site_label} Sample-Day Availability')

        if site_df.empty:
            ax.text(0.5, 0.5, 'No data available', transform=ax.transAxes,
                    ha='center', va='center', fontsize=12, fontweight='bold')
            ax.set_axis_off()
        else:
            for measurement in MEASUREMENT_ORDER:
                measurement_df = site_df[site_df['MeasurementType'] == measurement]
                if not measurement_df.empty:
                    ax.scatter(
                        measurement_df['SampleDate'],
                        [MEASUREMENT_POSITIONS[measurement]] * len(measurement_df),
                        color=PALETTE[measurement],
                        edgecolor='black',
                        linewidth=0.4,
                        s=65
                    )

            ax.set_yticks(list(MEASUREMENT_POSITIONS.values()))
            ax.set_yticklabels(MEASUREMENT_ORDER)
            # Half-unit padding keeps the top and bottom marker rows off the frame.
            ax.set_ylim(-0.5, len(MEASUREMENT_ORDER) - 0.5)
            ax.set_xlabel('Sample Date')
            ax.set_ylabel('')
            ax.grid(axis='y', linestyle=':', alpha=0.35)
            ax.margins(x=0.02)

            if not legend_drawn:
                ax.legend(handles=legend_handles, loc='upper right', frameon=True)
                legend_drawn = True

        plt.tight_layout()
        plt.show()


In [None]:

# Compact view of the counts: one row per site, ordered alphabetically by site name
counts_display = (
    counts_df
    .set_index(keys='Site')
    .sort_index(axis=0, ascending=True)
)
counts_display



### Notes

- Aethalometer availability is currently ingested only for Addis Ababa (ETAD); the other sites will populate once their PKL exports are committed.
- All availability calculations rely on *SampleDate* to avoid the analysis-date mix-up noted for JPL 2024. The validation cell confirms there are no USPA FTIR/HIPS samples dated after December 2023.
- The counts table highlights days with paired FTIR+HIPS observations and, where available, triple overlaps including the aethalometer.
