# Multi-Site Aethalometer and Filter Data Analysis

This notebook analyzes aethalometer and filter data from four global sites:
- Beijing, China (CHTS)
- Delhi, India (INDH)
- JPL, California (USPA)
- Addis Ababa, Ethiopia (ETAD)

Each visualization creates:
1. A combined plot with all sites
2. Individual plots for each site

## Setup and Data Loading

In [None]:
# Import libraries
import os
import pickle
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Configure matplotlib
%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10

print("Libraries imported successfully!")

In [None]:
# Define paths
# The data root defaults to the original author's checkout, so existing runs
# behave exactly as before. On another machine, set the AETHMODULAR_DATA_ROOT
# environment variable to the folder that contains 'processed_sites' and
# 'Filter Data' instead of editing this cell.
DATA_ROOT = Path(os.environ.get(
    'AETHMODULAR_DATA_ROOT',
    '/Users/ahmadjalil/Github/aethmodular/FTIR_HIPS_Chem'
)).expanduser()
PROCESSED_SITES_DIR = DATA_ROOT / 'processed_sites'
FILTER_DATA_PATH = DATA_ROOT / 'Filter Data' / 'unified_filter_dataset.pkl'

# Site configurations, keyed by the site name used throughout the notebook:
#   file     - pickle file name inside PROCESSED_SITES_DIR
#   code     - site code matched against the 'Site' column of the filter dataset
#   color    - hex colour used for this site in every plot
#   location - human-readable location shown in the summary table
SITES = {
    'Beijing': {
        'file': 'df_Beijing_9am_resampled.pkl',
        'code': 'CHTS',
        'color': '#E74C3C',  # Red
        'location': 'Beijing, China'
    },
    'Delhi': {
        'file': 'df_Delhi_9am_resampled.pkl',
        'code': 'INDH',
        'color': '#3498DB',  # Blue
        'location': 'Delhi, India'
    },
    'JPL': {
        'file': 'df_JPL_9am_resampled.pkl',
        'code': 'USPA',
        'color': '#2ECC71',  # Green
        'location': 'Pasadena, USA'
    },
    'Addis_Ababa': {
        'file': 'df_Addis_Ababa_9am_resampled.pkl',
        'code': 'ETAD',
        'color': '#F39C12',  # Orange
        'location': 'Addis Ababa, Ethiopia'
    }
}

print("Paths and configurations defined!")

In [None]:
# Load the processed aethalometer DataFrame of every configured site.
# NOTE: pickle files can execute arbitrary code when loaded; only point this
# at files you produced yourself.
aethalometer_data = {}

for site_name, config in SITES.items():
    file_path = PROCESSED_SITES_DIR / config['file']

    # A missing file is reported and the site is simply left out of the dict
    if not file_path.exists():
        print(f"✗ File not found: {file_path}")
        continue

    with open(file_path, 'rb') as f:
        df = pickle.load(f)

    # Parse day_9am into datetimes before the frame is stored
    df['day_9am'] = pd.to_datetime(df['day_9am'])
    aethalometer_data[site_name] = df

    print(f"✓ Loaded {site_name}: {len(df)} records, {df['day_9am'].min().date()} to {df['day_9am'].max().date()}")

print(f"\nTotal sites loaded: {len(aethalometer_data)}")

In [None]:
# Load the unified filter dataset (long format: one row per measurement).
# NOTE: pickle files can execute arbitrary code when loaded; only use trusted files.
with FILTER_DATA_PATH.open('rb') as f:
    filter_data = pickle.load(f)

# SampleDate is parsed once here so later cells can compare and plot dates
filter_data['SampleDate'] = pd.to_datetime(filter_data['SampleDate'])

# Quick overview of what was loaded
print(f"Filter dataset loaded: {len(filter_data)} measurements")
print(f"Sites in filter data: {filter_data['Site'].unique()}")
print(f"Date range: {filter_data['SampleDate'].min().date()} to {filter_data['SampleDate'].max().date()}")
print(f"\nTotal unique parameters: {filter_data['Parameter'].nunique()}")

# Categorize parameters by measurement method/type
print("\n" + "="*80)
print("FILTER DATA BREAKDOWN BY MEASUREMENT TYPE")
print("="*80)

# Category label -> parameter names as they appear in the 'Parameter' column
categories = {
    'ChemSpec EC/OC': [
        'ChemSpec_EC_PM2.5', 'ChemSpec_OC_PM2.5',
        'ChemSpec_OM_PM2.5', 'ChemSpec_BC_PM2.5',
    ],
    'FTIR EC/OC': ['EC_ftir', 'OC_ftir', 'OM'],
    'FTIR Functional Groups': ['alcoholCOH', 'alkaneCH', 'carboxylicCOOH', 'naCO'],
    'HIPS': [
        'HIPS_T1', 'HIPS_Slope', 'HIPS_Intercept', 'HIPS_R1', 'HIPS_t',
        'HIPS_tau', 'HIPS_r', 'HIPS_Fabs', 'HIPS_Uncertainty', 'HIPS_MDL',
    ],
    'ChemSpec Ions': [
        'ChemSpec_Sulfate_Ion_PM2.5', 'ChemSpec_Nitrate_Ion_PM2.5',
        'ChemSpec_Ammonium_Ion_PM2.5', 'ChemSpec_Chloride_Ion_PM2.5',
        'ChemSpec_Sodium_Ion_PM2.5', 'ChemSpec_Potassium_Ion_PM2.5',
        'ChemSpec_Magnesium_Ion_PM2.5', 'ChemSpec_Calcium_Ion_PM2.5',
    ],
    'ChemSpec Metals': [
        'ChemSpec_Iron_PM2.5', 'ChemSpec_Aluminum_PM2.5',
        'ChemSpec_Silicon_PM2.5', 'ChemSpec_Sulfur_PM2.5',
        'ChemSpec_Calcium_PM2.5', 'ChemSpec_Potassium_PM2.5',
        'ChemSpec_Zinc_PM2.5', 'ChemSpec_Lead_PM2.5',
        'ChemSpec_Copper_PM2.5', 'ChemSpec_Manganese_PM2.5',
    ],
}

# Per-category totals and per-parameter counts; categories with no rows are skipped
for category, params in categories.items():
    existing_params = [p for p in params if p in filter_data['Parameter'].values]
    if not existing_params:
        continue

    category_data = filter_data[filter_data['Parameter'].isin(existing_params)]
    param_counts = category_data['Parameter'].value_counts()

    print(f"\n{category}:")
    print(f"  Total measurements: {len(category_data)}")
    print(f"  Parameters ({len(existing_params)}):")

    # List at most the first 10 parameters, then summarise the remainder
    for param in existing_params[:10]:
        if param not in param_counts.index:
            continue
        count = param_counts[param]
        print(f"    - {param}: {count}")

    if len(existing_params) > 10:
        print(f"    ... and {len(existing_params) - 10} more parameters")

# Parameters present in the data but not named in any category above
all_categorized = [name for group in categories.values() for name in group]
is_categorized = filter_data['Parameter'].isin(all_categorized)
uncategorized = filter_data.loc[~is_categorized, 'Parameter'].unique()

if len(uncategorized) > 0:
    print(f"\nOther Parameters ({len(uncategorized)}):")
    print(f"  {list(uncategorized[:10])}")
    if len(uncategorized) > 10:
        print(f"  ... and {len(uncategorized) - 10} more")

print("\n" + "="*80)
print("BREAKDOWN BY SITE")
print("="*80)

# One summary per site code found in the filter data, in sorted order
for site in sorted(filter_data['Site'].unique()):
    site_data = filter_data[filter_data['Site'] == site]
    site_dates = site_data['SampleDate']

    print(f"\n{site}:")
    print(f"  Total measurements: {len(site_data)}")
    print(f"  Unique filters: {site_data['FilterId'].nunique()}")
    print(f"  Date range: {site_dates.min().date()} to {site_dates.max().date()}")

    # Row count per measurement category; categories without rows are omitted
    print(f"  Top measurement types:")
    for category, params in categories.items():
        category_count = int(site_data['Parameter'].isin(params).sum())
        if category_count == 0:
            continue
        print(f"    - {category}: {category_count} measurements")

## Helper Functions for Plotting

In [None]:
def _finish_axes(ax, title):
    """Apply the shared title, legend and grid styling to one axes."""
    ax.set_title(title, fontsize=14, fontweight='bold')
    # Only request a legend when something labelled was drawn; calling
    # ax.legend() on an axes without labelled artists makes Matplotlib warn
    # ("No artists with labels found to put in legend").
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        ax.legend(loc='best')
    ax.grid(True, alpha=0.3)


def plot_combined_and_individual(plot_function, title_base, **kwargs):
    """
    Create combined plot and individual plots for each site.

    Iterates over the module-level ``aethalometer_data`` dict and looks up each
    site's configuration in the module-level ``SITES`` dict. Every figure is
    shown with ``plt.show()``; nothing is returned.

    Parameters:
    -----------
    plot_function : callable
        Function that takes (ax, site_name, df, config, **kwargs) and creates a plot
    title_base : str
        Base title for the plots; " - All Sites" or " - <site name>" is appended
    **kwargs : dict
        Additional arguments to pass to plot_function
    """

    # 1. Combined plot: every site drawn onto the same axes
    fig, ax = plt.subplots(figsize=(14, 7))

    for site_name, df in aethalometer_data.items():
        plot_function(ax, site_name, df, SITES[site_name], **kwargs)

    _finish_axes(ax, f"{title_base} - All Sites")
    plt.tight_layout()
    plt.show()

    # 2. Individual plots: one fresh figure per site
    for site_name, df in aethalometer_data.items():
        fig, ax = plt.subplots(figsize=(12, 6))
        plot_function(ax, site_name, df, SITES[site_name], **kwargs)

        _finish_axes(ax, f"{title_base} - {site_name}")
        plt.tight_layout()
        plt.show()

print("Helper functions defined!")

## 1. Time Series: Black Carbon Concentrations

In [None]:
def plot_bc_timeseries(ax, site_name, df, config, wavelength='IR'):
    """Draw one site's '<wavelength> BCc' column against 'day_9am' on ``ax``.

    Rows where the BC column is missing are dropped before plotting; nothing
    is drawn when the column is absent or has no values. The axis labels and
    x tick rotation are set in every case. The line uses ``config['color']``
    and is labelled with ``site_name``.
    """
    bc_column = f'{wavelength} BCc'

    observed = df.dropna(subset=[bc_column]) if bc_column in df.columns else None
    if observed is not None and len(observed) > 0:
        ax.plot(
            observed['day_9am'],
            observed[bc_column],
            color=config['color'],
            label=f"{site_name}",
            alpha=0.7,
            linewidth=1.5,
        )

    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel(f'{wavelength} BC (ng/m³)', fontsize=12)
    ax.tick_params(axis='x', rotation=45)

# Render the IR time series: one all-sites figure, then one figure per site
plot_combined_and_individual(
    plot_function=plot_bc_timeseries,
    title_base="IR Black Carbon Time Series",
    wavelength='IR',
)

## 2. Multi-Wavelength BC Comparison

In [None]:
def plot_multiwavelength_bc(ax, site_name, df, config):
    """Plot the BC time series of every available wavelength for one site.

    For each of UV, Blue, Green, Red and IR, the '<wavelength> BCc' column is
    plotted against 'day_9am' when the column exists and has at least one
    value. Line colours come from the active Matplotlib colour cycle;
    ``config`` is accepted for signature compatibility but not used.

    Each line is labelled "<site_name> - <wavelength>". With the wavelength
    alone as label, the combined all-sites figure ended up with the same five
    legend entries repeated once per site and no way to tell the sites apart.
    """
    wavelengths = ['UV', 'Blue', 'Green', 'Red', 'IR']

    for wavelength in wavelengths:
        col_name = f'{wavelength} BCc'
        if col_name not in df.columns:
            continue

        valid_data = df[df[col_name].notna()]
        if len(valid_data) == 0:
            continue

        ax.plot(valid_data['day_9am'], valid_data[col_name],
                label=f"{site_name} - {wavelength}", alpha=0.6, linewidth=1)

    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel('BC (ng/m³)', fontsize=12)
    ax.tick_params(axis='x', rotation=45)

# Render the multi-wavelength comparison: all sites together, then per site
plot_combined_and_individual(
    plot_function=plot_multiwavelength_bc,
    title_base="Multi-Wavelength BC Comparison",
)

## 3. BC Distribution (Box Plots)

In [None]:
# Combined box plot: one box of IR BCc values per site
fig, ax = plt.subplots(figsize=(12, 6))

bc_data_combined = []
site_labels = []

# Collect the non-missing IR BCc values of every site that has any
for site_name, df in aethalometer_data.items():
    if 'IR BCc' not in df.columns:
        continue
    valid_data = df['IR BCc'].dropna()
    if len(valid_data) > 0:
        bc_data_combined.append(valid_data)
        site_labels.append(site_name)

if bc_data_combined:
    # Tick labels are set after the call rather than through boxplot's
    # `labels=` argument, which Matplotlib 3.9 deprecated in favour of
    # `tick_labels=`; set_xticklabels works on both older and newer versions.
    bp = ax.boxplot(bc_data_combined, patch_artist=True)
    ax.set_xticklabels(site_labels)

    # Color boxes with each site's configured colour
    for patch, site_name in zip(bp['boxes'], site_labels):
        patch.set_facecolor(SITES[site_name]['color'])
        patch.set_alpha(0.6)
else:
    # Nothing to draw: skip boxplot() instead of calling it with empty input
    print("No IR BCc data available for the combined box plot")

ax.set_title('IR BC Distribution - All Sites', fontsize=14, fontweight='bold')
ax.set_ylabel('IR BC (ng/m³)', fontsize=12)
ax.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

# Individual box plots (multi-wavelength): one figure per site, one box per wavelength
wavelengths = ['UV', 'Blue', 'Green', 'Red', 'IR']

for site_name, df in aethalometer_data.items():
    bc_data = []
    labels = []

    # Keep only wavelengths whose '<wavelength> BCc' column exists and has values
    for wavelength in wavelengths:
        col_name = f'{wavelength} BCc'
        if col_name in df.columns:
            valid_data = df[col_name].dropna()
            if len(valid_data) > 0:
                bc_data.append(valid_data)
                labels.append(wavelength)

    # Check for data BEFORE creating the figure, so a site without any BC
    # columns does not leave an empty figure behind.
    if len(bc_data) == 0:
        continue

    fig, ax = plt.subplots(figsize=(10, 6))

    # Tick labels are set after the call rather than through boxplot's
    # `labels=` argument, which Matplotlib 3.9 deprecated in favour of
    # `tick_labels=`; set_xticklabels works on both older and newer versions.
    bp = ax.boxplot(bc_data, patch_artist=True)
    ax.set_xticklabels(labels)

    for patch in bp['boxes']:
        patch.set_facecolor(SITES[site_name]['color'])
        patch.set_alpha(0.6)

    ax.set_title(f'BC Distribution by Wavelength - {site_name}', fontsize=14, fontweight='bold')
    ax.set_ylabel('BC (ng/m³)', fontsize=12)
    ax.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    plt.show()

## 4. Data Completeness Analysis

In [None]:
def plot_data_completeness(ax, site_name, df, config):
    """Plot data completeness over time for one site.

    When the frame has a 'data_completeness_pct' column it is drawn as a line
    against 'day_9am', together with dashed reference lines at 80 % and 50 %.
    Otherwise, if 'IR BCc' exists, each row is drawn as a scatter point at
    100 (value present) or 0 (value missing). Axis labels, the y range
    (-5 to 105) and the x tick rotation are always set.

    The reference lines are added only if the axes does not already carry a
    legend entry with the same label. Adding them on every call drew them once
    per site on the combined figure and repeated them in its legend.
    """
    if 'data_completeness_pct' in df.columns:
        ax.plot(df['day_9am'], df['data_completeness_pct'],
                color=config['color'], label=site_name,
                alpha=0.7, linewidth=1.5)

        _, existing_labels = ax.get_legend_handles_labels()
        thresholds = (
            (80, 'green', 'High quality (80%)'),
            (50, 'orange', 'Medium quality (50%)'),
        )
        for level, line_color, line_label in thresholds:
            if line_label not in existing_labels:
                ax.axhline(y=level, color=line_color, linestyle='--',
                           alpha=0.5, label=line_label)
    elif 'IR BCc' in df.columns:
        # For pre-resampled data, show BC availability (100 = present, 0 = missing)
        completeness = (df['IR BCc'].notna()).astype(int) * 100
        ax.scatter(df['day_9am'], completeness,
                   color=config['color'], label=site_name,
                   alpha=0.5, s=20)

    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel('Data Completeness (%)', fontsize=12)
    ax.set_ylim(-5, 105)
    ax.tick_params(axis='x', rotation=45)

# Render data completeness: all sites together, then one figure per site
plot_combined_and_individual(
    plot_function=plot_data_completeness,
    title_base="Data Completeness Over Time",
)

## 5. Filter vs Aethalometer Comparison

In [None]:
def plot_filter_vs_aethalometer(ax, site_name, df, config, individual=False):
    """Overlay a site's aethalometer IR BC and its filter EC as time series.

    Filter rows are taken from the module-level ``filter_data`` where 'Site'
    equals ``config['code']`` and 'Parameter' is 'ChemSpec_EC_PM2.5'. Nothing
    is drawn unless such rows exist and ``df`` has an 'IR BCc' column. Filter
    concentrations are multiplied by 1000 before plotting.

    With ``individual=False`` the filter line shares the site colour and is
    dotted; with ``individual=True`` it is solid and uses a contrasting
    colour (gray for unknown site names). Axis labels and the x tick rotation
    are always set.
    """
    # Contrasting colour per site, used only for the individual figures
    contrast_colors = {
        'Beijing': '#2ECC71',    # Green (opposite of Red)
        'Delhi': '#F39C12',      # Orange (opposite of Blue)
        'JPL': '#E74C3C',        # Red (opposite of Green)
        'Addis_Ababa': '#3498DB' # Blue (opposite of Orange)
    }

    is_site_ec = (
        (filter_data['Site'] == config['code'])
        & (filter_data['Parameter'] == 'ChemSpec_EC_PM2.5')
    )
    ec_filters = filter_data[is_site_ec].copy()

    if len(ec_filters) > 0 and 'IR BCc' in df.columns:
        # Aethalometer BC: solid line in the site colour
        aeth = df.dropna(subset=['IR BCc'])
        ax.plot(aeth['day_9am'], aeth['IR BCc'],
                color=config['color'], label=f'{site_name} - Aethalometer IR BC',
                alpha=0.7, linewidth=2, linestyle='-')

        # Filter EC in date order, scaled by 1000 to share the y axis
        ec_filters = ec_filters.sort_values('SampleDate')
        ec_filters['Concentration_ng'] = ec_filters['Concentration'] * 1000

        if individual:
            line_style = dict(color=contrast_colors.get(site_name, '#95A5A6'),
                              linestyle='-', linewidth=2, alpha=0.7)
        else:
            line_style = dict(color=config['color'],
                              linestyle=':', linewidth=2.5, alpha=0.8)

        ax.plot(ec_filters['SampleDate'], ec_filters['Concentration_ng'],
                label=f'{site_name} - Filter EC', **line_style)

    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel('Concentration (ng/m³)', fontsize=12)
    ax.tick_params(axis='x', rotation=45)

# Combined plot: every site on one axes (filter EC dotted, in the site colour)
fig, ax = plt.subplots(figsize=(14, 7))

for site_name, df in aethalometer_data.items():
    config = SITES[site_name]
    plot_filter_vs_aethalometer(ax, site_name, df, config, individual=False)

ax.set_title("Aethalometer BC vs Filter EC - All Sites", fontsize=14, fontweight='bold')
ax.legend(loc='best')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Individual plots: one figure per site (filter EC solid, in a contrasting colour)
for site_name, df in aethalometer_data.items():
    config = SITES[site_name]
    fig, ax = plt.subplots(figsize=(12, 6))

    plot_filter_vs_aethalometer(ax, site_name, df, config, individual=True)
    site_title = f"Aethalometer BC vs Filter EC - {site_name}"

    ax.set_title(site_title, fontsize=14, fontweight='bold')
    ax.legend(loc='best')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

## 6. Scatter: Aethalometer vs Filter EC Correlation

In [None]:
# Combined scatter plot: matched aethalometer BC / filter EC pairs of all sites
fig, ax = plt.subplots(figsize=(10, 10))

print("="*80)
print("MATCHING SUMMARY - Combined Plot")
print("="*80)

for site_name, df in aethalometer_data.items():
    config = SITES[site_name]
    site_code = config['code']

    # Filter EC rows of this site
    is_site_ec = (
        (filter_data['Site'] == site_code)
        & (filter_data['Parameter'] == 'ChemSpec_EC_PM2.5')
    )
    site_filters = filter_data[is_site_ec].copy()

    # Filter out values below or near MDL (exclude values < 0.5 ug/m3 = 500 ng/m3)
    # These are typically blanks or below detection limit
    site_filters = site_filters[site_filters['Concentration'] >= 0.5].copy()

    if len(site_filters) == 0 or 'IR BCc' not in df.columns:
        continue

    # Pair each filter sample with the mean IR BCc of all aethalometer rows
    # whose day_9am lies within ±1 day (inclusive) of the sample date
    matched_data = []
    for _, filter_row in site_filters.iterrows():
        filter_date = filter_row['SampleDate']
        window_start = filter_date - pd.Timedelta(days=1)
        window_end = filter_date + pd.Timedelta(days=1)

        date_match = df[(df['day_9am'] >= window_start) & (df['day_9am'] <= window_end)]
        if len(date_match) == 0 or not date_match['IR BCc'].notna().any():
            continue

        aeth_bc = date_match['IR BCc'].mean()
        filter_ec = filter_row['Concentration'] * 1000  # ug/m3 to ng/m3
        matched_data.append({'aeth_bc': aeth_bc, 'filter_ec': filter_ec})

    print(f"{site_name}: {len(site_filters)} filter dates → {len(matched_data)} matched pairs")

    if len(matched_data) > 0:
        matched_df = pd.DataFrame(matched_data)
        ax.scatter(matched_df['aeth_bc'], matched_df['filter_ec'],
                   color=config['color'], label=f'{site_name} (n={len(matched_data)})',
                   alpha=0.6, s=80, edgecolors='black', linewidth=1)

# Both axes start at the origin
ax.set_xlim(left=0)
ax.set_ylim(bottom=0)

# 1:1 reference line spanning the larger of the two axis ranges
max_val = max(ax.get_xlim()[1], ax.get_ylim()[1])
ax.plot([0, max_val], [0, max_val], 'k--', alpha=0.5, linewidth=1.5, label='1:1 line')

ax.set_xlabel('Aethalometer IR BC (ng/m³)', fontsize=12)
ax.set_ylabel('Filter EC (ng/m³)', fontsize=12)
ax.set_title('Aethalometer BC vs Filter EC - All Sites (EC ≥ 0.5 µg/m³)', fontsize=14, fontweight='bold')
ax.legend(loc='best')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\n" + "="*80)
print("INDIVIDUAL SCATTER PLOTS")
print("="*80 + "\n")

# Individual scatter plots: one figure per site with a least-squares fit line,
# a 1:1 reference line and a small statistics box.
for site_name, df in aethalometer_data.items():
    config = SITES[site_name]
    site_code = config['code']

    # Get filter EC data
    site_filters = filter_data[
        (filter_data['Site'] == site_code) &
        (filter_data['Parameter'] == 'ChemSpec_EC_PM2.5')
    ].copy()

    # Filter out values below MDL (keeps Concentration >= 0.5 only)
    site_filters = site_filters[site_filters['Concentration'] >= 0.5].copy()

    if len(site_filters) == 0 or 'IR BCc' not in df.columns:
        print(f"{site_name}: No filter EC data available")
        continue

    # Pair each filter sample with the mean IR BCc of the aethalometer rows
    # whose day_9am lies within ±1 day (inclusive) of the sample date
    matched_data = []
    for _, filter_row in site_filters.iterrows():
        filter_date = filter_row['SampleDate']

        date_match = df[
            (df['day_9am'] >= filter_date - pd.Timedelta(days=1)) &
            (df['day_9am'] <= filter_date + pd.Timedelta(days=1))
        ]

        if len(date_match) > 0 and date_match['IR BCc'].notna().any():
            aeth_bc = date_match['IR BCc'].mean()
            filter_ec = filter_row['Concentration'] * 1000
            matched_data.append({'aeth_bc': aeth_bc, 'filter_ec': filter_ec})

    if len(matched_data) == 0:
        print(f"{site_name}: No matching data")
        continue

    matched_df = pd.DataFrame(matched_data)

    print(f"{site_name}:")
    print(f"  Filter dates available (EC ≥ 0.5 µg/m³): {len(site_filters)}")
    print(f"  Matched pairs: {len(matched_df)}")
    print(f"  Aethalometer BC range: {matched_df['aeth_bc'].min():.1f} - {matched_df['aeth_bc'].max():.1f} ng/m³")
    print(f"  Filter EC range: {matched_df['filter_ec'].min():.1f} - {matched_df['filter_ec'].max():.1f} ng/m³")

    fig, ax = plt.subplots(figsize=(8, 8))

    ax.scatter(matched_df['aeth_bc'], matched_df['filter_ec'],
               color=config['color'], alpha=0.6, s=100,
               edgecolors='black', linewidth=1.5)

    # A straight-line fit needs at least two points with different x values.
    # Otherwise report the statistics as unavailable rather than printing a
    # made-up "R² = 0.000, y = 0.000x + 0.0".
    can_fit = len(matched_df) > 1 and matched_df['aeth_bc'].nunique() > 1

    if can_fit:
        # Linear regression: y = mx + b
        slope, intercept = np.polyfit(matched_df['aeth_bc'], matched_df['filter_ec'], 1)

        # Line of best fit drawn from x = 0 to the largest aethalometer value
        x_fit = np.array([0, matched_df['aeth_bc'].max()])
        y_fit = slope * x_fit + intercept
        ax.plot(x_fit, y_fit, color=config['color'], linestyle='-',
                linewidth=2.5, alpha=0.8, label='Best fit')

        # R² as the squared Pearson correlation of the matched pairs
        correlation = np.corrcoef(matched_df['aeth_bc'], matched_df['filter_ec'])[0, 1]
        r_squared = correlation ** 2

        if intercept >= 0:
            equation = f'y = {slope:.3f}x + {intercept:.1f}'
        else:
            equation = f'y = {slope:.3f}x - {abs(intercept):.1f}'
        stats_text = f'n = {len(matched_df)}\nR² = {r_squared:.3f}\n{equation}'
    else:
        stats_text = f'n = {len(matched_df)}\nR² = n/a\nfit = n/a (needs ≥ 2 distinct BC values)'

    # Set axis limits to start from 0,0
    ax.set_xlim(left=0)
    ax.set_ylim(bottom=0)

    # Add 1:1 line spanning the larger of the two axis ranges
    max_val = max(ax.get_xlim()[1], ax.get_ylim()[1])
    ax.plot([0, max_val], [0, max_val], 'k--', alpha=0.5, linewidth=1.5, label='1:1 line')

    # Statistics box in the upper-left corner (axes coordinates)
    ax.text(0.05, 0.95, stats_text,
            transform=ax.transAxes, fontsize=11,
            verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    ax.set_xlabel('Aethalometer IR BC (ng/m³)', fontsize=12)
    ax.set_ylabel('Filter EC (ng/m³)', fontsize=12)
    ax.set_title(f'Aethalometer BC vs Filter EC - {site_name}', fontsize=14, fontweight='bold')
    ax.legend(loc='lower right')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    print()

## 7. Summary Statistics Table

In [None]:
# Create summary table: one row per loaded site
summary_data = []

for site_name, df in aethalometer_data.items():
    config = SITES[site_name]

    # Identity and coverage
    summary = {
        'Site': site_name,
        'Location': config['location'],
        'Total Days': len(df),
        'Date Range': f"{df['day_9am'].min().date()} to {df['day_9am'].max().date()}"
    }

    # BC statistics over the non-missing IR BCc values
    if 'IR BCc' in df.columns:
        valid_bc = df['IR BCc'].dropna()
        summary.update({
            'Days with BC': len(valid_bc),
            'Mean IR BC (ng/m³)': f"{valid_bc.mean():.1f}",
            'Median IR BC (ng/m³)': f"{valid_bc.median():.1f}",
            'Max IR BC (ng/m³)': f"{valid_bc.max():.1f}",
        })

    # Data completeness, only where the column is available
    if 'data_completeness_pct' in df.columns:
        summary['Avg Completeness (%)'] = f"{df['data_completeness_pct'].mean():.1f}"

    # Number of distinct filter IDs recorded for this site's code
    site_filters = filter_data[filter_data['Site'] == config['code']]
    summary['Filter Samples'] = len(site_filters['FilterId'].unique())

    summary_data.append(summary)

summary_df = pd.DataFrame(summary_data)
summary_df

## 8. Flow 1 vs Flow 2 Analysis

This section compares the Spot 1 (BC1) and Spot 2 (BC2) black carbon measurements and their ratio over time. The plots built from the Flow 1 and Flow 2 columns are in Section 9.

In [None]:
def plot_bc1_bc2_comparison(ax, site_name, df, config):
    """Plot BC1 (Spot 1) and BC2 (Spot 2) time series for a site.

    Column choice per spot: columns whose name contains the spot tag and
    "smooth" (case-insensitive) are preferred, falling back to any column
    containing the tag. Among those, the first whose name contains 'IR' wins,
    otherwise the first candidate. Nothing is drawn unless a column was found
    for both spots. Spot 1 is a solid line and Spot 2 a dashed line, both in
    the site colour. Axis labels and x tick rotation are always set.
    """

    def _pick_column(spot_tag):
        # Smoothed columns first, then any column carrying the spot tag
        candidates = [c for c in df.columns if spot_tag in c and 'smooth' in c.lower()]
        if not candidates:
            candidates = [c for c in df.columns if spot_tag in c]

        # Use IR wavelength as default; if IR not found, use first available
        chosen = next((c for c in candidates if 'IR' in c), None)
        if not chosen and candidates:
            chosen = candidates[0]
        return chosen

    bc1_col = _pick_column('BC1')
    bc2_col = _pick_column('BC2')

    if bc1_col and bc2_col:
        series_specs = (
            (bc1_col, 'Spot 1', '-'),
            (bc2_col, 'Spot 2', '--'),
        )
        for column, spot_name, dash in series_specs:
            present = df[df[column].notna()]
            if len(present) > 0:
                ax.plot(present['day_9am'], present[column],
                        color=config['color'], label=f'{site_name} - {spot_name}',
                        alpha=0.7, linewidth=2, linestyle=dash)

    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel('BC (ng/m³)', fontsize=12)
    ax.tick_params(axis='x', rotation=45)

# Render BC1 (Spot 1) vs BC2 (Spot 2): all sites together, then one figure per site
plot_combined_and_individual(
    plot_function=plot_bc1_bc2_comparison,
    title_base="Spot 1 vs Spot 2 BC Comparison",
)

In [None]:
# Combined plot - All sites showing only BC1/BC2 ratios
fig, ax = plt.subplots(figsize=(14, 7))

print("="*80)
print("SPOT 1/SPOT 2 BC RATIO ANALYSIS - All Sites")
print("="*80)

for site_name, df in aethalometer_data.items():
    config = SITES[site_name]

    # Candidate columns: smoothed ones first, otherwise any BC1 / BC2 column
    bc1_cols = (
        [c for c in df.columns if 'BC1' in c and 'smooth' in c.lower()]
        or [c for c in df.columns if 'BC1' in c]
    )
    bc2_cols = (
        [c for c in df.columns if 'BC2' in c and 'smooth' in c.lower()]
        or [c for c in df.columns if 'BC2' in c]
    )

    # Prefer the IR wavelength, falling back to the first candidate
    bc1_col = next((c for c in bc1_cols if 'IR' in c), None)
    bc2_col = next((c for c in bc2_cols if 'IR' in c), None)
    if not bc1_col and bc1_cols:
        bc1_col = bc1_cols[0]
    if not bc2_col and bc2_cols:
        bc2_col = bc2_cols[0]

    if not (bc1_col and bc2_col):
        print(f"\n{site_name}: No BC1/BC2 data available")
        print(f"  BC1 columns found: {len(bc1_cols)}")
        print(f"  BC2 columns found: {len(bc2_cols)}")
        continue

    # Calculate ratio
    df_ratio = df[[bc1_col, bc2_col, 'day_9am']].copy()
    df_ratio['ratio'] = df_ratio[bc1_col] / df_ratio[bc2_col]

    # Keep finite ratios only (drops NaN and ±inf from division by zero)
    df_ratio = df_ratio[np.isfinite(df_ratio['ratio'])].copy()

    if len(df_ratio) == 0:
        continue

    ax.plot(df_ratio['day_9am'], df_ratio['ratio'],
            color=config['color'], label=f'{site_name}',
            alpha=0.7, linewidth=2, marker='o', markersize=3)

    # Print statistics
    print(f"\n{site_name}:")
    print(f"  Column used for BC1: {bc1_col}")
    print(f"  Column used for BC2: {bc2_col}")
    print(f"  Number of ratio measurements: {len(df_ratio)}")
    print(f"  Mean ratio: {df_ratio['ratio'].mean():.3f}")
    print(f"  Median ratio: {df_ratio['ratio'].median():.3f}")
    print(f"  Std dev: {df_ratio['ratio'].std():.3f}")
    print(f"  Range: {df_ratio['ratio'].min():.3f} - {df_ratio['ratio'].max():.3f}")

# Add reference line at ratio = 1
ax.axhline(y=1.0, color='black', linestyle='--', alpha=0.5, linewidth=2, label='Ratio = 1')

ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Spot 1 / Spot 2 BC Ratio', fontsize=12)
ax.set_title('Spot 1/Spot 2 BC Ratio Over Time - All Sites', fontsize=14, fontweight='bold')
ax.legend(loc='best')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\n" + "="*80)

In [None]:
# Individual plots for each site with BC1/BC2 ratio
for site_name, df in aethalometer_data.items():
    config = SITES[site_name]

    # Find BC1 and BC2 columns, preferring smoothed ones
    bc1_cols = [col for col in df.columns if 'BC1' in col and 'smooth' in col.lower()]
    bc2_cols = [col for col in df.columns if 'BC2' in col and 'smooth' in col.lower()]

    # Fallback to non-smoothed
    if not bc1_cols:
        bc1_cols = [col for col in df.columns if 'BC1' in col]
    if not bc2_cols:
        bc2_cols = [col for col in df.columns if 'BC2' in col]

    # Use IR wavelength, falling back to the first available column
    bc1_col = next((col for col in bc1_cols if 'IR' in col), None)
    bc2_col = next((col for col in bc2_cols if 'IR' in col), None)
    if not bc1_col and bc1_cols:
        bc1_col = bc1_cols[0]
    if not bc2_col and bc2_cols:
        bc2_col = bc2_cols[0]

    # Report sites without usable columns instead of skipping them silently
    if not (bc1_col and bc2_col):
        print(f"{site_name}: No BC1/BC2 data available")
        continue

    # Calculate ratio
    df_ratio = df[[bc1_col, bc2_col, 'day_9am']].copy()
    df_ratio['ratio'] = df_ratio[bc1_col] / df_ratio[bc2_col]

    # Remove infinite and NaN values
    df_ratio = df_ratio[np.isfinite(df_ratio['ratio'])].copy()

    # Check for data BEFORE creating the figure, so a site without finite
    # ratios does not leave an empty figure behind.
    if len(df_ratio) == 0:
        print(f"{site_name}: No BC1/BC2 ratio data available")
        continue

    fig, ax = plt.subplots(figsize=(12, 6))

    ax.plot(df_ratio['day_9am'], df_ratio['ratio'],
            color=config['color'], label=f'{site_name}',
            alpha=0.7, linewidth=2, marker='o', markersize=3)

    # Add reference line at ratio = 1
    ax.axhline(y=1.0, color='black', linestyle='--', alpha=0.5, linewidth=1.5, label='Ratio = 1')

    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel('Spot 1 / Spot 2 Ratio', fontsize=12)
    ax.set_title(f"Spot 1/Spot 2 BC Ratio Over Time - {site_name}", fontsize=14, fontweight='bold')
    ax.legend(loc='best')
    ax.grid(True, alpha=0.3)
    ax.tick_params(axis='x', rotation=45)
    plt.tight_layout()
    plt.show()

## 9. Custom Visualization Area

Use this section to create your own custom plots!

In [None]:
def plot_flow_ratio_timeseries(ax, site_name, df, config):
    """Plot the Flow 1 / Flow 2 column ratio over time for one site.

    Candidate columns are those whose name contains both the flow tag
    ('Flow 1' or 'Flow 2') and 'BCc'; the first one containing 'IR' is used,
    otherwise the first candidate. When both columns are found, the ratio is
    computed row by row, non-finite values are dropped, and the remaining
    points are drawn together with a dashed reference line at 1. Axis labels
    and x tick rotation are always set.
    """

    def _pick_flow_column(flow_tag):
        candidates = [c for c in df.columns if flow_tag in c and 'BCc' in c]
        # Use IR wavelength; fallback to first available
        chosen = next((c for c in candidates if 'IR' in c), None)
        if not chosen and candidates:
            chosen = candidates[0]
        return chosen

    flow1_col = _pick_flow_column('Flow 1')
    flow2_col = _pick_flow_column('Flow 2')

    if flow1_col and flow2_col:
        # Calculate ratio
        df_ratio = df[[flow1_col, flow2_col, 'day_9am']].copy()
        df_ratio['ratio'] = df_ratio[flow1_col] / df_ratio[flow2_col]

        # Keep finite ratios only (drops NaN and ±inf)
        df_ratio = df_ratio[np.isfinite(df_ratio['ratio'])].copy()

        if len(df_ratio) > 0:
            ax.plot(df_ratio['day_9am'], df_ratio['ratio'],
                    color=config['color'], label=f'{site_name}',
                    alpha=0.7, linewidth=2, marker='o', markersize=4)

            # Reference line at ratio = 1
            ax.axhline(y=1.0, color='black', linestyle='--',
                       alpha=0.5, linewidth=1.5, label='Ratio = 1')

    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel('Flow 1 / Flow 2 Ratio', fontsize=12)
    ax.tick_params(axis='x', rotation=45)

# One Flow 1 / Flow 2 ratio figure per loaded site
for site_name, df in aethalometer_data.items():
    config = SITES[site_name]
    fig, ax = plt.subplots(figsize=(12, 6))

    plot_flow_ratio_timeseries(ax, site_name, df, config)
    ratio_title = f"Flow 1/Flow 2 Ratio Over Time - {site_name}"

    ax.set_title(ratio_title, fontsize=14, fontweight='bold')
    ax.legend(loc='best')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

In [None]:
def plot_flow_comparison(ax, site_name, df, config):
    """Plot the Flow 1 and Flow 2 time series for a site.

    Candidate columns are those whose name contains both the flow tag
    ('Flow 1' or 'Flow 2') and 'BCc'; the first one containing 'IR' is used,
    otherwise the first candidate. Nothing is drawn unless a column was found
    for both flows. Flow 1 is a solid line and Flow 2 a dashed line, both in
    the site colour, each plotted over the rows where it is not missing.
    Axis labels and x tick rotation are always set.
    """

    def _pick_flow_column(flow_tag):
        candidates = [c for c in df.columns if flow_tag in c and 'BCc' in c]
        # Use IR wavelength as default; if IR not found, use first available
        chosen = next((c for c in candidates if 'IR' in c), None)
        if not chosen and candidates:
            chosen = candidates[0]
        return chosen

    flow1_col = _pick_flow_column('Flow 1')
    flow2_col = _pick_flow_column('Flow 2')

    if flow1_col and flow2_col:
        series_specs = (
            (flow1_col, 'Flow 1', '-'),
            (flow2_col, 'Flow 2', '--'),
        )
        for column, flow_name, dash in series_specs:
            present = df[df[column].notna()]
            if len(present) > 0:
                ax.plot(present['day_9am'], present[column],
                        color=config['color'], label=f'{site_name} - {flow_name}',
                        alpha=0.7, linewidth=2, linestyle=dash)

    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel('BC (ng/m³)', fontsize=12)
    ax.tick_params(axis='x', rotation=45)

# Render Flow 1 vs Flow 2: all sites together, then one figure per site
plot_combined_and_individual(
    plot_function=plot_flow_comparison,
    title_base="Flow 1 vs Flow 2 Comparison",
)

In [None]:
# Example: Your custom analysis here
# Access data via:
# - aethalometer_data['Beijing'], aethalometer_data['Delhi'], etc.
# - filter_data

# Example: Plot temperature vs BC for one site
site_name = 'JPL'  # Change this to any site

# .get() instead of [...]: the loading cell skips sites whose file is missing,
# so the chosen site may not be in the dict.
df = aethalometer_data.get(site_name)

if df is None:
    print(f"{site_name}: no aethalometer data loaded (available: {list(aethalometer_data)})")
elif 'Sample temp (C)' in df.columns and 'IR BCc' in df.columns:
    valid = df[(df['Sample temp (C)'].notna()) & (df['IR BCc'].notna())]

    if len(valid) == 0:
        print(f"{site_name}: no rows with both temperature and IR BC")
    else:
        # Colour points by date, expressed as day ordinals. This avoids the
        # platform-dependent datetime -> int cast and lets the colourbar
        # ticks be turned back into readable dates below.
        day_ordinal = valid['day_9am'].map(pd.Timestamp.toordinal)

        fig, ax = plt.subplots(figsize=(10, 6))
        scatter = ax.scatter(valid['Sample temp (C)'], valid['IR BCc'],
                             c=day_ordinal, cmap='viridis',
                             alpha=0.6, s=50)
        ax.set_xlabel('Temperature (°C)', fontsize=12)
        ax.set_ylabel('IR BC (ng/m³)', fontsize=12)
        ax.set_title(f'Temperature vs BC - {site_name}', fontsize=14, fontweight='bold')

        # Label the colourbar ticks with dates rather than raw integers
        cbar = plt.colorbar(scatter, ax=ax, label='Date')
        tick_values = cbar.get_ticks()
        cbar.set_ticks(tick_values)
        cbar.set_ticklabels([
            pd.Timestamp.fromordinal(int(round(value))).strftime('%Y-%m-%d')
            for value in tick_values
        ])

        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

## Data Export

Export processed data or summary statistics if needed.

In [None]:
# Example: Export summary table to CSV
# summary_df.to_csv('site_summary.csv', index=False)
# print("Summary exported to site_summary.csv")