# AQI Validation & Source Reliability Analysis

**Purpose**: Compare our calculated US AQI (from OpenWeather PM2.5/PM10) with IQAir sensor-based AQI measurements to assess data source reliability and calculation accuracy.

**Analysis Structure**:
- **Section 1**: Data Loading & Preparation
- **Section 2**: Data Source Comparison Analysis  
- **Section 3**: AQI Category Analysis
- **Section 4**: Temporal Pattern Analysis
- **Section 5**: Statistical Reliability Assessment
- **Section 6**: Source Reliability Scoring
- **Section 7**: Recommendations & Summary

**Data Sources**:
- **US AQI**: Calculated using EPA formula from OpenWeather PM2.5 & PM10 data
- **IQAir AQI**: Direct sensor-based measurements from IQAir API
- **OpenWeather AQI**: Categorical scale (1-5) from OpenWeather API

**Note**: IQAir data is for validation/comparison only, NOT for model training.


In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
from datetime import datetime, timedelta
import warnings
from scipy import stats
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import ttest_rel, wilcoxon, shapiro, pearsonr, spearmanr
from sklearn.metrics import confusion_matrix, classification_report
import json

warnings.filterwarnings('ignore')

# Set up plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 11

print("📊 Validation Analysis Environment Setup Complete")
print(f"Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


📊 Validation Analysis Environment Setup Complete
Analysis Date: 2025-07-17 22:39:56


## Section 1: Data Loading & Preparation

Load all validation CSV files and create a consolidated dataset for analysis.


In [2]:
# Load all validation CSV files
data_dir = 'data/'
validation_files = glob.glob(os.path.join(data_dir, 'aqi_validation_current_*.csv'))

print(f"Found {len(validation_files)} validation files")
if validation_files:
    print(f"Date range: {min(validation_files).split('_')[-2]} to {max(validation_files).split('_')[-2]}")

# Load and concatenate all validation data
validation_data = []
failed_files = []

for file in validation_files:
    try:
        df = pd.read_csv(file)
        if not df.empty and len(df.columns) == 5:  # Ensure proper structure
            validation_data.append(df)
        else:
            failed_files.append(file)
    except Exception as e:
        failed_files.append(file)
        print(f"Failed to load {file}: {e}")

# Combine all data
if validation_data:
    df_validation = pd.concat(validation_data, ignore_index=True)
    
    # Convert time column to datetime
    df_validation['time'] = pd.to_datetime(df_validation['time'])
    
    # Sort by time
    df_validation = df_validation.sort_values('time').reset_index(drop=True)
    
    print(f"\n✅ Successfully loaded {len(df_validation)} validation records")
    print(f"📅 Time range: {df_validation['time'].min()} to {df_validation['time'].max()}")
    print(f"⏱️  Duration: {(df_validation['time'].max() - df_validation['time'].min()).days} days")
    
    if failed_files:
        print(f"⚠️  Failed to load {len(failed_files)} files")
else:
    print("❌ No validation data loaded successfully")
    raise ValueError("Unable to load validation data")

# Display basic info
print("\n📋 Dataset Info:")
print(df_validation.info())
print("\n📊 First few records:")
df_validation.head()


Found 343 validation files
Date range: 20250701 to 20250716

✅ Successfully loaded 343 validation records
📅 Time range: 2025-07-01 23:33:58+00:00 to 2025-07-16 22:10:59+00:00
⏱️  Duration: 14 days

📋 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343 entries, 0 to 342
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   time             343 non-null    datetime64[ns, UTC]
 1   openweather_aqi  343 non-null    int64              
 2   us_aqi           337 non-null    float64            
 3   iqair_aqi        343 non-null    int64              
 4   abs_deviation    337 non-null    float64            
dtypes: datetime64[ns, UTC](1), float64(2), int64(2)
memory usage: 13.5 KB
None

📊 First few records:


Unnamed: 0,time,openweather_aqi,us_aqi,iqair_aqi,abs_deviation
0,2025-07-01 23:33:58+00:00,4,97.0,64,33.0
1,2025-07-02 00:52:34+00:00,4,96.0,59,37.0
2,2025-07-02 02:42:45+00:00,4,93.0,85,8.0
3,2025-07-02 03:44:55+00:00,4,92.0,92,0.0
4,2025-07-02 04:21:12+00:00,4,92.0,89,3.0


In [3]:
# Data quality checks
print("🔍 Data Quality Assessment:")
print(f"Total records: {len(df_validation)}")
print(f"Missing values:")
print(df_validation.isnull().sum())
print(f"\nDuplicates: {df_validation.duplicated().sum()}")
print(f"\nData types:")
print(df_validation.dtypes)

# Basic statistics
print("\n📈 Descriptive Statistics:")
df_validation.describe()


🔍 Data Quality Assessment:
Total records: 343
Missing values:
time               0
openweather_aqi    0
us_aqi             6
iqair_aqi          0
abs_deviation      6
dtype: int64

Duplicates: 0

Data types:
time               datetime64[ns, UTC]
openweather_aqi                  int64
us_aqi                         float64
iqair_aqi                        int64
abs_deviation                  float64
dtype: object

📈 Descriptive Statistics:


Unnamed: 0,openweather_aqi,us_aqi,iqair_aqi,abs_deviation
count,343.0,337.0,343.0,337.0
mean,3.577259,107.311573,76.069971,36.011869
std,0.528992,28.090007,21.94706,27.903637
min,3.0,58.0,28.0,0.0
25%,3.0,83.0,62.0,13.0
50%,4.0,106.0,70.0,29.0
75%,4.0,129.0,87.0,54.0
max,5.0,166.0,160.0,131.0


In [None]:
# Data cleaning for analysis - remove rows with missing values
print("🧹 Cleaning data for analysis...")
print(f"Before cleaning: {len(df_validation)} records")
print("Missing values by column:")
print(df_validation.isnull().sum())

# Create clean dataset without missing values
df_clean = df_validation.dropna(subset=['us_aqi', 'iqair_aqi', 'openweather_aqi', 'abs_deviation']).copy()

print(f"\\nAfter cleaning: {len(df_clean)} records")
print(f"Removed: {len(df_validation) - len(df_clean)} records ({((len(df_validation) - len(df_clean))/len(df_validation)*100):.1f}%)")

if len(df_clean) == 0:
    raise ValueError("❌ No complete records available for analysis!")
else:
    print("✅ Clean dataset ready for analysis")


## Section 2: Data Source Comparison Analysis

Compare US AQI calculations with IQAir sensor measurements to understand differences and patterns.


In [None]:
# Calculate correlation metrics
correlation_us_iqair = df_validation['us_aqi'].corr(df_validation['iqair_aqi'])
#correlation_ow_iqair = df_validation['openweather_aqi'].corr(df_validation['iqair_aqi'])
#correlation_us_ow = df_validation['us_aqi'].corr(df_validation['openweather_aqi'])

print("🔗 Correlation Analysis:")
print(f"US AQI vs IQAir AQI: {correlation_us_iqair:.4f}")
#print(f"OpenWeather AQI vs IQAir AQI: {correlation_ow_iqair:.4f}")
#print(f"US AQI vs OpenWeather AQI: {correlation_us_ow:.4f}")

# Calculate performance metrics for US AQI vs IQAir
mae_us_iqair = mean_absolute_error(df_validation['iqair_aqi'], df_validation['us_aqi'])
rmse_us_iqair = np.sqrt(mean_squared_error(df_validation['iqair_aqi'], df_validation['us_aqi']))
r2_us_iqair = r2_score(df_validation['iqair_aqi'], df_validation['us_aqi'])

print(f"\n📊 US AQI vs IQAir Performance:")
print(f"Mean Absolute Error: {mae_us_iqair:.2f}")
print(f"Root Mean Square Error: {rmse_us_iqair:.2f}")
print(f"R² Score: {r2_us_iqair:.4f}")
print(f"Mean Absolute Deviation: {df_validation['abs_deviation'].mean():.2f}")
print(f"Median Absolute Deviation: {df_validation['abs_deviation'].median():.2f}")


🔗 Correlation Analysis:
US AQI vs IQAir AQI: 0.1489
OpenWeather AQI vs IQAir AQI: 0.2514
US AQI vs OpenWeather AQI: 0.5634


ValueError: Input contains NaN.

In [None]:
# Create comprehensive comparison visualization
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('AQI Data Source Comparison Analysis', fontsize=16, fontweight='bold')

# 1. Scatter plot: US AQI vs IQAir AQI
axes[0,0].scatter(df_validation['iqair_aqi'], df_validation['us_aqi'], alpha=0.6, color='blue')
max_val = max(df_validation['iqair_aqi'].max(), df_validation['us_aqi'].max())
axes[0,0].plot([0, max_val], [0, max_val], 'r--', label='Perfect Agreement')
axes[0,0].set_xlabel('IQAir AQI (Sensor)')
axes[0,0].set_ylabel('US AQI (Calculated)')
axes[0,0].set_title(f'US AQI vs IQAir AQI\\n(r = {correlation_us_iqair:.3f})')
axes[0,0].legend()
axes[0,0].grid(True, alpha=0.3)

# 2. Time series comparison
axes[0,1].plot(df_validation['time'], df_validation['us_aqi'], label='US AQI', alpha=0.8)
axes[0,1].plot(df_validation['time'], df_validation['iqair_aqi'], label='IQAir AQI', alpha=0.8)
axes[0,1].set_xlabel('Time')
axes[0,1].set_ylabel('AQI Value')
axes[0,1].set_title('AQI Time Series Comparison')
axes[0,1].legend()
axes[0,1].grid(True, alpha=0.3)
axes[0,1].tick_params(axis='x', rotation=45)

# 3. Deviation distribution
axes[0,2].hist(df_validation['abs_deviation'], bins=30, alpha=0.7, color='green', edgecolor='black')
axes[0,2].axvline(df_validation['abs_deviation'].mean(), color='red', linestyle='--', 
                  label=f'Mean: {df_validation["abs_deviation"].mean():.1f}')
axes[0,2].axvline(df_validation['abs_deviation'].median(), color='orange', linestyle='--', 
                  label=f'Median: {df_validation["abs_deviation"].median():.1f}')
axes[0,2].set_xlabel('Absolute Deviation')
axes[0,2].set_ylabel('Frequency')
axes[0,2].set_title('Deviation Distribution')
axes[0,2].legend()
axes[0,2].grid(True, alpha=0.3)

# 4. Box plot comparison
data_for_box = [df_validation['us_aqi'], df_validation['iqair_aqi']]
axes[1,0].boxplot(data_for_box, labels=['US AQI', 'IQAir AQI'])
axes[1,0].set_ylabel('AQI Value')
axes[1,0].set_title('AQI Distribution Comparison')
axes[1,0].grid(True, alpha=0.3)

# 5. Deviation over time
axes[1,1].plot(df_validation['time'], df_validation['abs_deviation'], color='purple', alpha=0.7)
axes[1,1].axhline(df_validation['abs_deviation'].mean(), color='red', linestyle='--', alpha=0.8)
axes[1,1].set_xlabel('Time')
axes[1,1].set_ylabel('Absolute Deviation')
axes[1,1].set_title('Deviation Over Time')
axes[1,1].grid(True, alpha=0.3)
axes[1,1].tick_params(axis='x', rotation=45)

# 6. OpenWeather AQI vs others
axes[1,2].scatter(df_validation['openweather_aqi'], df_validation['us_aqi'], 
                  alpha=0.6, color='orange', label='OW vs US AQI')
axes[1,2].scatter(df_validation['openweather_aqi'], df_validation['iqair_aqi'], 
                  alpha=0.6, color='green', label='OW vs IQAir AQI')
axes[1,2].set_xlabel('OpenWeather AQI (1-5 scale)')
axes[1,2].set_ylabel('AQI Value')
axes[1,2].set_title('OpenWeather AQI Comparison')
axes[1,2].legend()
axes[1,2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


## Section 3: AQI Category Analysis

Analyze how well the different AQI measurements agree on air quality categories.


In [None]:
# Define AQI categories
def categorize_aqi(aqi_value):
    """Convert AQI value to category"""
    if aqi_value <= 50:
        return 'Good'
    elif aqi_value <= 100:
        return 'Moderate'
    elif aqi_value <= 150:
        return 'Unhealthy for Sensitive'
    elif aqi_value <= 200:
        return 'Unhealthy'
    elif aqi_value <= 300:
        return 'Very Unhealthy'
    else:
        return 'Hazardous'

# Apply categorization
df_validation['us_aqi_category'] = df_validation['us_aqi'].apply(categorize_aqi)
df_validation['iqair_aqi_category'] = df_validation['iqair_aqi'].apply(categorize_aqi)

print("📊 AQI Category Distribution:")
print("\\nUS AQI Categories:")
print(df_validation['us_aqi_category'].value_counts())
print("\\nIQAir AQI Categories:")
print(df_validation['iqair_aqi_category'].value_counts())

In [None]:
# Category agreement analysis
category_agreement = (df_validation['us_aqi_category'] == df_validation['iqair_aqi_category']).mean()
print(f"🎯 Category Agreement (US vs IQAir): {category_agreement:.1%}")

# Get all unique categories
all_categories = sorted(set(df_validation['us_aqi_category'].unique()) | 
                       set(df_validation['iqair_aqi_category'].unique()))

cm = confusion_matrix(df_validation['iqair_aqi_category'], df_validation['us_aqi_category'], 
                      labels=all_categories)

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=all_categories, yticklabels=all_categories)
plt.title('AQI Category Confusion Matrix\\n(IQAir vs US AQI)')
plt.xlabel('US AQI Category (Predicted)')
plt.ylabel('IQAir AQI Category (Actual)')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Classification report
print("\\n📋 Classification Report (US AQI vs IQAir):")
print(classification_report(df_validation['iqair_aqi_category'], df_validation['us_aqi_category']))


In [None]:
# Category distribution visualization
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
fig.suptitle('AQI Category Distributions', fontsize=16, fontweight='bold')

# US AQI categories
us_counts = df_validation['us_aqi_category'].value_counts()
axes[0].bar(us_counts.index, us_counts.values, color='skyblue', alpha=0.8)
axes[0].set_title('US AQI Categories')
axes[0].set_ylabel('Count')
axes[0].tick_params(axis='x', rotation=45)

# IQAir AQI categories
iqair_counts = df_validation['iqair_aqi_category'].value_counts()
axes[1].bar(iqair_counts.index, iqair_counts.values, color='lightcoral', alpha=0.8)
axes[1].set_title('IQAir AQI Categories')
axes[1].set_ylabel('Count')
axes[1].tick_params(axis='x', rotation=45)


plt.tight_layout()
plt.show()


## Section 4: Temporal Pattern Analysis

Analyze how the agreement between data sources varies over time and identify patterns.


In [None]:
# Add temporal features
df_validation['hour'] = df_validation['time'].dt.hour
df_validation['day_of_week'] = df_validation['time'].dt.dayofweek
df_validation['date'] = df_validation['time'].dt.date

# Calculate daily statistics
daily_stats = df_validation.groupby('date').agg({
    'us_aqi': ['mean', 'std', 'min', 'max'],
    'iqair_aqi': ['mean', 'std', 'min', 'max'],
    'abs_deviation': ['mean', 'std', 'min', 'max'],
    'time': 'count'
}).round(2)

daily_stats.columns = ['_'.join(col).strip() for col in daily_stats.columns]
daily_stats = daily_stats.rename(columns={'time_count': 'records_per_day'})

print("📅 Daily Statistics Summary:")
print(f"Average records per day: {daily_stats['records_per_day'].mean():.1f}")
print(f"Days with data: {len(daily_stats)}")
print("\\nDaily deviation statistics:")
print(daily_stats[['abs_deviation_mean', 'abs_deviation_std', 'abs_deviation_min', 'abs_deviation_max']].describe())


In [None]:
# Hourly pattern analysis
hourly_stats = df_validation.groupby('hour').agg({
    'us_aqi': 'mean',
    'iqair_aqi': 'mean',
    'abs_deviation': 'mean',
    'time': 'count'
}).round(2)

# Visualization of temporal patterns
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Temporal Pattern Analysis', fontsize=16, fontweight='bold')

# 1. Hourly AQI patterns
axes[0,0].plot(hourly_stats.index, hourly_stats['us_aqi'], marker='o', label='US AQI', linewidth=2)
axes[0,0].plot(hourly_stats.index, hourly_stats['iqair_aqi'], marker='s', label='IQAir AQI', linewidth=2)
axes[0,0].set_xlabel('Hour of Day')
axes[0,0].set_ylabel('Mean AQI')
axes[0,0].set_title('Hourly AQI Patterns')
axes[0,0].legend()
axes[0,0].grid(True, alpha=0.3)
axes[0,0].set_xticks(range(0, 24, 2))

# 2. Hourly deviation pattern
axes[0,1].plot(hourly_stats.index, hourly_stats['abs_deviation'], 
               marker='o', color='red', linewidth=2)
axes[0,1].axhline(hourly_stats['abs_deviation'].mean(), color='black', 
                  linestyle='--', alpha=0.8, label='Overall Mean')
axes[0,1].set_xlabel('Hour of Day')
axes[0,1].set_ylabel('Mean Absolute Deviation')
axes[0,1].set_title('Hourly Deviation Pattern')
axes[0,1].legend()
axes[0,1].grid(True, alpha=0.3)
axes[0,1].set_xticks(range(0, 24, 2))

# 3. Daily deviation trend
daily_stats_reset = daily_stats.reset_index()
axes[1,0].plot(daily_stats_reset['date'], daily_stats_reset['abs_deviation_mean'], 
               marker='o', alpha=0.7, linewidth=2)
axes[1,0].fill_between(daily_stats_reset['date'], 
                       daily_stats_reset['abs_deviation_mean'] - daily_stats_reset['abs_deviation_std'],
                       daily_stats_reset['abs_deviation_mean'] + daily_stats_reset['abs_deviation_std'],
                       alpha=0.3)
axes[1,0].set_xlabel('Date')
axes[1,0].set_ylabel('Daily Mean Deviation ± STD')
axes[1,0].set_title('Daily Deviation Trend')
axes[1,0].grid(True, alpha=0.3)
axes[1,0].tick_params(axis='x', rotation=45)

# 4. Data availability by hour
axes[1,1].bar(hourly_stats.index, hourly_stats['time'], color='green', alpha=0.7)
axes[1,1].set_xlabel('Hour of Day')
axes[1,1].set_ylabel('Number of Records')
axes[1,1].set_title('Data Availability by Hour')
axes[1,1].grid(True, alpha=0.3)
axes[1,1].set_xticks(range(0, 24, 2))

plt.tight_layout()
plt.show()

print(f"\\n⏰ Best Agreement Hours: {hourly_stats['abs_deviation'].nsmallest(3).index.tolist()}")
print(f"🚨 Worst Agreement Hours: {hourly_stats['abs_deviation'].nlargest(3).index.tolist()}")


## Section 5: Statistical Reliability Assessment

Perform statistical tests to assess the reliability and bias of different data sources.


In [None]:
# Statistical tests
print("🧪 Statistical Reliability Tests:")
print("=" * 50)

# 1. Paired t-test for systematic bias
tstat, ttest_p = ttest_rel(df_validation['us_aqi'], df_validation['iqair_aqi'])
print(f"\\n1. Paired t-test (Systematic Bias):")
print(f"   t-statistic: {tstat:.4f}")
print(f"   p-value: {ttest_p:.6f}")
if ttest_p < 0.05:
    print(f"   ❌ Significant systematic bias detected (p < 0.05)")
else:
    print(f"   ✅ No significant systematic bias (p ≥ 0.05)")

# 2. Wilcoxon signed-rank test (non-parametric)
wstat, wilcox_p = wilcoxon(df_validation['us_aqi'], df_validation['iqair_aqi'])
print(f"\\n2. Wilcoxon Signed-Rank Test:")
print(f"   W-statistic: {wstat:.0f}")
print(f"   p-value: {wilcox_p:.6f}")
if wilcox_p < 0.05:
    print(f"   ❌ Significant difference in distributions (p < 0.05)")
else:
    print(f"   ✅ No significant difference in distributions (p ≥ 0.05)")

# 3. Normality tests for residuals
residuals = df_validation['us_aqi'] - df_validation['iqair_aqi']
shapiro_stat, shapiro_p = shapiro(residuals)
print(f"\\n3. Shapiro-Wilk Test (Residuals Normality):")
print(f"   W-statistic: {shapiro_stat:.4f}")
print(f"   p-value: {shapiro_p:.6f}")
if shapiro_p < 0.05:
    print(f"   ❌ Residuals not normally distributed (p < 0.05)")
else:
    print(f"   ✅ Residuals normally distributed (p ≥ 0.05)")

# 4. Correlation significance tests
pearson_r, pearson_p = pearsonr(df_validation['us_aqi'], df_validation['iqair_aqi'])
spearman_r, spearman_p = spearmanr(df_validation['us_aqi'], df_validation['iqair_aqi'])

print(f"\\n4. Correlation Tests:")
print(f"   Pearson r: {pearson_r:.4f} (p = {pearson_p:.6f})")
print(f"   Spearman ρ: {spearman_r:.4f} (p = {spearman_p:.6f})")

# 5. Bias and precision metrics
bias = (df_validation['us_aqi'] - df_validation['iqair_aqi']).mean()
precision = (df_validation['us_aqi'] - df_validation['iqair_aqi']).std()
mape = np.mean(np.abs((df_validation['iqair_aqi'] - df_validation['us_aqi']) / df_validation['iqair_aqi'])) * 100

print(f"\\n5. Bias & Precision Analysis:")
print(f"   Mean Bias: {bias:.2f} AQI units")
print(f"   Precision (SD): {precision:.2f} AQI units")
print(f"   MAPE: {mape:.1f}%")

if abs(bias) < 5:
    print(f"   ✅ Low bias (|bias| < 5 AQI units)")
else:
    print(f"   ⚠️  Moderate bias (|bias| ≥ 5 AQI units)")

if precision < 15:
    print(f"   ✅ Good precision (SD < 15 AQI units)")
else:
    print(f"   ⚠️  Moderate precision (SD ≥ 15 AQI units)")


In [None]:
# Bias analysis visualization
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Statistical Reliability Assessment', fontsize=16, fontweight='bold')

# 1. Residuals distribution
axes[0,0].hist(residuals, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
axes[0,0].axvline(residuals.mean(), color='red', linestyle='--', 
                  label=f'Mean: {residuals.mean():.2f}')
axes[0,0].axvline(0, color='green', linestyle='-', label='Perfect Agreement')
axes[0,0].set_xlabel('Residuals (US AQI - IQAir AQI)')
axes[0,0].set_ylabel('Frequency')
axes[0,0].set_title('Residuals Distribution')
axes[0,0].legend()
axes[0,0].grid(True, alpha=0.3)

# 2. Q-Q plot for residuals
stats.probplot(residuals, dist="norm", plot=axes[0,1])
axes[0,1].set_title('Q-Q Plot: Residuals vs Normal Distribution')
axes[0,1].grid(True, alpha=0.3)

# 3. Bland-Altman plot
mean_values = (df_validation['us_aqi'] + df_validation['iqair_aqi']) / 2
diff_values = df_validation['us_aqi'] - df_validation['iqair_aqi']

axes[1,0].scatter(mean_values, diff_values, alpha=0.6)
axes[1,0].axhline(diff_values.mean(), color='red', linestyle='--', 
                  label=f'Mean Diff: {diff_values.mean():.2f}')
axes[1,0].axhline(diff_values.mean() + 1.96*diff_values.std(), color='orange', 
                  linestyle='--', label=f'+1.96 SD: {diff_values.mean() + 1.96*diff_values.std():.2f}')
axes[1,0].axhline(diff_values.mean() - 1.96*diff_values.std(), color='orange', 
                  linestyle='--', label=f'-1.96 SD: {diff_values.mean() - 1.96*diff_values.std():.2f}')
axes[1,0].axhline(0, color='green', linestyle='-', alpha=0.7)
axes[1,0].set_xlabel('Mean AQI ((US + IQAir) / 2)')
axes[1,0].set_ylabel('Difference (US - IQAir)')
axes[1,0].set_title('Bland-Altman Plot')
axes[1,0].legend()
axes[1,0].grid(True, alpha=0.3)

# 4. Rolling bias over time
df_validation_sorted = df_validation.sort_values('time').reset_index(drop=True)
window_size = min(24, len(df_validation_sorted) // 10)  # Adaptive window size
rolling_bias = (df_validation_sorted['us_aqi'] - df_validation_sorted['iqair_aqi']).rolling(window=window_size, center=True).mean()

axes[1,1].plot(df_validation_sorted['time'], rolling_bias, color='purple', linewidth=2)
axes[1,1].axhline(0, color='green', linestyle='-', alpha=0.7, label='No Bias')
axes[1,1].axhline(bias, color='red', linestyle='--', alpha=0.8, label=f'Overall Bias: {bias:.2f}')
axes[1,1].set_xlabel('Time')
axes[1,1].set_ylabel(f'Rolling Bias (Window={window_size})')
axes[1,1].set_title('Temporal Bias Trend')
axes[1,1].legend()
axes[1,1].grid(True, alpha=0.3)
axes[1,1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


## Section 6: Source Reliability Scoring

Create a comprehensive scoring system to evaluate the reliability of different data sources.


In [None]:
# Calculate reliability metrics
def calculate_reliability_score(mae, rmse, correlation, bias, precision, agreement_rate):
    """
    Calculate reliability score based on multiple metrics
    Score ranges from 0-100 (higher is better)
    """
    # Normalize metrics (0-1 scale, higher is better)
    
    # MAE score (lower is better) - assuming max reasonable MAE is 100
    mae_score = max(0, 1 - mae / 100)
    
    # RMSE score (lower is better) - assuming max reasonable RMSE is 150
    rmse_score = max(0, 1 - rmse / 150)
    
    # Correlation score (higher is better)
    corr_score = max(0, correlation)
    
    # Bias score (lower absolute bias is better) - assuming max reasonable bias is 50
    bias_score = max(0, 1 - abs(bias) / 50)
    
    # Precision score (lower is better) - assuming max reasonable precision is 50
    precision_score = max(0, 1 - precision / 50)
    
    # Agreement rate score
    agreement_score = agreement_rate
    
    # Weighted average (adjust weights based on importance)
    weights = {
        'mae': 0.2,
        'rmse': 0.2,
        'correlation': 0.25,
        'bias': 0.15,
        'precision': 0.1,
        'agreement': 0.1
    }
    
    total_score = (
        weights['mae'] * mae_score +
        weights['rmse'] * rmse_score +
        weights['correlation'] * corr_score +
        weights['bias'] * bias_score +
        weights['precision'] * precision_score +
        weights['agreement'] * agreement_score
    ) * 100
    
    return total_score, {
        'mae_score': mae_score * 100,
        'rmse_score': rmse_score * 100,
        'correlation_score': corr_score * 100,
        'bias_score': bias_score * 100,
        'precision_score': precision_score * 100,
        'agreement_score': agreement_score * 100
    }

# Calculate reliability score for US AQI vs IQAir
us_reliability_score, us_components = calculate_reliability_score(
    mae=mae_us_iqair,
    rmse=rmse_us_iqair,
    correlation=correlation_us_iqair,
    bias=bias,
    precision=precision,
    agreement_rate=category_agreement
)

print("🏆 DATA SOURCE RELIABILITY ASSESSMENT")
print("=" * 50)
print(f"\\n📊 US AQI (OpenWeather PM-based) vs IQAir Sensor:")
print(f"   Overall Reliability Score: {us_reliability_score:.1f}/100")
print(f"\\n   Component Scores:")
for component, score in us_components.items():
    print(f"   • {component.replace('_', ' ').title()}: {score:.1f}/100")

# Interpretation
if us_reliability_score >= 80:
    reliability_grade = "Excellent (A)"
    reliability_emoji = "🟢"
elif us_reliability_score >= 70:
    reliability_grade = "Good (B)"
    reliability_emoji = "🟡"
elif us_reliability_score >= 60:
    reliability_grade = "Fair (C)"
    reliability_emoji = "🟠"
else:
    reliability_grade = "Poor (D/F)"
    reliability_emoji = "🔴"

print(f"\\n{reliability_emoji} Reliability Grade: {reliability_grade}")


In [None]:
# Reliability score visualization
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle('Data Source Reliability Assessment', fontsize=16, fontweight='bold')

# 1. Component scores radar chart
components = list(us_components.keys())
scores = list(us_components.values())

# Create radar chart data
angles = np.linspace(0, 2*np.pi, len(components), endpoint=False).tolist()
scores_plot = scores + [scores[0]]  # Complete the circle
angles_plot = angles + [angles[0]]  # Complete the circle

ax_radar = plt.subplot(1, 2, 1, projection='polar')
ax_radar.plot(angles_plot, scores_plot, 'o-', linewidth=2, color='blue')
ax_radar.fill(angles_plot, scores_plot, alpha=0.25, color='blue')
ax_radar.set_xticks(angles)
ax_radar.set_xticklabels([comp.replace('_', '\\n').title() for comp in components])
ax_radar.set_ylim(0, 100)
ax_radar.set_title('Reliability Component Scores\\n(US AQI vs IQAir)', y=1.08)
ax_radar.grid(True)

# 2. Overall score gauge
ax_gauge = axes[1]
theta = np.linspace(0, np.pi, 100)
r = np.ones_like(theta)

# Create color segments
colors = ['red', 'orange', 'yellow', 'lightgreen', 'green']
bounds = [0, 40, 60, 70, 80, 100]

for i in range(len(colors)):
    start_angle = np.pi * (1 - bounds[i+1]/100)
    end_angle = np.pi * (1 - bounds[i]/100)
    theta_seg = np.linspace(start_angle, end_angle, 20)
    r_seg = np.ones_like(theta_seg)
    ax_gauge.fill_between(theta_seg, 0, r_seg, color=colors[i], alpha=0.7)

# Add score indicator
score_angle = np.pi * (1 - us_reliability_score/100)
ax_gauge.plot([score_angle, score_angle], [0, 1], 'black', linewidth=4)
ax_gauge.plot(score_angle, 1, 'ko', markersize=10)

ax_gauge.set_xlim(0, np.pi)
ax_gauge.set_ylim(0, 1.2)
ax_gauge.set_xticks([0, np.pi/4, np.pi/2, 3*np.pi/4, np.pi])
ax_gauge.set_xticklabels(['100', '75', '50', '25', '0'])
ax_gauge.set_yticks([])
ax_gauge.set_title(f'Overall Reliability Score\\n{us_reliability_score:.1f}/100 ({reliability_grade})')
ax_gauge.text(np.pi/2, 0.5, f'{us_reliability_score:.1f}', 
              ha='center', va='center', fontsize=20, fontweight='bold')

plt.tight_layout()
plt.show()


## Section 7: Recommendations & Summary

Provide actionable recommendations based on the validation analysis.


In [None]:
# Generate comprehensive summary
print("📋 VALIDATION ANALYSIS SUMMARY REPORT")
print("=" * 60)

print(f"\\n📊 Dataset Overview:")
print(f"   • Total validation records: {len(df_validation):,}")
print(f"   • Time period: {df_validation['time'].min().strftime('%Y-%m-%d')} to {df_validation['time'].max().strftime('%Y-%m-%d')}")
print(f"   • Duration: {(df_validation['time'].max() - df_validation['time'].min()).days} days")
print(f"   • Data completeness: {(1 - df_validation.isnull().sum().sum() / (len(df_validation) * len(df_validation.columns))) * 100:.1f}%")

print(f"\\n🎯 Performance Metrics (US AQI vs IQAir):")
print(f"   • Correlation (Pearson): {correlation_us_iqair:.3f}")
print(f"   • Mean Absolute Error: {mae_us_iqair:.1f} AQI units")
print(f"   • Root Mean Square Error: {rmse_us_iqair:.1f} AQI units")
print(f"   • R² Score: {r2_us_iqair:.3f}")
print(f"   • Category Agreement: {category_agreement:.1%}")

print(f"\\n📈 Bias Analysis:")
print(f"   • Mean Bias: {bias:.1f} AQI units")
print(f"   • Precision (SD): {precision:.1f} AQI units")
print(f"   • MAPE: {mape:.1f}%")
print(f"   • Median Absolute Deviation: {df_validation['abs_deviation'].median():.1f}")

print(f"\\n🏆 Overall Assessment:")
print(f"   • Reliability Score: {us_reliability_score:.1f}/100")
print(f"   • Reliability Grade: {reliability_grade}")
print(f"   • Statistical Significance: {'Yes' if ttest_p < 0.05 else 'No'} (p = {ttest_p:.4f})")

# Detailed recommendations
print(f"\\n💡 RECOMMENDATIONS:")
print(f"\\n1. 🎯 Model Training Approach:")
if us_reliability_score >= 70:
    print(f"   ✅ PROCEED with OpenWeather PM-based AQI calculation for training")
    print(f"   • High reliability score ({us_reliability_score:.1f}/100) indicates good agreement")
    print(f"   • US EPA formula application is working well with OpenWeather PM data")
else:
    print(f"   ⚠️  CAUTION with OpenWeather PM-based AQI calculation")
    print(f"   • Moderate reliability score ({us_reliability_score:.1f}/100) suggests potential issues")
    print(f"   • Consider additional validation or data source investigation")

print(f"\\n2. 🕐 Temporal Considerations:")
best_hours = hourly_stats['abs_deviation'].nsmallest(3).index.tolist()
worst_hours = hourly_stats['abs_deviation'].nlargest(3).index.tolist()
print(f"   • Best agreement hours: {best_hours} (consider weighting these more heavily)")
print(f"   • Worst agreement hours: {worst_hours} (consider additional validation)")
print(f"   • Time-based features may improve model performance")

print(f"\\n3. 🎲 Statistical Insights:")
if abs(bias) < 5:
    print(f"   ✅ Low systematic bias ({bias:.1f} units) - minimal correction needed")
else:
    print(f"   ⚠️  Moderate systematic bias ({bias:.1f} units) - consider bias correction")

if precision < 15:
    print(f"   ✅ Good precision ({precision:.1f} units) - consistent measurements")
else:
    print(f"   ⚠️  Moderate precision ({precision:.1f} units) - consider ensemble methods")

print(f"\\n4. 📊 Model Development Strategy:")
print(f"   • Use PM2.5 & PM10 concentrations as primary features (validated approach)")
print(f"   • Include temporal features (hour, day of week) for time-dependent patterns")
print(f"   • Consider bias correction in post-processing if systematic bias detected")
print(f"   • Implement robust validation using time-based splits")
print(f"   • Monitor model performance against IQAir data in production")

print(f"\\n5. 🚨 Quality Assurance:")
print(f"   • Set up automated validation against IQAir data for ongoing monitoring")
print(f"   • Alert if deviation exceeds {df_validation['abs_deviation'].quantile(0.95):.0f} AQI units (95th percentile)")
print(f"   • Regular recalibration if bias drift is detected")
print(f"   • Maintain separate validation dataset for model evaluation")

print(f"\\n✅ CONCLUSION:")
if us_reliability_score >= 70 and category_agreement >= 0.7:
    conclusion = "OpenWeather PM-based AQI calculation is RELIABLE for model training"
    confidence = "HIGH"
elif us_reliability_score >= 60 and category_agreement >= 0.6:
    conclusion = "OpenWeather PM-based AQI calculation is ACCEPTABLE with monitoring"
    confidence = "MODERATE"
else:
    conclusion = "OpenWeather PM-based AQI calculation needs IMPROVEMENT"
    confidence = "LOW"

print(f"   {conclusion}")
print(f"   Confidence Level: {confidence}")
print(f"   Recommended for 3-day AQI forecasting: {'YES' if confidence != 'LOW' else 'WITH CAUTION'}")


In [None]:
# Export summary statistics for reference
summary_stats = {
    'validation_records': len(df_validation),
    'time_range_start': df_validation['time'].min().isoformat(),
    'time_range_end': df_validation['time'].max().isoformat(),
    'correlation_us_iqair': float(correlation_us_iqair),
    'mae_us_iqair': float(mae_us_iqair),
    'rmse_us_iqair': float(rmse_us_iqair),
    'r2_score': float(r2_us_iqair),
    'category_agreement': float(category_agreement),
    'mean_bias': float(bias),
    'precision_sd': float(precision),
    'mape_percent': float(mape),
    'reliability_score': float(us_reliability_score),
    'reliability_grade': reliability_grade,
    'statistical_significance_p': float(ttest_p),
    'best_agreement_hours': best_hours,
    'worst_agreement_hours': worst_hours,
    'recommended_for_training': confidence != 'LOW'
}

# Save to file
with open('validation_summary_statistics.json', 'w') as f:
    json.dump(summary_stats, f, indent=2, default=str)

print(f"\\n📁 Summary statistics exported to: validation_summary_statistics.json")
print(f"\\n🎉 Validation Analysis Complete!")
print(f"   Analysis timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"   Ready for model development phase: {'YES' if confidence != 'LOW' else 'WITH MONITORING'}")

# Display key takeaways
print(f"\\n🔑 KEY TAKEAWAYS:")
print(f"   1. Your US AQI calculation shows {reliability_grade.lower()} agreement with IQAir sensors")
print(f"   2. Mean deviation is {df_validation['abs_deviation'].mean():.1f} AQI units - {'reasonable' if df_validation['abs_deviation'].mean() < 20 else 'moderate'}")
print(f"   3. Category agreement is {category_agreement:.1%} - {'good' if category_agreement > 0.7 else 'moderate' if category_agreement > 0.6 else 'needs improvement'}")
print(f"   4. {'Strong' if correlation_us_iqair > 0.7 else 'Moderate' if correlation_us_iqair > 0.5 else 'Weak'} correlation (r = {correlation_us_iqair:.3f}) indicates {'good' if correlation_us_iqair > 0.7 else 'acceptable' if correlation_us_iqair > 0.5 else 'poor'} linear relationship")
print(f"   5. This validation supports using OpenWeather PM data for {'reliable' if confidence == 'HIGH' else 'monitored' if confidence == 'MODERATE' else 'cautious'} AQI forecasting")
