<h1><center>Urban Digital Twin-based Framework for Air Quality Mapping in Sofia, Bulgaria</center></h1>

This Jupyter Note Book goes through the steps for setting up an urban digital twin for air pollution mapping including the collection and processing of data, calibration of models, setting up of data platform, and evaluation of final model

<center>Step 1: Initialize requirements</center>

In [2]:
# Set file paths
data_folder = r'C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Spatial_Data\VF'
# Independant variable rasters
f_buildheight = data_folder + r'\vf_buildheight.tif'
f_elevation = data_folder + r'\vf_elevation.tif'
f_landuse = data_folder + r'\vf_landuse.tif'
f_ndvi = data_folder + r'\vf_ndvi.tif'
f_pop = data_folder + r'\vf_pop.tif'
f_motor_trunk = data_folder + r'\vf_motorway_trunk.tif'
f_primary = data_folder + r'\vf_primary.tif'
f_residential = data_folder + r'\vf_residential.tif'
f_secondary = data_folder + r'\vf_secondary.tif'
f_tertiary = data_folder + r'\vf_tertiary.tif'
# Independent variable vectors
f_busstop = data_folder + r'\vf_busstop.gpkg'
f_mtp = data_folder + r'\vf_mtp.gpkg'
# Precalculated Rasters
f_roaddis_precalc = data_folder + r'\precalc\mtp50.tif'
f_busstop_precalc = data_folder + r'\precalc\busstop50.tif'
f_elevation_precalc = data_folder + r'\precalc\elevation50.tif'
f_elevation_precalc_100 = data_folder + r'\precalc\elevation.tif'
# Boundary file
f_boundary = data_folder + r'\boundary.gpkg'

In [1]:
# Install dependencies

# API Collection
import requests
from requests.auth import HTTPBasicAuth
# Data manipulation
import pandas as pd
import numpy as np
# Spatial data processing
import pyproj
from pyproj import Transformer
import geopandas as gpd
from shapely.geometry import Point
from shapely.strtree import STRtree
import rasterio
from rasterio.crs import CRS
from rasterio.transform import rowcol
from rasterio.io import MemoryFile
from pykrige.ok import OrdinaryKriging
# Statistics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.linear_model import ElasticNetCV
from sklearn.exceptions import ConvergenceWarning
from scipy.stats import directional_stats
from scipy.stats import circmean, linregress
from scipy.spatial import cKDTree
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import gstools as gs
import libpysal.weights
import libpysal
from esda.moran import Moran
# Visualization
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import seaborn as sns
import plotly.graph_objects as go
# Debugging
import logging
from tqdm import tqdm
# Processing / Computation
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor, as_completed
from functools import partial
import multiprocessing
from joblib import Parallel, delayed
import pickle
# Core python functionality
import os
import io
import re
from io import StringIO
import glob
import time
from datetime import datetime, timedelta
import math
from collections import defaultdict
import warnings
import csv
import json
import pickle
import argparse
import ast
import threading
import itertools
# Database
import psycopg2
import psycopg2.extras
from psycopg2 import sql
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
from psycopg2.extensions import Binary

warnings.filterwarnings("error")
warnings.simplefilter("ignore", ConvergenceWarning)

In [3]:
# Set API credentials
username = 'testUser'
password = 'test-1234-user'

In [None]:
# Set Coordinate Reference Objects

wkt_7801 = """PROJCS["BGS2005 / CCS2005",
GEOGCS["BGS2005",
DATUM["Bulgaria_Geodetic_System_2005",
    SPHEROID["GRS 1980",6378137,298.257222101],
    TOWGS84[0,0,0,0,0,0,0]],
PRIMEM["Greenwich",0,
    AUTHORITY["EPSG","8901"]],
UNIT["degree",0.0174532925199433,
    AUTHORITY["EPSG","9122"]],
AUTHORITY["EPSG","7798"]],
PROJECTION["Lambert_Conformal_Conic_2SP"],
PARAMETER["latitude_of_origin",42.6678756833333],
PARAMETER["central_meridian",25.5],
PARAMETER["standard_parallel_1",42],
PARAMETER["standard_parallel_2",43.3333333333333],
PARAMETER["false_easting",500000],
PARAMETER["false_northing",4725824.3591],
UNIT["metre",1,
AUTHORITY["EPSG","9001"]],
AUTHORITY["EPSG","7801"]]"""
wkt_4326 = """GEOGCS["WGS 84",
    DATUM["WGS_1984",
        SPHEROID["WGS 84",6378137,298.257223563,
            AUTHORITY["EPSG","7030"]],
        AUTHORITY["EPSG","6326"]],
    PRIMEM["Greenwich",0,
        AUTHORITY["EPSG","8901"]],
    UNIT["degree",0.0174532925199433,
        AUTHORITY["EPSG","9122"]],
    AUTHORITY["EPSG","4326"]]"""
project_crs = CRS.from_wkt(wkt_7801)
new_crs = CRS.from_wkt(wkt_4326)
os.environ["GTIFF_SRS_SOURCE"] = "EPSG"
os.environ["PROJ_LIB"] = pyproj.datadir.get_data_dir()

<center>Step 2: Set-up PostGreSQL database and tables<center>

In [16]:
# Set up sql database and tables
 
# Database parameters
dbname='platform_db'
host='localhost'
user='postgres'
password='postgres'
port='5432'
# SQL file to create tables
sql_file = 'setup_tables.sql'

# Function to set up the database
def setup_database(dbname):
    # Connect to server
    conn = psycopg2.connect(host=host, user=user, password=password)
    conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) # Set isolation level needed for db creation
    cursor = conn.cursor()
    # Create database (check if it exists first)
    cursor.execute("SELECT 1 FROM pg_database WHERE datname = %s", (dbname,))
    exists = cursor.fetchone()
    if not exists:
        cursor.execute(f"CREATE DATABASE {dbname}")
        print(f'Database {dbname} created')
    else:
        print(f"Database {dbname} already exists")
    cursor.close()
    conn.close()

# Function to set up the tables in the db, runs the prepared sql script
def setup_tables(sql_file):
    # Connect to the new db
    conn = psycopg2.connect(dbname=dbname, host=host, user=user, password=password)
    conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
    cursor = conn.cursor()
    # Load and execute SQL script
    with open(sql_file, 'r') as f:
        sql_script = f.read()
    try:
        cursor.execute(sql_script)
        print('Tables created')
    except psycopg2.Error as e:
        print(f"Error executing SQL script: {e}")
        # You might want to log the error or handle it more specifically
    finally:
        cursor.close()
        conn.close()

# Set up database
#setup_database(dbname)
# Set up tables
#setup_tables(sql_file)

<center>Step 3: Collect Historical Data</center>

In [7]:
# Collect air pollution station information
# Can merge this with below one after testing

station_url = 'https://citylab.gate-ai.eu/sofiasensors/api/stations/'
# Use requests to request the API
response = requests.get(station_url, auth=HTTPBasicAuth(username,password))
response.raise_for_status()
stations_data = response.json()
# Save the important information to a pandas dataframe
all_stations_df = pd.DataFrame(stations_data, columns=['id','name', 'longitude', 'latitude', 'operator'])
# Clean up the operator names
all_stations_df['operator'] = all_stations_df['operator'].replace('GATE Institute', 'GATE')
all_stations_df['operator'] = all_stations_df['operator'].replace('Executive environmental agency (ExEA)', 'ExEA')
all_stations_df['operator'] = all_stations_df['operator'].replace('Sofia municipality', 'AirThings')
# Convert Lat/Lon values into EPSG7801
transformer = Transformer.from_crs("EPSG:4326", "EPSG:7801", always_xy=True)
def reproject_coords(row):
    longitude, latitude = row['longitude'], row['latitude']
    x, y = transformer.transform(longitude, latitude)
    # Round the coordinates to decimeters
    x, y = round(x, ndigits=1), round(y, ndigits=1)
    return (x,y)
all_stations_df['location'] = all_stations_df.apply(reproject_coords, axis=1)
all_stations_df.drop(['latitude', 'longitude'], axis=1, inplace=True)

print(f'Station information collected. {len(all_stations_df)} total stations')

Station information collected. 39 total stations


In [11]:
# Collect historical hourly measurement data
# takes ~24 min

# Creates an empty dataframe of all of the hours and stations for the year: 
def create_empty_df(station_df, start_date, end_date):
    # Generate hourly timestamps with pandas.daterange
    timestamps = pd.date_range(start=start_date, end=end_date, freq='h')
    # Repeat each timestamp for the amount of stations
    timestamps_repeated = np.repeat(timestamps, len(station_df))
    # Tile the station names and ids over each timestamp
    stations_repeated = np.tile(station_df['name'], len(timestamps))
    ids_repeated = np.tile(station_df['id'], len(timestamps))
    # Construct the DataFrame with organized entries for each hour and station
    empty_df = pd.DataFrame({'time': timestamps_repeated, 'id': ids_repeated, 'name': stations_repeated})
    # Add parameter columns as NaN values to station_data
    for param in total_params:
        empty_df[param] = pd.NA
    # Set a multi-index for efficient updating
    empty_df.set_index(['time','id'], inplace=True)
    print(f'Dataframe for {start_date} to {end_date} created')
    return empty_df   

# Fills the empty dataframe with measurement values from one year
def station_data_collection(station_df, start_date, end_date):
    # Create an empty dataframe to fill in with station data
    # Hours without measurements will not be returned in API
    year_df = create_empty_df(station_df, start_date, end_date)
    # Iterate over each station
    for _,station in station_df.iterrows():
        station_id = station['id']
        station_name = station['name']
        station_operator = station['operator']
        # Skip first two years of collection for GATE sensors
        if (('2020' in start_date) or ('2021' in start_date)) and station_operator == 'GATE':
            continue
        # Set collected parameters based on station to avoid empty queries
        collected_params = [
            param for param in total_params
            if station_id not in missing_params.get(param, [])]
        # Iterate over each parameter to collect one in each request
        for param in collected_params:
            # Reformat space in param string to fit API parameters
            param_str = param.replace(' ','%20')
            # URL for the 'chart' endpoint which collects serial data the quickest
            yearly_url = rf'https://citylab.gate-ai.eu/sofiasensors/api/aggregated/chart/measurements/?station_name={station_name}&parameter_name={param_str}&start_date={start_date}%2019%3A00%3A00&end_date={end_date}%2019%3A00%3A00'
            # Try API request
            try:
                response = requests.get(yearly_url, auth=HTTPBasicAuth(username,password), timeout=30)
                data = response.json()
                response.raise_for_status() # Raises HTTP error that I can catch and interpret
            except requests.exceptions.RequestException as e:
                print(f"Request failed for station {station_name}: {e}")
                continue # If request fails, move on to next parameter
            # If the data is empty, move on to the next parameter
            if not data:
                print(f'No data for {station_name}, {param}')
                continue
            print(f'{len(data)} measured values for {station_name},{param}')
            # Create a df to format the response json with the station name and param
            st_param_df = pd.DataFrame([{'time': list(d.keys())[0], 
                                         'id': station_id,
                                         'name': station_name, 
                                         param : list(d.values())[0]}
                                         for d in data])
            st_param_df['time'] = pd.to_datetime(st_param_df['time'])
            # Set the multi-index for this df to align it with year_df
            st_param_df.set_index(['time','id'], inplace=True)
            # Update the empty df values with the new station and parameter
            year_df.update(st_param_df)
        print(f'{station_name} data collected...')
    # Reset station_data index to restore name and time columns
    year_df.reset_index(inplace=True)
    # Optimize column dtypes for storage
    year_df['name'] = year_df['name'].astype('category')
    year_df['id'] = year_df['id'].astype(pd.Int8Dtype())
    for param in total_params:
        if param in ['Temperature', 'Relative humidity']: # extra conditional due to super high erroneous values that dont fit into float 16
            year_df[param] = pd.to_numeric(year_df[param], errors='coerce').astype(np.float32)
        else:
            year_df[param] = pd.to_numeric(year_df[param], errors='coerce').astype(np.float16)
    return year_df 

# Define parameters to be collected
total_params = ['Ozone','Nitrogen dioxide', 'Sulphur dioxide', 'Particulate matter 10', 'Particulate matter 2.5', 'Temperature', 'Pressure', 'Relative humidity', 'Wind direction', 'Wind speed']
total_params = ['Solar irradiation']
# Based on stations/measure/parameter endpoint of the API
missing_params = {
    'Ozone': [4],
    'Particulate matter 2.5': [1,3,4,5],
    'Pressure':[3,4],
    'Wind direction': [i for i in range(6,28)], # No airthings for wind
    'Wind speed': [i for i in range(6,28)]
}

all_years_dfs = []
# Collect from when all stations are operational
for year in range(2022,2025):
    start_date = f'{year}-09-01'
    # Set the end date a year after unless it's this year
    if year < 2024:
        end_date = f'{year+1}-09-01'
    else:
        end_date = '2025-06-01'
    # Run yearly collection function for this particular year
    # One year at a time due to memory and HTTP request size constraints
    year_df = station_data_collection(station_info, start_date, end_date)
    all_years_dfs.append(year_df)

# Concatenate all year dfs into one total dataframe
final_df = pd.concat(all_years_dfs, ignore_index=True)
final_df.sort_values(by=['time', 'id'], inplace=True)
final_df.info(memory_usage='deep')

# Save to csv for now
final_df.to_csv(r'C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\solar.csv')

Dataframe for 2022-09-01 to 2023-09-01 created
8761 measured values for AE1,Solar irradiation
AE1 data collected...
8761 measured values for AE2,Solar irradiation
AE2 data collected...
8761 measured values for AE3,Solar irradiation
AE3 data collected...
8569 measured values for AE4,Solar irradiation
AE4 data collected...
8737 measured values for AE5,Solar irradiation
AE5 data collected...
No data for AT2, Solar irradiation
AT2 data collected...
No data for AT4, Solar irradiation
AT4 data collected...
No data for AT6, Solar irradiation
AT6 data collected...
No data for AT7, Solar irradiation
AT7 data collected...
No data for AT8, Solar irradiation
AT8 data collected...
No data for AT9, Solar irradiation
AT9 data collected...
No data for AT10, Solar irradiation
AT10 data collected...
No data for AT12, Solar irradiation
AT12 data collected...
No data for AT13, Solar irradiation
AT13 data collected...
No data for AT14, Solar irradiation
AT14 data collected...
No data for AT16, Solar irradi

In [4]:
# Clean sensor data

# Don't need for final version
total_params = ['Ozone','Nitrogen dioxide', 'Sulphur dioxide', 'Particulate matter 10', 'Particulate matter 2.5', 'Temperature', 'Pressure', 'Relative humidity', 'Wind direction', 'Wind speed']

# Function to set unrealistic values and outliers to nan values
def nan_erroneous(raw_data):
    cleaned_data = raw_data.copy()
    cleaned_data = cleaned_data.sort_values(by=['id', 'time'])
    # First remove impossible values and error values found by examining the data
    cleaned_data.loc[cleaned_data['Temperature'] > 50, 'Temperature'] = np.nan
    cleaned_data.loc[cleaned_data['Relative humidity'] > 105, 'Relative humidity'] = np.nan
    cols_to_check_positive = ['Ozone', 'Nitrogen dioxide', 'Sulphur dioxide', 
        'Particulate matter 10', 'Particulate matter 2.5', 
        'Pressure', 'Relative humidity']
    for col in cols_to_check_positive:
        cleaned_data.loc[cleaned_data[col] <= 0, col] = np.nan
    # Remove O3 data before February 2024 #TODO: Add visualizer to the notebook
    cutoff_date = pd.to_datetime('2024-02-01')
    cleaned_data.loc[cleaned_data['time'] < cutoff_date, 'Ozone'] = np.nan
    # Use rolling average 3 st.dev to find strong outliers
    cleaned_data = cleaned_data.set_index('time')
    for col in total_params:
        mean = cleaned_data.groupby('id')[col].rolling(window='7D', min_periods=1).mean()
        std = cleaned_data.groupby('id')[col].rolling(window='7D', min_periods=1).std()
        # Drop id index level to align to original index
        mean = mean.reset_index(level=0, drop=True)
        std = std.reset_index(level=0, drop=True)
        upper_bound = mean + (3 * std)
        lower_bound = mean - (3 * std)
        cleaned_data[col] = cleaned_data[col].where((cleaned_data[col] <= upper_bound) & (cleaned_data[col] >= lower_bound))
    cleaned_data = cleaned_data.sort_values(by=['time', 'id'])
    cleaned_data = cleaned_data.reset_index()
    return cleaned_data

cleaned_data = nan_erroneous(raw_data)

In [None]:
# Remove invalid stations
# Plot overall data completeness of stations per month

total_params = ['Ozone','Nitrogen dioxide', 'Sulphur dioxide', 'Particulate matter 10', 'Particulate matter 2.5', 'Temperature', 'Pressure', 'Relative humidity', 'Wind direction', 'Wind speed']

# Can remove
missing_params = {
    'Ozone': [4],
    'Particulate matter 2.5': [1,3,4,5],
    'Pressure':[3,4],
    'Wind direction': [i for i in range(6,28)], # No airthings for wind
    'Wind speed': [i for i in range(6,28)]
}

def plot_completeness(cleaned_data):
    df_processed = cleaned_data.copy()
    unique_stations = all_stations_df[['name', 'id']]

    # Collect overall completeness of each station
    station_overall_completeness = {}
    for _, station_row in unique_stations.iterrows():
        station_name = station_row['name']
        station_id = station_row['id']
        # Determine the parameters this station collects
        params_collected = [p for p in total_params 
                            if p not in missing_params or station_id not in missing_params.get(p, [])]
        # Filter the df for only the current station's data
        station_data = df_processed[df_processed['name'] == station_name]
        total_expected = len(station_data) * len(params_collected)
        # Calculate the total number of non-null measurements
        total_actual = station_data[params_collected].notnull().sum().sum()
        completeness_pct = (total_actual / total_expected) * 100
        station_overall_completeness[station_name] = completeness_pct
    # Convert to a Series and sort it to get the ranked order
    ranked_stations = pd.Series(station_overall_completeness).sort_values()

    # Calculate the monthly percentages of non-null values
    station_expected_counts = {}
    total_param_count = len(total_params)
    for _, station_row in unique_stations.iterrows():
        num_missing = sum(1 for param, missing_ids in missing_params.items() 
                          if param in total_params and station_row['id'] in missing_ids)
        station_expected_counts[station_row['name']] = total_param_count - num_missing
    df_processed['expected_param_count'] = df_processed['name'].map(station_expected_counts)
    df_processed['expected_param_count'] = df_processed['expected_param_count'].fillna(total_param_count)
    non_null_actuals = df_processed[total_params].notnull().sum(axis=1)
    df_processed['non_null_percentage'] = 100 * (non_null_actuals / df_processed['expected_param_count']).replace([np.inf, -np.inf], 0)
    df_processed['month_year'] = df_processed['time'].dt.to_period('M')
    monthly_station_completeness = df_processed.groupby(['month_year', 'name'], observed=True)['non_null_percentage'].mean().reset_index()
    pivot_df = monthly_station_completeness.pivot(index='month_year', columns='name', values='non_null_percentage')
    pivot_df.index = pivot_df.index.astype(str)
    monthly_median = pivot_df.median(axis=1)
    monthly_mean = pivot_df.mean(axis=1)

    # Plot figure
    fig = go.Figure()
    # Add mean and median lines
    fig.add_trace(go.Scatter(
        x=monthly_mean.index, y=monthly_mean.values, name='Mean (All Stations)',
        mode='lines', line=dict(color='firebrick', width=3, dash='dash'), hoverinfo='x+y'
    ))
    fig.add_trace(go.Scatter(
        x=monthly_median.index, y=monthly_median.values, name='Median (All Stations)',
        mode='lines', line=dict(color='black', width=4), hoverinfo='x+y'
    ))
    # Add individual station lines IN RANKED ORDER
    for station_name, overall_pct in ranked_stations.items():
        if station_name in pivot_df.columns:
            fig.add_trace(go.Scatter(
                x=pivot_df.index, y=pivot_df[station_name],
                name=f"{station_name} ({overall_pct:.1f}%)", # New ranked & labeled name
                mode='lines', line=dict(width=1.5),
                visible='legendonly', # Hidden by default
                hoverinfo='x+y+name'
            ))
    # Customize layout
    fig.update_layout(
        title_text='Interactive Monthly Data Completeness',
        xaxis_title='Month-Year',
        yaxis_title='Non-Null Percentage (%)',
        yaxis_range=[-5, 105],
        legend_title_text='Stations (Ranked by Overall Completeness)',
        hovermode='x unified',
        template='plotly_white',
        xaxis=dict(
            dtick="M1",  
            tickformat="%b\n%Y",  
            tickangle=0  
        )
    )
    fig.show()

# Remove identified stations from cleaned_data
invalid_stations = [6,8,10,16,20,32,36,38]
#old_invalid_stations = [7,9,19,27,32,36,38]
mask = ~cleaned_data['id'].isin(invalid_stations)
processed_data = cleaned_data[mask].copy()


plot_completeness(processed_data)



# Save to csv
processed_data.to_csv(r'C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\processed_data.csv', index=False)

In [8]:
# Plot completeness by parameter
def plot_completeness_by_parameter(cleaned_data):
    """
    Plots the monthly data completeness for each station for a selected parameter.

    Args:
        cleaned_data (pd.DataFrame): DataFrame containing the time series data with a 'time' column,
                                     a 'name' column for station names, and columns for each parameter.
    """
    
    df_processed = cleaned_data.copy()
    df_processed['month_year'] = df_processed['time'].dt.to_period('M')
    
    total_params = ['Ozone','Nitrogen dioxide', 'Sulphur dioxide', 'Particulate matter 10', 'Particulate matter 2.5', 'Temperature', 'Pressure', 'Relative humidity', 'Wind direction', 'Wind speed']
    unique_stations = df_processed['name'].unique()
    
    fig = go.Figure()

    num_traces_per_param = len(unique_stations) + 2  # Stations + Mean + Median

    for i, param in enumerate(total_params):
        # Calculate monthly completeness for the parameter
        monthly_completeness = df_processed.groupby(['month_year', 'name'], observed=False)[param].apply(lambda x: x.notnull().mean() * 100).reset_index()
        pivot_df = monthly_completeness.pivot(index='month_year', columns='name', values=param)
        pivot_df = pivot_df.reindex(columns=unique_stations, fill_value=0) # Ensure all stations are included
        pivot_df.index = pivot_df.index.astype(str)

        # Rank stations by overall completeness for the parameter
        station_overall_completeness = pivot_df.mean().sort_values()

        # Add traces for each station, ranked
        for station_name in station_overall_completeness.index:
            completeness_pct = station_overall_completeness[station_name]
            fig.add_trace(go.Scatter(
                x=pivot_df.index,
                y=pivot_df[station_name],
                name=f"{station_name} ({completeness_pct:.1f}%)",
                mode='lines',
                visible=(i == 0),
                line=dict(width=1.5),
                hoverinfo='x+y+name',
                showlegend=True
            ))
            
        # Calculate and add mean and median traces
        monthly_mean = pivot_df.mean(axis=1)
        monthly_median = pivot_df.median(axis=1)
        
        fig.add_trace(go.Scatter(
            x=monthly_mean.index, y=monthly_mean.values, name='Mean (All Stations)',
            mode='lines', line=dict(color='firebrick', width=3, dash='dash'),
            visible=(i == 0), hoverinfo='x+y'
        ))
        fig.add_trace(go.Scatter(
            x=monthly_median.index, y=monthly_median.values, name='Median (All Stations)',
            mode='lines', line=dict(color='black', width=4),
            visible=(i == 0), hoverinfo='x+y'
        ))

    # Create dropdown menu
    buttons = []
    for i, param in enumerate(total_params):
        visibility = [False] * (len(total_params) * num_traces_per_param)
        start_index = i * num_traces_per_param
        end_index = start_index + num_traces_per_param
        for j in range(start_index, end_index):
            visibility[j] = True
            
        button = dict(
            label=param,
            method='update',
            args=[{'visible': visibility},
                  {'title': f'Monthly Data Completeness for {param}'}]
        )
        buttons.append(button)

    fig.update_layout(
        updatemenus=[dict(
            active=0,
            buttons=buttons,
            direction="down",
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.01,
            xanchor="left",
            y=1.15,
            yanchor="top"
        )],
        title_text=f'Monthly Data Completeness for {total_params[0]}',
        xaxis_title='Month-Year',
        yaxis_title='Completeness (%)',
        yaxis_range=[-5, 105],
        legend_title_text='Stations (Ranked by Completeness)',
        hovermode='x unified',
        template='plotly_white',
        xaxis=dict(
            dtick="M1",
            tickformat="%b\n%Y",
            tickangle=0
        )
    )

    fig.show()

# Remove identified stations from cleaned_data
invalid_stations = [6,8,10,16,20,32,36,38]
#old_invalid_stations = [7,9,19,27,32,36,38]
mask = ~cleaned_data['id'].isin(invalid_stations)
processed_data = cleaned_data[mask].copy()

plot_completeness_by_parameter(processed_data)

In [10]:
# Prepare and upload processed data to sensor_data db

# Rename measurements for ease and matching db schema
processed_data.rename({
    'time':'measured_time',
    'id':'station_id',
    'Ozone':'O3',
    'Nitrogen dioxide':'NO2',
    'Sulphur dioxide':'SO2',
    'Particulate matter 10':'PM10',
    'Particulate matter 2.5':'PM25',
    'Temperature':'T',
    'Pressure':'P',
    'Relative humidity':'RH',
    'Wind direction':'WD',
    'Wind speed':'WS'
}, axis=1, inplace=True)
total_params = ['O3','NO2','SO2','PM10','PM25','T','P','RH','WD','WS']
# Remove station name column as its uncessary
f_processed_data = processed_data.drop('name', axis='columns')
# Add timezone to measured_time to match schema
f_processed_data['measured_time'] = f_processed_data['measured_time'].dt.tz_localize('UTC')

# Transform dataframe to 'long' format turning measurement columns to rows
df_long = f_processed_data.melt(
    id_vars=['measured_time', 'station_id'],
    value_vars=total_params,
    var_name='measurement',
    value_name='reading_value'
)
# Change measurement to categorical to save some memory
df_long['measurement'] = pd.Categorical(df_long['measurement'], categories=total_params, ordered=False)

# Remove all nan values
df_long.dropna(subset=['reading_value'], inplace=True)
df_long.reset_index(drop=True, inplace=True)
# Reorder columns for schema
df_final = df_long[['measured_time','station_id','measurement','reading_value']]
print('Final Dataframe Created')
print(df_final.info(memory_usage='deep'))
df_final.to_csv(r'C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\long_sensor_measurements3.csv', index=False)

def upload_df_to_db(df_final):
    # Copy dataframe to database
    conn = None
    try:
        conn = psycopg2.connect(
            dbname=dbname,
            user=user,
            password=password,
            host=host,
            port=port
        )
        cursor = conn.cursor()
        print('Connected to Database')
        print('Copying df to database expect ~1 min of processsing')

        # Create in-memory buffer
        buffer = io.StringIO()
        # Write df to the buffer as csv
        df_final.to_csv(buffer, index=False, header=False)
        buffer.seek(0)
        sql_copy_command = """
            COPY sensor_data(measured_time, station_id, measurement, reading_value)
            FROM STDIN
            WITH (FORMAT CSV)
        """
        # Execute command
        cursor.copy_expert(sql=sql_copy_command, file=buffer)
        conn.commit()
        print(f"{cursor.rowcount} rows inserted successfully.")
    except (Exception, psycopg2.DatabaseError) as error:
        print(f"Error: {error}")
        if conn:
            conn.rollback()
    finally:
        if conn:
            cursor.close()
            conn.close()
            print("Database connection closed.")

# Run function to upload to database
#upload_df_to_db(df_final=df_final)

Final Dataframe Created
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4876094 entries, 0 to 4876093
Data columns (total 4 columns):
 #   Column         Dtype              
---  ------         -----              
 0   measured_time  datetime64[ns, UTC]
 1   station_id     Int8               
 2   measurement    category           
 3   reading_value  float32            
dtypes: Int8(1), category(1), datetime64[ns, UTC](1), float32(1)
memory usage: 69.8 MB
None


<center>Step 4: Calculate Spatial Independant Variables<center>

In [14]:
# Calculate all variable values

# Function to create all masks needed for processing
def create_masks(mask_config):
    masks = {}
    directions = {'E': 0, 'NE': 45, 'N': 90, 'NW': 135,'W': 180, 'SW': 225, 'S': 270, 'SE': 315}
    sector_half_angle_rad = np.deg2rad(22.5)

    for res, shapes in mask_config.items():
        masks[res] = {'circle':{}, 'sector':{}}
        # Create circular masks
        circle_radii = shapes.get('circle', [])
        for buffer_size in circle_radii:
            radius_px = int(np.round(buffer_size / res))
            size = 2 * radius_px + 1
            center = radius_px
            y, x = np.ogrid[:size, :size]
            # Distance from the center of the mask to the center of each pixel
            distance_squared = (x - center)**2 + (y - center)**2
            masks[res]['circle'][buffer_size] = distance_squared <= radius_px**2
        # Create 8 Sector Masks
        sector_radii = shapes.get('sector', [])
        for buffer_size in sector_radii:
            masks[res]['sector'][buffer_size] = {}
            radius_px = int(np.round(buffer_size / res))
            size = 2 * radius_px + 1
            center = radius_px
            y, x = np.ogrid[:size, :size]
            distance_squared = (x - center)**2 + (y - center)**2
            radius_check = distance_squared <= radius_px**2
            # Angle of pixels center relative to mask's center
            pixel_angles_rad = np.arctan2(center - y, x - center)
            for dir_name, dir_angle_deg in directions.items():
                dir_angle_rad = np.deg2rad(dir_angle_deg)
                relative_angles = np.arctan2(np.sin(pixel_angles_rad - dir_angle_rad), 
                                             np.cos(pixel_angles_rad - dir_angle_rad))
                angle_check = np.abs(relative_angles) <= sector_half_angle_rad
                # Final mask is where both radius and angle conditions are met
                final_mask = radius_check & angle_check
                masks[res]['sector'][buffer_size][dir_name] = final_mask
    return masks

# Function to get the index of a raster for a point
def get_cell_indices(point, raster_info): 
    xmin = raster_info['xmin']
    ymax = raster_info['ymax']
    resolution = raster_info['resolution']
    pointx, pointy = point.x, point.y
    col = ((pointx - xmin) / resolution)
    row = ((ymax - pointy) / resolution)
    return round(row), round(col)

# Function which collects cells from the mask
def get_mask_cells(point_index, array, mask):
    rows, cols = array.shape
    point_r, point_c = point_index
    mask_size = mask.shape[0]
    radius = mask_size // 2
    # Calculate bounds for the slice in the main array
    row_start = max(0, point_r - radius)
    row_end = min(rows, point_r + radius + 1)
    col_start = max(0, point_c - radius)
    col_end = min(cols, point_c + radius + 1)
    # Bounds for the mask slice to align with array slice
    mask_row_start = max(0, radius - point_r)
    mask_row_end = mask_size - max(0, (point_r + radius + 1) - rows)
    mask_col_start = max(0, radius - point_c)
    mask_col_end = mask_size - max(0, (point_c + radius + 1) - cols)
    # Extract the slices
    array_slice = array[row_start:row_end, col_start:col_end]
    mask_slice = mask[mask_row_start:mask_row_end, mask_col_start:mask_col_end]
    return array_slice[mask_slice]

# Function to calculate the value of a buffer
def calculate_buffer_value(point, var, raster_info, mask):
    # Convert point to cell coordinate in raster
    point_index = get_cell_indices(point, raster_info)
    # Extract the cells in 1d array 
    buffer_cells = get_mask_cells(point_index, raster_info['array'], mask)
    # Filter nodata values
    nodata_val = raster_info['nodata']
    valid_cells = buffer_cells[buffer_cells != nodata_val]
    
    # Calculation functions depending on variable
    def calculate_land_use(land_use_code):
        return np.isin(valid_cells, land_use_code).sum()
    def calculate_build_std(): # Standard deviation of building heights
        if len(valid_cells) >= 2:
            return np.std(valid_cells)
        else:
            return 0.0
    
    # Dictionary mapping variables to their calculation functions
    calculation_map = {
        'lu_hdr': lambda: calculate_land_use(1),
        'lu_ldr': lambda: calculate_land_use(2),
        'lu_ind': lambda: calculate_land_use(3),
        'lu_ug': lambda: calculate_land_use(4),
        'lu_art': lambda: calculate_land_use(5),
        'lu_for': lambda: calculate_land_use(6),
        'lu_rur': lambda: calculate_land_use(7),
        'build_fp': lambda: len(valid_cells),
        'build_vol': lambda: np.sum(valid_cells),
        'build_var': calculate_build_std,
        'ndvi': lambda: float(np.mean(valid_cells)),
        'pop': lambda: float(np.mean(valid_cells)),
        'rd_mt': lambda: np.count_nonzero(valid_cells),
        'rd_prim': lambda: np.count_nonzero(valid_cells),
        'rd_sec': lambda: np.count_nonzero(valid_cells),
        'rd_ter': lambda: np.count_nonzero(valid_cells),
        'rd_res': lambda: np.count_nonzero(valid_cells)
    }

    # Return the value from the function based on the variable
    return calculation_map[var]()

# Function to collect value of the cell point
def cell_value(point, raster_info):
    # Convert point to cell coordinate in raster
    i,j = get_cell_indices(point, raster_info)
    # Extract cell value
    return raster_info['array'][i,j]

# Function to do distance calculations (for major road and busstop)
def distance_calculations(point, tree, geoms):
    # Find nearest linestring in the tree
    nearest_index = tree.nearest(point)
    nearest_geom = geoms[nearest_index]
    return point.distance(nearest_geom)

# Function to parallel compute the function with input points 
def parallel_compute(func, input_points):
    with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
        results = list(executor.map(func, input_points))
    return results

# Main calculation function
def calculate_points(var_info, points, success_list, skip_config=None):
    # Initialize list to store variable calculations
    var_values = []
    # Define variables from input calculation
    var_name = var_info['name']
    data_file = var_info['file']
    calc_type = var_info['calculation']
    # Check if the whole variable should be skipped
    if skip_config and var_name in skip_config and skip_config[var_name].get('all'):
        print(f'{var_name} skipped')
        return var_values
    # Process most raster data
    if calc_type == 'buffer':
        with rasterio.open(data_file) as src:
            # Save important info of the raster to dictionary
            raster_info = {'name': var_name,
                           'array':src.read(1),
                           'resolution':src.res[0],
                           'xmin':src.bounds.left,
                           'ymax':src.bounds.top,
                           'nodata':src.nodata}
            # Get masks for the raster resolution
            res_masks = masks.get(raster_info['resolution'])
            # Iterate over circle masks
            for radius, mask in res_masks.get('circle', {}).items():
                # Check skip_config before calculating
                if skip_config and var_name in skip_config and 'circle' in skip_config.get(var_name, {}) and radius in skip_config[var_name]['circle']:
                    print(f'{var_name} circle {radius}m skipped')
                    continue
                calc_name = f'{var_name}_{radius}m_cir'
                # Define the partial function pre-filled with arguments
                func = partial(calculate_buffer_value, var=var_name, raster_info=raster_info, mask=mask)
                try: # Catch errors if they happen and add to a success list to not calculate again
                    start_time = time.time()
                    # Calculate and save the values of all of the points calculated for the buffer value
                    var_values.append(pd.Series(parallel_compute(func,points), name=calc_name))
                    end_time = time.time()
                    duration = end_time - start_time
                    print(f'{calc_name} calculated in {duration:.2f} seconds')
                    success_list.append(calc_name)
                except Exception as e:
                    print(f'Calculation error: {e}')
            # Iterate over sector masks
            for radius, dir_masks in res_masks.get('sector', {}).items():
                # Check skip_config before calculating
                if skip_config and var_name in skip_config and 'sector' in skip_config.get(var_name, {}) and radius in skip_config[var_name]['sector']:
                    print(f'{var_name} sector {radius}m skipped')
                    continue
                for direction, mask in dir_masks.items():
                    calc_name = f'{var_name}_{radius}m_{direction}'
                    func = partial(calculate_buffer_value, var=var_name, raster_info=raster_info, mask=mask)
                    try:
                        start_time = time.time()
                        # Save the values
                        var_values.append(pd.Series(parallel_compute(func,points), name=calc_name))
                        end_time = time.time()
                        duration = end_time - start_time
                        print(f'{calc_name} calculated in {duration:.2f} seconds')
                        success_list.append(calc_name)
                    except Exception as e:
                        print(f'Calculation error: {e}')
    # Process elevation and other value
    elif calc_type == 'value':
        with rasterio.open(data_file) as src:
            # Save important info of the raster to dictionary
            raster_info = {'name': var_name,
                           'array':src.read(1),
                           'resolution':src.res[0],
                           'xmin':src.bounds.left,
                           'ymax':src.bounds.top,
                           'nodata':src.nodata}
            func = partial(cell_value, raster_info=raster_info)
            start_time = time.time()
            # Calculate and save values    
            var_values.append(pd.Series(parallel_compute(func,points), name=var_name))
            end_time = time.time()
            duration = end_time - start_time
            print(f'{var_name} calculated in {duration:.2f} seconds')
    # Process distance variables
    else:
        vector = gpd.read_file(data_file)
        # Create STRtree spatial index from the gdf
        geoms = vector.geometry.values
        tree = STRtree(geoms)
        # Set the name and parallel compute the distance calculations
        func = partial(distance_calculations, tree=tree, geoms=geoms)
        start_time = time.time()
        # Calculate and save values    
        var_values.append(pd.Series(parallel_compute(func,points), name=var_name))
        end_time = time.time()
        duration = end_time - start_time
        print(f'{var_name} calculated in {duration:.2f} seconds')
    return var_values

buffer_sizes = [25,100,500,1500,4000]
indep_vars = [
    {'name':'lu_hdr','file':f_landuse,'calculation':'buffer'},
    {'name':'lu_ldr','file':f_landuse,'calculation':'buffer'},
    {'name':'lu_ind','file':f_landuse,'calculation':'buffer'},
    {'name':'lu_ug','file':f_landuse,'calculation':'buffer'},
    {'name':'lu_art','file':f_landuse,'calculation':'buffer'},
    {'name':'lu_for','file':f_landuse,'calculation':'buffer'},
    {'name':'lu_rur','file':f_landuse,'calculation':'buffer'},
    {'name':'build_fp','file':f_buildheight,'calculation':'buffer'},
    {'name':'build_vol','file':f_buildheight,'calculation':'buffer'},
    {'name':'build_var','file':f_buildheight,'calculation':'buffer'},
    {'name':'ndvi','file':f_ndvi,'calculation':'buffer'},
    {'name':'pop','file':f_pop,'calculation':'buffer'},
    {'name':'rd_mt','file':f_motor_trunk,'calculation':'buffer'},
    {'name':'rd_prim','file':f_primary,'calculation':'buffer'},
    {'name':'rd_res','file':f_residential,'calculation':'buffer'},
    {'name':'rd_sec','file':f_secondary,'calculation':'buffer'},
    {'name':'rd_ter','file':f_tertiary,'calculation':'buffer'},
    {'name':'elevation','file':f_elevation,'calculation':'value'},
    {'name':'rd_dis','file':f_mtp,'calculation':'vector_dis'},
    {'name':'bus_dis','file':f_busstop,'calculation':'vector_dis'}
]

# First pre-calculate the masks to be used for buffer sector extraction
mask_config = {10: {'circle':[25,100,500,1500,4000], 
                    'sector':[100,500,1500,4000]},
            100: {'circle':[25,100,500,1500,4000],
                'sector':[500,1500,4000] }}
masks = create_masks(mask_config)
print('Buffer masks created')

success_list = []
def calculate_ivs():
    # Calculate independant variable station values
    station_points = [Point(loc) for loc in station_info['location']]
    station_ids = [id for id in station_info['id']]
    # Initialize df to store all iv values
    iv_values = [pd.Series(station_ids, name='id')]
    for indep_var in indep_vars:
        print(f'Calculating {indep_var["name"]}...')
        # Calculate list of dicts of all calculations and corresponding values
        list_of_calc_series = calculate_points(indep_var, station_points, success_list)
        # Iterate over calculations and add to df
        iv_values.extend(list_of_calc_series)
    iv_values_df = pd.concat(iv_values, axis=1)
    return iv_values_df

iv_values_df = calculate_ivs()
iv_values_df.to_csv(r'C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\iv_values3.csv', index=False)

Buffer masks created
Calculating lu_hdr...
lu_hdr_25m_cir calculated in 0.01 seconds
lu_hdr_100m_cir calculated in 0.01 seconds
lu_hdr_500m_cir calculated in 0.01 seconds
lu_hdr_1500m_cir calculated in 0.01 seconds
lu_hdr_4000m_cir calculated in 0.03 seconds
lu_hdr_100m_E calculated in 0.01 seconds
lu_hdr_100m_NE calculated in 0.01 seconds
lu_hdr_100m_N calculated in 0.01 seconds
lu_hdr_100m_NW calculated in 0.01 seconds
lu_hdr_100m_W calculated in 0.01 seconds
lu_hdr_100m_SW calculated in 0.01 seconds
lu_hdr_100m_S calculated in 0.01 seconds
lu_hdr_100m_SE calculated in 0.01 seconds
lu_hdr_500m_E calculated in 0.01 seconds
lu_hdr_500m_NE calculated in 0.01 seconds
lu_hdr_500m_N calculated in 0.01 seconds
lu_hdr_500m_NW calculated in 0.01 seconds
lu_hdr_500m_W calculated in 0.01 seconds
lu_hdr_500m_SW calculated in 0.01 seconds
lu_hdr_500m_S calculated in 0.01 seconds
lu_hdr_500m_SE calculated in 0.01 seconds
lu_hdr_1500m_E calculated in 0.01 seconds
lu_hdr_1500m_NE calculated in 0.01 

In [None]:
# Process variable values (clean + normalize)
iv_values_df = pd.read_csv(r'C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\iv_values3.csv')

# Filter out variables which are the mostly the same across all stations
def clean_consistent_variables(df, threshold):
    cols_to_remove = set()
    directional_suffixes = ['_E', '_NE', '_N', '_NW', '_W', '_SW', '_S', '_SE']
    
    directional_groups = {}
    non_directional_cols = []

    for col in df.columns:
        is_directional = False
        # Treat columns with '_cir' as non-directional.
        if '_cir' in col:
            non_directional_cols.append(col)
            continue

        for suffix in directional_suffixes:
            if col.endswith(suffix):
                base_name = col[:-len(suffix)]
                if base_name not in directional_groups:
                    directional_groups[base_name] = []
                directional_groups[base_name].append(col)
                is_directional = True
                break
        
        if not is_directional:
            non_directional_cols.append(col)

    # A group is removed only if ALL its columns meet the 80% criteria.
    for base_name, group_cols in directional_groups.items():
        all_in_group_meet_criteria = True
        for col in group_cols:
            if df[col].value_counts(normalize=True).max() < threshold:
                all_in_group_meet_criteria = False
                break
        # If all columns met the criteria, add the entire group to the removal list.
        if all_in_group_meet_criteria:
            cols_to_remove.update(group_cols)

    # Check circle buffers
    for col in non_directional_cols:
        if df[col].value_counts(normalize=True).max() >= threshold:
            cols_to_remove.add(col)

    # Remove the identified columns
    df_cleaned = df.drop(columns=list(cols_to_remove))
    return df_cleaned, sorted(list(cols_to_remove))

df_cleaned, removed_vars = clean_consistent_variables(iv_values_df, 0.8)
print(f'Removed {(len(iv_values_df.columns) - len(df_cleaned.columns))}/{len(iv_values_df.columns)} columns: {len(df_cleaned.columns)} remaining')


# Standardize variables and save information (OLD)
def standardize_variables(df):
    df_transformed = df.copy()
    variable_info_dict = {}
    for col in df.columns:
        # Skip id
        if col == 'id':
            continue
        # Parse column name
        parts = col.split('_')
        radius = None
        buffer_type = None
        base_name = None
        # Check if the name likely follows the convention
        if len(parts) >= 3 and parts[-2].endswith('m'):
            try:
                radius = int(parts[-2][:-1])
                buffer_type = parts[-1]
                base_name = '_'.join(parts[:-2])
            except ValueError:
                base_name = col 
                pass
        else:
            base_name = col

        log_transformed_col = np.log1p(df_transformed[col])
        mean_val = log_transformed_col.mean()
        std_val = log_transformed_col.std()
        # Standardize log-transform
        if std_val > 0:
            df_transformed[col] = (log_transformed_col - mean_val) / std_val
        else:
            # If std is 0, the standardized value is 0
            df_transformed[col] = 0

        variable_info_dict[col] = {
        'base_var_name': base_name,
        'radius': radius,
        'type': buffer_type,
        'mean': mean_val,
        'st_dev': std_val
        }
    return df_transformed, variable_info_dict


def standardize_variables(df, skew_threshold=0.75):
    df_transformed = df.copy()
    variable_info_dict = {}

    # --- Step 1: Identify which columns to log transform based on skewness ---
    # Calculate skewness only for numeric columns, excluding any ID columns
    numeric_cols = df.select_dtypes(include=np.number).columns.drop('id', errors='ignore')
    skewness = df[numeric_cols].skew()
    right_skewed_cols = skewness[skewness > skew_threshold].index.tolist()

    print(f"Found {len(right_skewed_cols)} / {len(numeric_cols)} columns to log transform based on skewness > {skew_threshold}")

    # --- Step 2: Iterate through columns to process and standardize them ---
    for col in df.columns:
        if col == 'id':
            continue

        # This is the data we will ultimately standardize
        data_to_process = df_transformed[col]
        was_log_transformed = False

        # --- Step 3: Selectively apply log transform ---
        if col in right_skewed_cols:
            data_to_process = np.log1p(data_to_process)
            was_log_transformed = True

        # --- Step 4: Calculate mean and std dev for standardization ---
        # These are calculated on the (potentially) log-transformed data
        mean_val = data_to_process.mean()
        std_val = data_to_process.std()

        # --- Step 5: Standardize the column ---
        if std_val > 0:
            df_transformed[col] = (data_to_process - mean_val) / std_val
        else:
            df_transformed[col] = 0 # Handle zero-variance columns

        # --- Step 6: Save the "recipe" for this variable ---
        # Your column name parsing logic is preserved here
        parts = col.split('_')
        radius, buffer_type, base_name = None, None, col
        if len(parts) >= 3 and parts[-2].endswith('m'):
            try:
                radius = int(parts[-2][:-1])
                buffer_type = parts[-1]
                base_name = '_'.join(parts[:-2])
            except ValueError:
                pass # base_name is already set to col

        variable_info_dict[col] = {
            'base_var_name': base_name,
            'radius': radius,
            'type': buffer_type,
            'mean': mean_val,        # The mean used for standardization
            'st_dev': std_val,      # The std dev used for standardization
            'was_log_transformed': was_log_transformed # NEW: Flag for log transform
        }

    return df_transformed, variable_info_dict

st_ivs, iv_info_dict = standardize_variables(df_cleaned)
st_ivs.to_csv(r'C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\iv_values_process4.csv', index=False)
output_json_path = r'C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\iv_info.json'
# Store info dict in json
try:
    with open(output_json_path, 'w') as f:
        json.dump(iv_info_dict, f, indent=4)
    print(f"Variable transformation info successfully saved to {output_json_path}")
except Exception as e:
    print(f"Error saving JSON file: {e}")

# Store info of removed variables into dict
def create_skip_config(removed_vars):
    skip_config = {}
    directional_suffixes = ['E', 'NE', 'N', 'NW', 'W', 'SW', 'S', 'SE']

    for var in removed_vars:
        parts = var.split('_')
        base_name = var
        buffer_type = None
        radius = None

        # Check for directional or circular buffer patterns
        if len(parts) > 2 and parts[-2].endswith('m'):
            try:
                radius = int(parts[-2][:-1])
                if parts[-1] in directional_suffixes:
                    buffer_type = 'sector'
                    base_name = '_'.join(parts[:-2])
                elif parts[-1] == 'cir':
                    buffer_type = 'circle'
                    base_name = '_'.join(parts[:-2])
            except ValueError:
                # Not a valid buffer format, treat as a single variable name
                pass

        if base_name not in skip_config:
            skip_config[base_name] = {}

        if buffer_type and radius is not None:
            if buffer_type not in skip_config[base_name]:
                skip_config[base_name][buffer_type] = []
            if radius not in skip_config[base_name][buffer_type]:
                skip_config[base_name][buffer_type].append(radius)
        else:
            # If no buffer type, it's a non-buffered var or a group to be skipped
            skip_config[base_name]['all'] = True      
    print("made skip config")
    return skip_config

skip_config = create_skip_config(removed_vars)



Removed 152/625 columns: 473 remaining
Found 219 / 472 columns to log transform based on skewness > 0.75
Variable transformation info successfully saved to C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\iv_info.json
made skip config


In [None]:
# Create precalculated independant variable maps

# Size
indep_vars = [
    {'name':'lu_hdr','file':f_landuse,'calculation':'buffer'},
    {'name':'lu_ldr','file':f_landuse,'calculation':'buffer'},
    {'name':'lu_ind','file':f_landuse,'calculation':'buffer'},
    {'name':'lu_ug','file':f_landuse,'calculation':'buffer'},
    {'name':'lu_art','file':f_landuse,'calculation':'buffer'},
    {'name':'lu_for','file':f_landuse,'calculation':'buffer'},
    {'name':'lu_rur','file':f_landuse,'calculation':'buffer'},
    {'name':'build_fp','file':f_buildheight,'calculation':'buffer'},
    {'name':'build_vol','file':f_buildheight,'calculation':'buffer'},
    {'name':'build_var','file':f_buildheight,'calculation':'buffer'},
    {'name':'ndvi','file':f_ndvi,'calculation':'buffer'},
    {'name':'pop','file':f_pop,'calculation':'buffer'},
    {'name':'rd_mt','file':f_motor_trunk,'calculation':'buffer'},
    {'name':'rd_prim','file':f_primary,'calculation':'buffer'},
    {'name':'rd_res','file':f_residential,'calculation':'buffer'},
    {'name':'rd_sec','file':f_secondary,'calculation':'buffer'},
    {'name':'rd_ter','file':f_tertiary,'calculation':'buffer'},
    {'name':'elevation','file':f_elevation,'calculation':'value'},
    {'name':'rd_dis','file':f_mtp,'calculation':'vector_dis'},
    {'name':'bus_dis','file':f_busstop,'calculation':'vector_dis'}
]

# Create the input points array of the map within the boundary
def create_input_points():
    # Read a precalculated raster to serve as format
    with rasterio.open(f_elevation_precalc) as src:
        # Read relevant data for raster
        raster_data = src.read(1)
        nodata_val = src.nodata
        transform = src.transform
        shape = src.shape
        crs = src.crs
        # Get indices of non-NaN values
        non_nan_indices = np.where(raster_data != nodata_val)
        rows, cols = non_nan_indices
        # Get coordinates using vectorized operation
        xs, ys = rasterio.transform.xy(transform, rows, cols)
        # Save raster metadata for writing
        raster_metadata = {
            "shape": shape,
            "nodata_val": nodata_val,
            "transform": transform,
            "crs": crs,
            "dtype": raster_data.dtype
        }
        # Convert to numpy arrays
        x_coords = np.array(xs)
        y_coords = np.array(ys)
        # Create array of coordinate tuples
        points_array = np.array([Point(x, y) for x, y in zip(x_coords, y_coords)])
    return points_array, non_nan_indices, raster_metadata 

def store_array(array, variable_info, raster_metadata):
    # Connect to database
    conn = None
    try:
        # Create an in-memory GeoTIFF file from the NumPy array
        with MemoryFile() as memfile:
            with memfile.open(
                driver='GTiff',
                height=array.shape[0],
                width=array.shape[1],
                count=1, 
                dtype=str(array.dtype),
                crs=raster_metadata['crs'],
                transform=raster_metadata['transform'],
                nodata=raster_metadata['nodata_val']
            ) as dataset:
                dataset.write(array, 1) 
            # Read the raw bytes from the in-memory file
            raster_bytes = memfile.read()

        # Connect to the database
        conn = psycopg2.connect(host=host, user=user, password=password, dbname=dbname)
        cur = conn.cursor()

        sql_insert = """
                    INSERT INTO precalc_ivs (indep_var, buffer_type, radius, raster_data)
                    VALUES (%s, %s, %s, ST_SetSRID(ST_FromGDALRaster(%s), %s));
                """
        data_to_insert = (
            variable_info['base_var_name'],
            variable_info['type'],
            variable_info['radius'],
            Binary(raster_bytes),
            7801  # Bulgarian SRID
        )
        
        cur.execute(sql_insert, data_to_insert)
        conn.commit()
    except (Exception, psycopg2.Error) as e:
        print(f"Database error: {e}")
        if conn:
            conn.rollback()
    finally:
        if conn:
            cur.close()
            conn.close()

# Calculate points and transform for pre-calculated maps
map_points, non_nan_indices, raster_metadata = create_input_points()
print(f'Mapping {len(map_points)} points')
# Maps for elevation and distance variables are pre-calculated (but not log)
map_vars = indep_vars[:-3]
# Initialize list to store completed calculations
success_list = []

# Iterate over each variable
for map_var in tqdm(map_vars):
    print(f'Computing maps for {map_var["name"]}...')
    # Run calculate points for the whole variable
    list_of_var_maps = calculate_points(map_var, map_points, success_list, skip_config=skip_config)
    # Iterate over the list saved in memory and save to postgres
    print(f'Maps Computed. Uploading to Postgres...')
    for var_map in list_of_var_maps:
        calc_name = var_map.name
        # Convert series to numpy array
        var_array = var_map.to_numpy()
        # Log transform and standardize maps using station values
        variable_info = iv_info_dict[calc_name]
        var_mean = variable_info['mean']
        var_std = variable_info['st_dev']
        log_t_array = np.log1p(var_array)
        # Apply the standardization using the stored mean and st_dev
        if var_std > 0:
            standardized_array = (log_t_array - var_mean) / var_std
        else:
            # Handle case with no variance
            standardized_array = log_t_array - var_mean
        # Reconstruct data to array shape
        final_raster = np.full(raster_metadata['shape'], raster_metadata['nodata_val'], dtype=np.float32)
        final_raster[non_nan_indices] = standardized_array
        # Save final raster in postgres
        store_array(final_raster, variable_info, raster_metadata)
    print('-' * 20)

# Save success list after all variables are calculated
with open(r'C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\success_list.txt', "w") as file:
    for item in success_list:
        file.write(str(item) + "\n")

Mapping 85591 points


'\n# Iterate over each variable\nfor map_var in tqdm(map_vars):\n    print(f\'Computing maps for {map_var["name"]}...\')\n    # Run calculate points for the whole variable\n    list_of_var_maps = calculate_points(map_var, map_points, success_list, skip_config=skip_config)\n    # Iterate over the list saved in memory and save to postgres\n    print(f\'Maps Computed. Uploading to Postgres...\')\n    for var_map in list_of_var_maps:\n        calc_name = var_map.name\n        # Convert series to numpy array\n        var_array = var_map.to_numpy()\n        # Log transform and standardize maps using station values\n        variable_info = iv_info_dict[calc_name]\n        var_mean = variable_info[\'mean\']\n        var_std = variable_info[\'st_dev\']\n        log_t_array = np.log1p(var_array)\n        # Apply the standardization using the stored mean and st_dev\n        if var_std > 0:\n            standardized_array = (log_t_array - var_mean) / var_std\n        else:\n            # Handle ca

In [69]:
# Upload precalculated variables to postgres
precalc_vars = [{'name':'elevation','file':f_elevation_precalc},
    {'name':'rd_dis','file':f_roaddis_precalc,},
    {'name':'bus_dis','file':f_busstop_precalc}]
# raster_metadata = from previous

def store_precalcs(array_file, name, raster_metadata):
    with rasterio.open(array_file) as src:
        # Read relevant data for raster
        array = src.read(1)
    
        # Connect to database
        conn = None
        try:
            # Create an in-memory GeoTIFF file from the NumPy array
            with MemoryFile() as memfile:
                with memfile.open(
                    driver='GTiff',
                    height=array.shape[0],
                    width=array.shape[1],
                    count=1, 
                    dtype=str(array.dtype),
                    crs=raster_metadata['crs'],
                    transform=raster_metadata['transform'],
                    nodata=raster_metadata['nodata_val']
                ) as dataset:
                    dataset.write(array, 1) 
                # Read the raw bytes from the in-memory file
                raster_bytes = memfile.read()
                print(array.shape[0],array.shape[1])
            # Connect to the database
            conn = psycopg2.connect(**DB_CONFIG)
            cur = conn.cursor()

            sql_insert = """
                        INSERT INTO precalc_ivs (indep_var, buffer_type, radius, raster_data)
                        VALUES (%s, %s, %s, ST_SetSRID(ST_FromGDALRaster(%s), %s));
                    """
            data_to_insert = (
                name,
                'cir', # Just give it cir to deal with enum
                0,
                Binary(raster_bytes),
                7801  # Bulgarian SRID
            )
            
            cur.execute(sql_insert, data_to_insert)
            conn.commit()
        except (Exception, psycopg2.Error) as e:
            print(f"Database error: {e}")
            if conn:
                conn.rollback()
        finally:
            if conn:
                cur.close()
                conn.close()

DB_CONFIG = {
    "host": "localhost",
    "dbname": "platform_db",
    "user": "postgres",
    "password": "postgres"
}

for pcv in precalc_vars:
    store_precalcs(pcv['file'], pcv['name'], raster_metadata)

354 376
354 376
354 376


In [27]:
# Read and convert success list to skip config if it isn't complete
success_list = []
with open(r'C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\success_list.txt', "r") as file:
    for line in file:
        success_list.append(line.strip())
if len(success_list) <= 471:
    print('Missed calculations')
    calculated_config = create_skip_config(success_list)
else:
    print(f'All {len(success_list)} calculations')

All 472 calculations


<center>Step 5: Set-up Data Platform Services<center>

In [None]:
# Set up servers to provide maps to terria
"""
cd C:\nginx
start nginx
"""
# Reset nginx (cmd admin)
"""
tasklist | findstr "nginx.exe"
taskkill /F /IM nginx.exe

"""
# Activate venv
"""
cd C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\.venv\Scripts
activate
"""
# Titiler
"""
cd C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\web_maps
set "PROJ_LIB=C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\.venv\Lib\site-packages\pyproj\proj_dir\share\proj" && uvicorn titiler_server:app --host 0.0.0.0 --port 8000
"""
# Catalog / tile proxy
"""
cd C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\web_maps
uvicorn catalog_fastapi:app --host 0.0.0.0 --port 8002 --reload
"""
# Postgres historical data
"""
cd C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method
uvicorn hist_data_api:app --host 0.0.0.0 --port 8003 --reload
"""

'<div class="tjs-legend__legendTitles" style="margin-top: 10px; text-align: center;">On average, the model predicts 11.91 µg/m³ away from actual values </div>'

In [11]:
# Upload geojson to sql server
import os
import json
import psycopg2
from collections import defaultdict
from psycopg2 import extras

def get_db_connection():
    """Establishes a connection to the PostgreSQL database."""
    try:
        # Get the database URL from environment variables, with a default fallback
        database_url = os.getenv("DATABASE_URL", "postgresql://postgres:postgres@localhost:5432/platform_db")
        conn = psycopg2.connect(database_url)
        print("✅ Database connection successful.")
        return conn
    except psycopg2.OperationalError as e:
        print(f"❌ Could not connect to the database: {e}")
        return None

def upload_geojson_features(conn, file_path, region_type):
    print(f"\nProcessing file: {file_path} as type '{region_type}'...")

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            geojson_data = json.load(f)
    except Exception as e:
        print(f"❌ Error reading or parsing file {file_path}: {e}")
        return

    features = geojson_data.get('features')
    if not features:
        print(f"⚠️ Warning: No 'features' array found in {file_path}. Skipping.")
        return

    # --- NEW LOGIC: Use a dictionary to group geometries by name ---
    # defaultdict makes it easy to append to lists for new keys
    geometries_by_name = defaultdict(list)

    for feature in features:
        properties = feature.get('properties', {})
        geometry = feature.get('geometry')
        region_name = properties.get('name_en')

        if not all([region_name, geometry]):
            print(f"⚠️ Warning: Skipping a feature due to missing 'name_en' or 'geometry'.")
            continue
        
        geom_type = geometry.get('type')
        
        if geom_type == 'Polygon':
            geometries_by_name[region_name].append(json.dumps(geometry))
            
        elif geom_type == 'MultiPolygon':
            # Explode the MultiPolygon into its constituent Polygons
            for polygon_coords in geometry['coordinates']:
                new_polygon_geom = {"type": "Polygon", "coordinates": polygon_coords}
                geometries_by_name[region_name].append(json.dumps(new_polygon_geom))
    
    if not geometries_by_name:
        print("No valid features found to upload.")
        return

    # --- Prepare final data for a single upsert per region_name ---
    upload_data = []
    for name, geom_list in geometries_by_name.items():
        # We will pass the region name, type, and the LIST of GeoJSON strings
        upload_data.append((name, region_type, geom_list))

    # --- MODIFIED SQL QUERY AND TEMPLATE ---
    with conn.cursor() as cur:
        # This query uses ST_Collect to combine an array of geometries into one MultiPolygon
        # It is robust and handles cases with one or many polygons perfectly.
        query = """
            INSERT INTO analysis_regions (region_name, region_type, geom)
            VALUES %s
            ON CONFLICT (region_name) DO UPDATE SET
                geom = EXCLUDED.geom,
                region_type = EXCLUDED.region_type;
        """
        
        # The template now builds the geometry from an array of GeoJSON strings (%s)
        template = """
            (%s, %s, ST_SetSRID(
                ST_CollectionExtract(
                    ST_Collect(
                        ARRAY(SELECT ST_GeomFromGeoJSON(g) FROM unnest(%s) as g)
                    ), 3
                ), 4326)
            )
        """
        # ST_Collect gathers geometries. ST_CollectionExtract ensures the result is MultiPolygon (type 3).

        try:
            extras.execute_values(
                cur,
                query,
                upload_data,
                template=template,
                page_size=100
            )
            conn.commit()
            # The count now refers to unique regions, not individual polygons
            print(f"✅ Successfully uploaded/updated {len(upload_data)} regions from {file_path}.")
        except Exception as e:
            print(f"❌ Database error during upload: {e}")
            conn.rollback()

def main():
    """
    Main function to define files and orchestrate the upload process.
    """
    # --- IMPORTANT ---
    # Update this list with the paths to your GeoJSON files and their corresponding types.
    files_to_process = [
        {'path': r'C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Spatial_Data\postgis\districts.geojson', 'type': 'district'},
        {'path': r'C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Spatial_Data\postgis\lez.geojson', 'type': 'lez'}
    ]

    conn = get_db_connection()
    if conn:
        for item in files_to_process:
            upload_geojson_features(conn, item['path'], item['type'])
        conn.close()
        print("\nAll files processed. Database connection closed.")

if __name__ == "__main__":
    main()


✅ Database connection successful.

Processing file: C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Spatial_Data\postgis\districts.geojson as type 'district'...
✅ Successfully uploaded/updated 22 regions from C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Spatial_Data\postgis\districts.geojson.

Processing file: C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Spatial_Data\postgis\lez.geojson as type 'lez'...
✅ Successfully uploaded/updated 1 regions from C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Spatial_Data\postgis\lez.geojson.

All files processed. Database connection closed.


In [3]:
# Read raster files and upload to postgres

DB_CONFIG = {
    "dbname": "platform_db",
    "user": "postgres",
    "password": "postgres",
    "host": "localhost",
    "port": "5432"
}

# Read master raster to get metadata
with rasterio.open(f_elevation_precalc) as src:
    # Read relevant data for raster
    raster_data = src.read(1)
    nodata_val = src.nodata
    transform = src.transform
    shape = src.shape
    crs = src.crs
    # Get indices of non-NaN values
    non_nan_indices = np.where(raster_data != nodata_val)
    rows, cols = non_nan_indices
    # Get coordinates using vectorized operation
    xs, ys = rasterio.transform.xy(transform, rows, cols)
    # Save raster metadata for writing
    raster_metadata = {
        "shape": shape,
        "nodata_val": nodata_val,
        "transform": transform,
        "crs": crs,
        "dtype": raster_data.dtype
    }

def store_array(array, variable_info, raster_metadata):
    # Connect to database
    conn = None
    try:
        # Create an in-memory GeoTIFF file from the NumPy array
        with MemoryFile() as memfile:
            with memfile.open(
                driver='GTiff',
                height=array.shape[0],
                width=array.shape[1],
                count=1, 
                dtype=str(array.dtype),
                crs=raster_metadata['crs'],
                transform=raster_metadata['transform'],
                nodata=raster_metadata['nodata_val']
            ) as dataset:
                dataset.write(array, 1) 
            # Read the raw bytes from the in-memory file
            raster_bytes = memfile.read()

        # Connect to the database
        conn = psycopg2.connect(**DB_CONFIG)
        cur = conn.cursor()

        sql_insert = """
                    INSERT INTO precalc_ivs (indep_var, buffer_type, radius, raster_data)
                    VALUES (%s, %s, %s, ST_SetSRID(ST_FromGDALRaster(%s), %s));
                """
        data_to_insert = (
            variable_info['base_var_name'],
            variable_info['type'],
            variable_info['radius'],
            Binary(raster_bytes),
            7801
        )
        cur.execute(sql_insert, data_to_insert)
        conn.commit()
    except (Exception, psycopg2.Error) as e:
        print(f"Database error: {e}")
        if conn:
            conn.rollback()
    finally:
        if conn:
            cur.close()
            conn.close()

raster_folder = r"C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\export3"
for filename in os.listdir(raster_folder):
    file_path = os.path.join(raster_folder, filename)
    # Get raster
    with rasterio.open(file_path) as src:
        array = src.read(1)
    
    # Parse filenames
    file_basename = os.path.basename(file_path)
    no_ex = os.path.splitext(file_basename)[0]
    parts = no_ex.split('_')
    # Set values for 3 precalcs
    radius, buffer_type, base_name = 0, 'cir', no_ex
    # Set values for others
    if len(parts) >= 3 and parts[-2].endswith('m'):
        radius = int(parts[-2][:-1])
        buffer_type = parts[-1]
        base_name = '_'.join(parts[:-2])
    file_dict = {'base_var_name':base_name, 'type':buffer_type, 'radius':radius}
    print(file_dict)

    store_array(array, file_dict, raster_metadata)
    

{'base_var_name': 'build_fp', 'type': 'cir', 'radius': 1500}
{'base_var_name': 'build_fp', 'type': 'cir', 'radius': 25}
{'base_var_name': 'build_var', 'type': 'cir', 'radius': 25}
{'base_var_name': 'build_vol', 'type': 'cir', 'radius': 100}
{'base_var_name': 'build_vol', 'type': 'cir', 'radius': 500}
{'base_var_name': 'lu_hdr', 'type': 'cir', 'radius': 1500}
{'base_var_name': 'lu_ind', 'type': 'cir', 'radius': 500}
{'base_var_name': 'lu_ldr', 'type': 'cir', 'radius': 1500}
{'base_var_name': 'lu_rur', 'type': 'cir', 'radius': 1500}
{'base_var_name': 'lu_ug', 'type': 'cir', 'radius': 500}
{'base_var_name': 'pop', 'type': 'cir', 'radius': 500}
{'base_var_name': 'rd_mt', 'type': 'cir', 'radius': 1500}
{'base_var_name': 'rd_mt', 'type': 'cir', 'radius': 4000}
{'base_var_name': 'rd_prim', 'type': 'cir', 'radius': 1500}
{'base_var_name': 'rd_res', 'type': 'cir', 'radius': 1500}
{'base_var_name': 'rd_sec', 'type': 'cir', 'radius': 4000}
{'base_var_name': 'rd_ter', 'type': 'cir', 'radius': 1500

In [None]:
# Calculate step 1 rasters, save results and array as npy

import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import re
from collections import defaultdict
import psycopg2
import rasterio
from rasterio.io import MemoryFile
import os
import json

# ==============================================================================
# 0. CONFIGURATION
# ==============================================================================
# --- User Settings ---
POLLUTANT = 'NO2'
START_DATE = '2022-09-01'
END_DATE = '2025-06-01'
LOG_TRANSFORM_Y = True  # Apply log(y) transformation for better model fit
LOG_CONSTANT = 1e-9 # Small consprepare_dependent_variabletant to avoid log(0)
MIN_MODEL_VARS = 4 # Minimum number of variables to keep during backward elimination

# --- File Paths ---
SENSOR_DATA_PATH = r"C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\long_sensor_measurements5.csv"
IV_DATA_PATH = r"C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\iv_values_process4.csv"
OUTPUT_DIR = r"C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\stage1"
WIND_FREQ_DIR = os.path.join(OUTPUT_DIR, 'wind_frequency')



# 1: Positive correlation expected, -1: Negative correlation expected, 0: No constraint
SIGN_CONSTRAINTS = {
        'O3': {'lu_hdr':-1, 'lu_ldr':-1, 'lu_ind':-1, 'lu_ug':1, 'lu_art':-1, 'lu_for':1, 'lu_rur':1, 'build_fp': 0, 'build_vol':0, 'build_var':0, 'ndvi':1, 'elevation':0, 'pop':0, 'rd_mt':-1, 'rd_prim':-1, 'rd_res':-1, 'rd_sec':-1, 'rd_ter':-1, 'rd_dis':1, 'bus_dis':0},
        'NO2':{'lu_hdr':1, 'lu_ldr':1, 'lu_ind':1, 'lu_ug':-1, 'lu_art':1, 'lu_for':-1, 'lu_rur':-1, 'build_fp': 1, 'build_vol':1, 'build_var':-1, 'ndvi':-1, 'elevation':-1, 'pop':1, 'rd_mt':1, 'rd_prim':1, 'rd_res':1, 'rd_sec':1, 'rd_ter':1, 'rd_dis':-1, 'bus_dis':-1},
        'PM': {'lu_hdr':1, 'lu_ldr':1, 'lu_ind':1, 'lu_ug':-1, 'lu_art':1, 'lu_for':-1, 'lu_rur':-1, 'build_fp': 1, 'build_vol':1, 'build_var':-1, 'ndvi':-1, 'elevation':-1, 'pop':1, 'rd_mt':1, 'rd_prim':1, 'rd_res':1, 'rd_sec':1, 'rd_ter':1, 'rd_dis':-1, 'bus_dis':-1},
        'SO2':{'lu_hdr':0, 'lu_ldr':0, 'lu_ind':1, 'lu_ug':-1, 'lu_art':0, 'lu_for':-1, 'lu_rur':-1, 'build_fp': 1, 'build_vol':1, 'build_var':-1, 'ndvi':-1, 'elevation':0, 'pop':1, 'rd_mt':1, 'rd_prim':1, 'rd_res':1, 'rd_sec':1, 'rd_ter':1, 'rd_dis':-1, 'bus_dis':-1}
    }
# --- Database Connection ---
# Replace with your actual PostgreSQL connection details
DB_CONFIG = {
    "dbname": "platform_db",
    "user": "postgres",
    "password": "postgres",
    "host": "localhost",
    "port": "5432"
}

# ==============================================================================
# 1. HELPER & DATABASE FUNCTIONS
# ==============================================================================

def get_wind_direction_sector(degrees):
    """Convert wind direction in degrees to a sub-cardinal category."""
    if pd.isna(degrees): return "Unknown"
    degrees = degrees % 360
    if 337.5 <= degrees or degrees < 22.5: return 'N'
    if 22.5 <= degrees < 67.5: return 'NE'
    if 67.5 <= degrees < 112.5: return 'E'
    if 112.5 <= degrees < 157.5: return 'SE'
    if 157.5 <= degrees < 202.5: return 'S'
    if 202.5 <= degrees < 247.5: return 'SW'
    if 247.5 <= degrees < 292.5: return 'W'
    if 292.5 <= degrees < 337.5: return 'NW'
    return "Unknown"

def create_iv_selection_map(df):
    """Reads IV dataframe columns and creates a selection map for efficient querying."""
    column_lookup = defaultdict(dict)
    col_pattern = re.compile(r"^(.*)_(\d+)m_(\w+)$")
    for col in [c for c in df.columns if c != 'id']:
        match = col_pattern.match(col)
        if match:
            base_var, radius, buffer_type = match.groups()
            column_lookup[(base_var, radius)][buffer_type] = col
    return {'column_lookup': dict(column_lookup)}

def get_db_connection():
    """Establishes and returns a psycopg2 database connection."""
    try:
        conn = psycopg2.connect(**DB_CONFIG)
        print("Successfully connected to PostgreSQL database.")
        return conn
    except psycopg2.OperationalError as e:
        print(f"🔴 Error: Could not connect to the database. {e}")
        return None

def fetch_raster_from_db(conn, var_name, radius, sector):

    query = """
    SELECT ST_AsGDALRaster(raster_data, 'GTiff')
    FROM precalc_ivs
    WHERE indep_var = %s AND radius = %s AND buffer_type = %s;
    """
    with conn.cursor() as curs:
        curs.execute(query, (var_name, radius, sector))
        result = curs.fetchone()
        if result and result[0] is not None:
            # The result is a memoryview (bytes), which MemoryFile can open
            with MemoryFile(result[0]) as memfile:
                with memfile.open() as dataset:
                    return dataset.read(1), dataset.profile # Return numpy array and profile
    return None, None # Return None if raster not found

# ==============================================================================
# 2. WIND & DATA PREPARATION
# ==============================================================================

def calculate_hourly_average_wind(df):
    """
    Calculates the circular mean of wind direction for each hour across all sensors.
    Returns a Series of hourly average wind directions.
    """
    print("Calculating hourly average wind direction across all sensors...")
    wd_df = df[df['measurement'] == 'WD'].copy()
    wd_df.dropna(subset=['reading_value'], inplace=True)
    
    # Convert degrees to radians for circular mean calculation
    wd_df['radians'] = np.deg2rad(wd_df['reading_value'])
    wd_df['sin_rad'] = np.sin(wd_df['radians'])
    wd_df['cos_rad'] = np.cos(wd_df['radians'])
    
    # Group by hour and calculate sum of sin/cos components
    hourly_components = wd_df.groupby(wd_df['measured_time'].dt.floor('h'))[['sin_rad', 'cos_rad']].sum()
    
    # Calculate circular mean in radians, then convert back to degrees
    circ_mean_rad = np.arctan2(hourly_components['sin_rad'], hourly_components['cos_rad'])
    circ_mean_deg = np.rad2deg(circ_mean_rad)
    
    # Normalize to 0-360 range
    avg_hourly_wind_deg = (circ_mean_deg + 360) % 360
    avg_hourly_wind_deg.name = 'avg_WD'
    
    return avg_hourly_wind_deg

def calculate_yearly_wind_distribution(avg_hourly_wind):
    """
    NEW: Calculates the overall wind frequency distribution for the entire year.
    Returns a dictionary of sector percentages (e.g., {'N': 0.12, ...}).
    """
    print("\nCalculating yearly wind frequency distribution...")
    sector_counts = avg_hourly_wind.apply(get_wind_direction_sector).value_counts()
    distribution = (sector_counts / sector_counts.sum()).to_dict()
    
    # Ensure all sectors are present
    for sector in ['N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW']:
        if sector not in distribution:
            distribution[sector] = 0.0

    print("Yearly Distribution (%):")
    for sector, freq in distribution.items():
        print(f"  - {sector}: {freq:.2%}")
        
    return distribution

def prepare_dependent_variable(sensor_df, pollutant, start_date, end_date):
    """
    Stage 1: Processes hourly sensor data, applying the network-wide average
    wind direction to create the dependent variable.
    """
    print(f"\n--- Stage 1: Preparing Dependent Variable ({pollutant}) ---")
    df = sensor_df.copy()
    df['measured_time'] = pd.to_datetime(df['measured_time'])
    df = df[(df['measured_time'] >= start_date) & (df['measured_time'] < end_date)]
    
    # Calculate the single average wind direction for each hour
    avg_hourly_wind = calculate_hourly_average_wind(df)
    
    # Pivot pollutant data and merge with the average hourly wind
    pivot_df = df.pivot_table(index=['measured_time', 'station_id'], columns='measurement', values='reading_value').reset_index()
    pivot_df['hour'] = pivot_df['measured_time'].dt.floor('h')
    
    merged_df = pd.merge(pivot_df, avg_hourly_wind, left_on='hour', right_index=True)
    merged_df.dropna(subset=[pollutant, 'avg_WD'], inplace=True)
    
    # Assign each measurement to a wind sector based on the average wind
    merged_df['WD_Sector'] = merged_df['avg_WD'].apply(get_wind_direction_sector)
    merged_df = merged_df[merged_df['WD_Sector'] != 'Unknown']
    
    sector_averages = merged_df.groupby(['station_id', 'WD_Sector'])[pollutant].mean().reset_index()
    sector_averages.rename(columns={pollutant: f'avg_{pollutant}'}, inplace=True)
    
    print(f"Generated {len(sector_averages)} station-sector training data points.")
    return sector_averages, avg_hourly_wind

def prepare_training_dataset(sector_averages_df, all_station_ivs_df):
    """
    Stage 2: Joins dependent variables with independent variables.
    This version is rewritten to prevent data duplication and correctly handle all variable types.
    """
    print("\n--- Stage 2: Preparing Full Training Dataset ---")

    # Use station ID as the index for fast lookups
    ivs_indexed = all_station_ivs_df.set_index('id')
    
    # Pre-parse all available IV column names into a more useful structure
    all_iv_cols = {col: re.match(r"^(.*)_(\d+)m_(\w+)$", col) for col in ivs_indexed.columns}
    
    # This list will hold a dictionary for each row of the final DataFrame
    training_data_rows = []

    for _, row in sector_averages_df.iterrows():
        station_id = row['station_id']
        sector = row['WD_Sector']
        
        # Start building the row for our new DataFrame
        new_row = {'station_id': station_id, 'WD_Sector': sector, f"avg_{POLLUTANT}": row[f"avg_{POLLUTANT}"]}

        # Get all IVs for the current station
        station_ivs = ivs_indexed.loc[station_id]

        # A dictionary to hold the generic IVs for this station-sector combo
        generic_ivs = {}

        # First, process buffered variables
        for col, match in all_iv_cols.items():
            if match:
                base_var, radius, buffer_type = match.groups()
                generic_name = f"{base_var}_{radius}m"
                # If this column matches the current sector, add its value
                if buffer_type == sector:
                    generic_ivs[generic_name] = station_ivs[col]
        
        # Now, fill in any missing values with the 'cir' version as a fallback
        for col, match in all_iv_cols.items():
            if match:
                base_var, radius, buffer_type = match.groups()
                generic_name = f"{base_var}_{radius}m"
                if generic_name not in generic_ivs and buffer_type == 'cir':
                    generic_ivs[generic_name] = station_ivs[col]

        # Add non-buffered variables like 'elevation', 'rd_dis', 'bus_dis'
        # These are stored with buffer_type 'cir' but have no radius in their generic name
        non_buffer_vars = [c for c in ivs_indexed.columns if all_iv_cols.get(c) is None]
        for var in non_buffer_vars:
            generic_ivs[var] = station_ivs[var]
            
        new_row.update(generic_ivs)
        training_data_rows.append(new_row)

    full_training_df = pd.DataFrame(training_data_rows)
    print(f"Successfully joined IVs for {len(full_training_df)} data points.")
    return full_training_df

# ==============================================================================
# 3. MODEL TRAINING
# ==============================================================================

def model_calibration(df, dv_name, a_priori_signs, min_vars=3):
    """
    Stage 3: Performs supervised stepwise model calibration.
    Now includes a `min_vars` parameter to control backward elimination.
    """
    print("\n--- Stage 3: Training LUR Model ---")
    # (Forward selection part is identical to the previous version)
    iv_names = [col for col in df.columns if col not in [dv_name, 'id', 'station_id', 'WD_Sector']]
    df_clean = df.dropna(subset=[dv_name] + iv_names).copy()
    if len(df_clean) < min_vars + 2:
        print("Not enough data for model calibration."); return None
    y, X_all = df_clean[dv_name], df_clean[iv_names]
    def get_X_with_const(X_df): return sm.add_constant(X_df, has_constant='add')
    def parse_base_var(col_name):
        parts = col_name.split('_'); return '_'.join(parts[:-1]) if len(parts) > 1 and parts[-1].endswith('m') else col_name
    def check_signs(coeffs, a_priori):
        for var, coeff in coeffs.drop('const', errors='ignore').items():
            if (a_priori.get(parse_base_var(var), 0) == 1 and coeff < 0) or \
               (a_priori.get(parse_base_var(var), 0) == -1 and coeff > 0): return False
        return True
    best_initial_var, max_r2 = None, -1.0
    for iv in iv_names:
        model = sm.OLS(y, get_X_with_const(X_all[[iv]])).fit()
        if check_signs(model.params, a_priori_signs) and model.rsquared > max_r2: max_r2, best_initial_var = model.rsquared, iv
    if not best_initial_var: print("No suitable starting variable found."); return None
    included_vars, current_r2 = [best_initial_var], max_r2
    while True:
        best_new_var, best_r2_increase = None, 0.0
        for iv in [v for v in iv_names if v not in included_vars]:
            model = sm.OLS(y, get_X_with_const(X_all[included_vars + [iv]])).fit()
            r2_increase = model.rsquared - current_r2
            if r2_increase >= 0.01 and check_signs(model.params, a_priori_signs) and r2_increase > best_r2_increase:
                best_r2_increase, best_new_var = r2_increase, iv
        if best_new_var: included_vars.append(best_new_var); current_r2 += best_r2_increase
        else: break
    
    # Backward elimination, now respecting min_vars
    final_vars = included_vars.copy()
    # This loop now stops if the number of variables reaches the minimum threshold
    while len(final_vars) > min_vars:
        X_final_const = get_X_with_const(X_all[final_vars])
        pvalues = sm.OLS(y, X_final_const).fit().pvalues.drop('const', errors='ignore')
        vif = pd.Series([variance_inflation_factor(X_final_const.values, i) for i in range(X_final_const.shape[1])], index=X_final_const.columns)
        
        if vif.drop('const', errors='ignore').max() > 10:
            final_vars.remove(vif.drop('const', errors='ignore').idxmax())
            continue
        if pvalues.max() > 0.05:
            final_vars.remove(pvalues.idxmax())
            continue
        break
        
    final_model = sm.OLS(y, get_X_with_const(X_all[final_vars])).fit()
    print("\n--- Final Model Summary ---")
    print(final_model.summary())
    return final_model

# ==============================================================================
# 4. MAP GENERATION
# ==============================================================================

def calculate_final_map(final_model, conn, wind_distribution, LOG_TRANSFORM_Y):
    """
    Stage 4: Fetches rasters, manually creates masks to handle NoData values,
    performs algebra, and saves the final map.
    """
    print("\n--- Stage 4: Calculating Final Map ---")
    if final_model is None:
        print("Model training failed, cannot generate map."); return
        
    coeffs = final_model.params.to_dict()
    intercept = coeffs.pop('const', 0.0)
    
    print("Fetching template raster for map profile...")
    template_arr, template_profile = fetch_raster_from_db(conn, 'elevation', 0, 'cir')
    if template_profile is None:
        print("Could not fetch a template raster to define map extent."); return
    
    # --- NEW LOGIC: Manually create the mask from the NoData value ---
    nodata_value = template_profile.get('nodata')
    
    if nodata_value is not None:
        # Create a boolean mask where True means the pixel is NoData
        initial_mask = (template_arr == nodata_value)
    else:
        # If no nodata value is defined, assume all data is valid
        initial_mask = np.zeros_like(template_arr, dtype=bool)

    # Initialize the final map as a masked array using the created mask
    final_map_array = np.ma.array(
        np.full((template_profile['height'], template_profile['width']), intercept, dtype=np.float32),
        mask=initial_mask
    )
    
    for var_name, coeff in coeffs.items():
        print(f"Processing variable: {var_name}")
        
        # Initialize the master predictor array for this variable, also masked
        master_predictor_array = np.ma.array(
            np.zeros((template_profile['height'], template_profile['width']), dtype=np.float32),
            mask=initial_mask
        )
        
        match = re.match(r"^(.*)_(\d+)m$", var_name)
        if match:
            # Buffered variable
            base, radius = match.groups()
            for sector, frequency in wind_distribution.items():
                if frequency == 0: continue
                var_arr, _ = fetch_raster_from_db(conn, base, radius, sector)
                if var_arr is None: var_arr, _ = fetch_raster_from_db(conn, base, radius, 'cir')
                if var_arr is None:
                    print(f"  - WARNING: Could not find raster for {var_name}. Skipping term for sector {sector}.")
                    continue
                # Masked array arithmetic automatically handles NoData alignment
                master_predictor_array += (var_arr * frequency)
        else:
            # Non-buffered variable
            var_arr, _ = fetch_raster_from_db(conn, var_name, 0, 'cir')
            if var_arr is None:
                print(f"  - WARNING: Could not find non-buffered raster for {var_name}. Skipping variable.")
                continue
            master_predictor_array = var_arr

        final_map_array += (master_predictor_array * coeff)

    if LOG_TRANSFORM_Y:
        print("Exponentiating final map and subtracting LOG_CONSTANT...")
        final_map_array = np.ma.exp(final_map_array) - LOG_CONSTANT
        
    # Fill the masked values with the NoData value before saving
    output_array = final_map_array.filled(nodata_value)

    # --- START of NEW code to save array to disk ---
    #csv_map_path = os.path.join(OUTPUT_DIR, f'LUR_MAP_{POLLUTANT}.csv')
    npy_map_path = os.path.join(OUTPUT_DIR, f'LUR_MAP_{POLLUTANT}.npy')
    FINAL_MAP_PATH = os.path.join(OUTPUT_DIR, f'LUR_MAP_{POLLUTANT}.tif')

    # Save the final map array as a NumPy .npy file (Recommended)
    print(f"Saving final map array to '{npy_map_path}'...")
    np.save(npy_map_path, output_array)

    print(f"Saving final map to '{FINAL_MAP_PATH}'...")
    profile = template_profile.copy()
    profile.update({
        'driver': 'COG',
        'dtype': 'float32',
        'nodata': nodata_value,
        'compress': 'LZW',  # Use a lossless compression
        'tiled': True,       # COGs must be tiled
        'blockxsize': 256,   # Tile size
        'blockysize': 256
    })

    with rasterio.open(FINAL_MAP_PATH, 'w', **profile) as dst:
        from rasterio.enums import Resampling
        dst.write(output_array.astype(profile['dtype']), 1)
        # Build overviews (pyramids) for faster rendering at different zoom levels
        factors = [2, 4, 8, 16] 
        dst.build_overviews(factors, Resampling.average)

def save_model_results_to_json(final_model, pollutant_name):
    """
    Saves the key model results (R-squared, variables, coefficients) to a JSON file.
    """
    if final_model is None:
        print("No model results to save.")
        return

    print("\n--- Stage 5: Saving Model Results to JSON ---")
    
    # Define the output path
    results_path = os.path.join(OUTPUT_DIR, f'LUR_MODEL_RESULTS_{pollutant_name}.json')

    # Extract the required information from the model object
    params = final_model.params.to_dict()
    
    results = {
        "pollutant": pollutant_name,
        "adjusted_r_squared": final_model.rsquared_adj,
        "intercept": params.pop('const', None),
        "coefficients": params
    }

    # Write the dictionary to a JSON file
    try:
        with open(results_path, 'w') as f:
            json.dump(results, f, indent=4)
        print(f"Successfully saved model results to '{results_path}'")
    except Exception as e:
        print(f"🔴 Error saving results to JSON: {e}")

def save_station_predictions(model, training_data, dv_name, pollutant, output_dir, log_transform, log_constant):
    """
    NEW: Uses the final model to predict values for the training data points
    and saves the observed vs. predicted values to a CSV file.
    """
    if model is None:
        print("No model available, skipping prediction saving.")
        return

    print("\n--- Saving Station Predictions ---")

    # Isolate the predictor variables used in the final model
    predictor_vars = [v for v in model.params.index if v != 'const']
    X_predict = training_data[predictor_vars]
    X_predict = sm.add_constant(X_predict, has_constant='add')

    # Generate predictions on the transformed scale
    predictions_transformed = model.predict(X_predict)

    # Create a results DataFrame
    results_df = training_data[['station_id', 'WD_Sector']].copy()
    results_df['observed_transformed'] = training_data[dv_name]
    results_df['predicted_transformed'] = predictions_transformed

    # Back-transform if necessary to get values in original units
    if log_transform:
        results_df['observed_value'] = np.exp(results_df['observed_transformed']) - log_constant
        results_df['predicted_value'] = np.exp(results_df['predicted_transformed']) - log_constant
    else:
        results_df['observed_value'] = results_df['observed_transformed']
        results_df['predicted_value'] = results_df['predicted_transformed']

    # Select and rename final columns for clarity
    output_df = results_df[['station_id', 'WD_Sector', 'observed_value', 'predicted_value']].copy()
    output_df.rename(columns={
        'observed_value': f'observed_{pollutant}',
        'predicted_value': f'predicted_{pollutant}'
    }, inplace=True)

    # Save to CSV
    output_path = os.path.join(output_dir, f'STATION_PREDICTIONS_{pollutant}.csv')
    try:
        output_df.to_csv(output_path, index=False)
        print(f"Successfully saved station predictions to '{output_path}'")
    except Exception as e:
        print(f"🔴 Error saving station predictions: {e}")

# ==============================================================================
# 5. SCRIPT EXECUTION
# ==============================================================================

def main():
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print("🚀 Starting Yearly LUR Model & Map Generation...")
    
    conn = get_db_connection()
    if not conn: return

    try:
        sensor_df = pd.read_csv(SENSOR_DATA_PATH)
        iv_names_df = pd.read_csv(IV_DATA_PATH)
        #precomputed_maps = create_iv_selection_map(iv_names_df)

        # Stage 1 now also returns the hourly wind data
        dv_df, avg_hourly_wind = prepare_dependent_variable(sensor_df, POLLUTANT, START_DATE, END_DATE)
        
        # Calculate the single wind distribution for the whole year
        wind_distribution = calculate_yearly_wind_distribution(avg_hourly_wind)
        
        training_df = prepare_training_dataset(dv_df, iv_names_df)
        
        dv_col_name = f'avg_{POLLUTANT}'
        if LOG_TRANSFORM_Y:
            training_df[dv_col_name] = np.log(training_df[dv_col_name] + LOG_CONSTANT)
        
        final_model = model_calibration(
            df=training_df,
            dv_name=dv_col_name,
            a_priori_signs=SIGN_CONSTRAINTS.get(POLLUTANT, {}),
            min_vars=MIN_MODEL_VARS
        )
        
        #save_model_results_to_json(final_model, POLLUTANT)
        # Pass the calculated distribution to the mapping function
        calculate_final_map(final_model, conn, wind_distribution, LOG_TRANSFORM_Y=rue)
        
        save_station_predictions(
            model=final_model,
            training_data=training_df,
            dv_name=dv_col_name,
            pollutant=POLLUTANT,
            output_dir=OUTPUT_DIR,
            log_transform=LOG_TRANSFORM_Y,
            log_constant=LOG_CONSTANT
        )

        print("\n✅ LUR process completed successfully!")

    finally:
        if conn:
            conn.close()
            print("\nDatabase connection closed.")

if __name__ == '__main__':
    main()

In [5]:
# setup config with only files that I need

import json
"""
def create_raster_config(total_vars, skip_config):
    vars_to_calculate = {}

    # --- 1. Identify unique variables and their buffer types ---
    unique_keys = set(total_vars.keys())

    for key in unique_keys:
        parts = key.split('_')
        
        # Handle variables with no radius (e.g., 'bus_dis', 'elevation')
        if len(parts) < 2 or not parts[-1][:-1].isdigit():
            var_name = key
            if var_name not in vars_to_calculate:
                vars_to_calculate[var_name] = {"all": True}
            continue

        # Extract variable name and radius
        radius_str = parts[-1][:-1]
        var_name = "_".join(parts[:-1])
        radius = int(radius_str)

        # --- 2. Apply rules to determine buffer type (sector vs. circle) ---
        buffer_type = 'circle' # Default to circle
        if var_name == 'pop':
            if radius >= 500:
                buffer_type = 'sector'
        elif radius >= 100:
            buffer_type = 'sector'

        # --- 3. Populate the dictionary of variables to be calculated ---
        if var_name not in vars_to_calculate:
            vars_to_calculate[var_name] = {'circle': [], 'sector': []}
        
        if radius not in vars_to_calculate[var_name][buffer_type]:
            vars_to_calculate[var_name][buffer_type].append(radius)

    # Clean up empty lists from the calculation dictionary
    for var in list(vars_to_calculate.keys()):
        if isinstance(vars_to_calculate[var], dict):
            if 'circle' in vars_to_calculate[var] and not vars_to_calculate[var]['circle']:
                del vars_to_calculate[var]['circle']
            if 'sector' in vars_to_calculate[var] and not vars_to_calculate[var]['sector']:
                del vars_to_calculate[var]['sector']
        if not vars_to_calculate[var]:
             del vars_to_calculate[var]


    print("--- Variables that will be calculated ---")
    print(json.dumps(vars_to_calculate, indent=4))
    print("\n" + "="*40 + "\n")


    # --- 4. Update the skip_config by removing the variables we want to calculate ---
    new_skip_config = json.loads(json.dumps(skip_config)) # Deep copy

    for var, details in vars_to_calculate.items():
        if "all" in details and details["all"]:
             if var in new_skip_config:
                del new_skip_config[var]
        elif var in new_skip_config:
            if 'circle' in details:
                for radius in details['circle']:
                    if 'circle' in new_skip_config[var] and radius in new_skip_config[var]['circle']:
                        new_skip_config[var]['circle'].remove(radius)
            if 'sector' in details:
                 for radius in details['sector']:
                    if 'sector' in new_skip_config[var] and radius in new_skip_config[var]['sector']:
                        new_skip_config[var]['sector'].remove(radius)
    
    return new_skip_config
"""
    
def create_raster_config(total_vars, skip_config):
    """
    Processes a dictionary of variables to determine which rasters to create
    and updates a skip configuration dictionary accordingly. This version
    expects the input dictionary to be pre-sorted by buffer type ('cir'/'dir').

    Args:
        total_vars (dict): A dictionary with buffer types as keys ('cir', 'dir')
                           and nested dictionaries of variables.
        skip_config (dict): A dictionary specifying which variables and radii to skip.

    Returns:
        dict: The updated skip configuration dictionary.
    """
    vars_to_calculate = {}
    buffer_map = {'cir': 'circle', 'dir': 'sector'}

    # --- 1. Iterate through pre-sorted buffer types ('cir', 'dir') ---
    for buffer_key, var_dict in total_vars.items():
        buffer_type = buffer_map.get(buffer_key)
        if not buffer_type:
            print(f"Warning: Unknown buffer key '{buffer_key}' found. Skipping.")
            continue

        for key in var_dict.keys():
            parts = key.split('_')

            # Handle variables with no radius (e.g., 'elevation')
            # This is less likely in the new format but kept for robustness.
            if len(parts) < 2 or not parts[-1][:-1].isdigit():
                var_name = key
                if var_name not in vars_to_calculate:
                    vars_to_calculate[var_name] = {"all": True}
                continue

            # --- 2. Extract variable name and radius ---
            radius_str = parts[-1][:-1]
            var_name = "_".join(parts[:-1])
            radius = int(radius_str)

            # --- 3. Populate the dictionary of variables to be calculated ---
            if var_name not in vars_to_calculate:
                vars_to_calculate[var_name] = {'circle': [], 'sector': []}
            
            if radius not in vars_to_calculate[var_name][buffer_type]:
                vars_to_calculate[var_name][buffer_type].append(radius)

    # Clean up empty lists from the calculation dictionary
    for var in list(vars_to_calculate.keys()):
        if isinstance(vars_to_calculate[var], dict):
            if 'circle' in vars_to_calculate[var] and not vars_to_calculate[var]['circle']:
                del vars_to_calculate[var]['circle']
            if 'sector' in vars_to_calculate[var] and not vars_to_calculate[var]['sector']:
                del vars_to_calculate[var]['sector']
        if not vars_to_calculate[var]:
             del vars_to_calculate[var]

    print("--- Variables that will be calculated ---")
    print(json.dumps(vars_to_calculate, indent=4))
    print("\n" + "="*40 + "\n")

    # --- 4. Update the skip_config by removing the variables we want to calculate ---
    new_skip_config = json.loads(json.dumps(skip_config)) # Deep copy

    for var, details in vars_to_calculate.items():
        if "all" in details and details["all"]:
             if var in new_skip_config:
                del new_skip_config[var]
        elif var in new_skip_config:
            if 'circle' in details:
                for radius in details['circle']:
                    if 'circle' in new_skip_config[var] and radius in new_skip_config[var]['circle']:
                        new_skip_config[var]['circle'].remove(radius)
            if 'sector' in details:
                 for radius in details['sector']:
                    if 'sector' in new_skip_config[var] and radius in new_skip_config[var]['sector']:
                        new_skip_config[var]['sector'].remove(radius)
    
    return new_skip_config


# --- Input Data ---
total = {
    "cir":{
        "ndvi_500m": 1.0993436419238383,
        "lu_ldr_25m": 2.088882488711456,
        "build_var_100m": -1.1389130737734485,
        "lu_ind_1500m": 0.985738831158744,
        "build_fp_100m": -1.933908087930283,
        "ndvi_100m": 3.166628031689653,
        "build_var_500m": -2.7931030146313676,
        "lu_ug_1500m": -0.20907980858671316},
    "dir":{"rd_ter_100m": 2.714926110703737}
}

# Load the skip_all_fr.json content
skip_all_fr = {
    "id": {"all": True},
    "lu_hdr": {"circle": [25, 100, 500, 1500, 4000], "sector": [100, 500, 1500, 4000]},
    "lu_ldr": {"circle": [25, 100, 500, 1500, 4000], "sector": [100, 500, 1500, 4000]},
    "lu_ind": {"circle": [25, 100, 500, 1500, 4000], "sector": [100, 500, 1500, 4000]},
    "lu_ug": {"circle": [25, 100, 500, 1500, 4000], "sector": [100, 500, 1500, 4000]},
    "lu_art": {"circle": [25, 100, 500, 1500, 4000], "sector": [100, 500, 1500, 4000]},
    "lu_for": {"circle": [25, 100, 500, 1500, 4000], "sector": [100, 500, 1500, 4000]},
    "lu_rur": {"circle": [25, 100, 500, 1500, 4000], "sector": [100, 500, 1500, 4000]},
    "build_fp": {"circle": [25, 100, 500, 1500, 4000], "sector": [100, 500, 1500, 4000]},
    "build_vol": {"circle": [25, 100, 500, 1500, 4000], "sector": [100, 500, 1500, 4000]},
    "build_var": {"circle": [25, 100, 500, 1500, 4000], "sector": [100, 500, 1500, 4000]},
    "ndvi": {"circle": [25, 100, 500, 1500, 4000], "sector": [100, 500, 1500, 4000]},
    "pop": {"circle": [25, 100, 500, 1500, 4000], "sector": [500, 1500, 4000]},
    "rd_mt": {"circle": [25, 100, 500, 1500, 4000], "sector": [100, 500, 1500, 4000]},
    "rd_prim": {"circle": [25, 100, 500, 1500, 4000], "sector": [100, 500, 1500, 4000]},
    "rd_res": {"circle": [25, 100, 500, 1500, 4000], "sector": [100, 500, 1500, 4000]},
    "rd_sec": {"circle": [25, 100, 500, 1500, 4000], "sector": [100, 500, 1500, 4000]},
    "rd_ter": {"circle": [25, 100, 500, 1500, 4000], "sector": [100, 500, 1500, 4000]},
    "elevation": {"all": True},
    "rd_dis": {"all": True},
    "bus_dis": {"all": True}
}


# --- Execute the logic and print the new config ---
new_skip_configuration = create_raster_config(total, skip_all_fr)

print("--- New Skip Configuration ---")
newpath = r'C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\skip_configs\skip_final2.json'
with open(newpath, 'w') as f: json.dump(new_skip_configuration, f, indent=4)


--- Variables that will be calculated ---
{
    "ndvi": {
        "circle": [
            500,
            100
        ]
    },
    "lu_ldr": {
        "circle": [
            25
        ]
    },
    "build_var": {
        "circle": [
            100,
            500
        ]
    },
    "lu_ind": {
        "circle": [
            1500
        ]
    },
    "build_fp": {
        "circle": [
            100
        ]
    },
    "lu_ug": {
        "circle": [
            1500
        ]
    },
    "rd_ter": {
        "sector": [
            100
        ]
    }
}


--- New Skip Configuration ---


In [9]:
# Prepare step 2 dataframe
def create_residual_table(measurement_df, stage1_preds_df):
    # Melt the predictions from wide to long format for easy merging
    stage1_preds_long = stage1_preds_df.melt(
        id_vars='station_id',
        var_name='measurement',
        value_name='stage1_pred'
    )
    print(f"Loaded {len(stage1_preds_df)} stations from prediction file.")

    # --- 2. Calculate Pollutant Residuals ---
    print("Calculating pollutant residuals...")
    pollutants = ['O3', 'NO2', 'SO2', 'PM10', 'PM25']
    pollutant_measurements = measurement_df[measurement_df['measurement'].isin(pollutants)].copy()

    # Merge observed values with their corresponding stage 1 spatial prediction
    merged_pollutants = pd.merge(
        pollutant_measurements,
        stage1_preds_long,
        on=['station_id', 'measurement'],
        how='left'
    )

    # Calculate the residual (observed - predicted)
    merged_pollutants['residual'] = merged_pollutants['reading_value'] - merged_pollutants['stage1_pred']

    # Create a new column name for the residual, e.g., 'O3' becomes 'O3_resid'
    merged_pollutants['measurement'] = merged_pollutants['measurement'] + '_resid'

    # --- 3. Pivot to Wide Format ---
    print("Pivoting residual data to wide format...")
    residuals_to_pivot = merged_pollutants[['measured_time', 'station_id', 'measurement', 'residual']]

    wide_residuals_df = residuals_to_pivot.pivot_table(
        index=['measured_time', 'station_id'],
        columns='measurement',
        values='residual'
    ).reset_index()
    wide_residuals_df.columns.name = None

    # --- 4. Drop Rows Where All Pollutants are Missing ---
    print("Cleaning up rows with no residual data...")
    pollutant_resid_cols = [p + '_resid' for p in pollutants]
    existing_pollutant_cols = [col for col in pollutant_resid_cols if col in wide_residuals_df.columns]
    
    initial_rows = len(wide_residuals_df)
    wide_residuals_df.dropna(subset=existing_pollutant_cols, how='all', inplace=True)
    print(f"Dropped {initial_rows - len(wide_residuals_df)} rows that had no pollutant data.")
    
    # Ensure time column is in datetime format for merging later
    wide_residuals_df['measured_time'] = pd.to_datetime(wide_residuals_df['measured_time'])

    print("\nResidual table creation complete.")
    return wide_residuals_df

measurement_df = pd.read_csv(r"C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\long_sensor_measurements5.csv")
stage1_preds_df = pd.read_csv(r"C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\stage1\stage1_preds.csv")
resid_df = create_residual_table(measurement_df,stage1_preds_df)
resid_df.to_csv(r"C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\long_sensor_resids.csv", index=False)

Loaded 31 stations from prediction file.
Calculating pollutant residuals...
Pivoting residual data to wide format...
Cleaning up rows with no residual data...
Dropped 0 rows that had no pollutant data.

Residual table creation complete.


In [7]:
# Collect Meteorology Dataset from ERA5
import cdsapi
import pandas as pd

def download_era5_batches(stations_filepath, output_dir):
    # --- 1. Setup and Configuration ---
    print("Reading station locations to define download area...")
    try:
        stations_df = pd.read_csv(stations_filepath)
    except FileNotFoundError:
        print(f"Error: Station file not found at '{stations_filepath}'")
        return

    # Create the output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created output directory: {output_dir}")

    # Define the geographical area of interest with a small buffer
    lat_min, lat_max = stations_df['latitude'].min() - 0.1, stations_df['latitude'].max() + 0.1
    lon_min, lon_max = stations_df['longitude'].min() - 0.1, stations_df['longitude'].max() + 0.1
    
    # Define the date range for batch processing
    date_range = pd.date_range(start='2022-09-01', end='2025-06-01', freq='MS')

    # Initialize the CDS API client
    c = cdsapi.Client()

    # --- 2. Loop Through Months and Download Data ---
    for i in range(len(date_range) - 1):
        start_date = date_range[i]
        end_date = date_range[i+1] - pd.Timedelta(days=1)
        year = str(start_date.year)
        month = f"{start_date.month:02d}"
        
        download_target_file = os.path.join(output_dir, f"{year}_{month}.grib")
        
        if os.path.exists(download_target_file):
            print(f"File for {year}-{month} already exists. Skipping.")
            continue

        print(f"\n--- Requesting data for {year}-{month} ---")
        
        try:
            c.retrieve(
                'reanalysis-era5-single-levels',
                {
                    'product_type': 'reanalysis',
                    'format': 'grib',
                    'variable': [
                        '2m_temperature', '2m_dewpoint_temperature', '10m_u_component_of_wind',
                        '10m_v_component_of_wind', 'boundary_layer_height', 'total_precipitation',
                    ],
                    'year': year,
                    'month': month,
                    'day': [f"{day:02d}" for day in range(1, end_date.day + 1)],
                    'time': [f"{h:02d}:00" for h in range(24)],
                    'area': [lat_max, lon_min, lat_min, lon_max],
                },
                download_target_file)
            print(f"Successfully downloaded {download_target_file}")

        except Exception as e:
            print(f"An error occurred while downloading {year}-{month}: {e}")
            # Clean up partially downloaded file if it exists
            if os.path.exists(download_target_file):
                os.remove(download_target_file)

station_file = r"C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\station_info3_wgs84.csv"
output_dir = r"C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5"
download_era5_batches(station_file, output_dir)

Reading station locations to define download area...


2025-08-27 21:15:03,230 INFO [2024-09-26T00:00:00] Watch our [Forum](https://forum.ecmwf.int/) for Announcements, news and other discussed topics.



--- Requesting data for 2022-09 ---


2025-08-27 21:15:03,805 INFO Request ID is d6187f75-d917-4a0f-beb8-61898d6faaf6
2025-08-27 21:15:03,993 INFO status has been updated to accepted
2025-08-27 21:15:12,389 INFO status has been updated to running
2025-08-27 21:15:17,529 INFO status has been updated to successful


33a52a8a0fae58188ae883a379cf59de.grib:   0%|          | 0.00/489k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2022_09.grib

--- Requesting data for 2022-10 ---


2025-08-27 21:15:18,637 INFO Request ID is 4e43e578-6fe5-4c95-bf55-48e13232000b
2025-08-27 21:15:18,716 INFO status has been updated to accepted
2025-08-27 21:15:32,286 INFO status has been updated to running
2025-08-27 21:15:39,940 INFO status has been updated to successful


f5d48a0b1a3c8a547a424ca403a7dc95.grib:   0%|          | 0.00/506k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2022_10.grib

--- Requesting data for 2022-11 ---


2025-08-27 21:15:40,952 INFO Request ID is 608f668c-6cf3-4136-9f38-381b2f80fc68
2025-08-27 21:15:41,277 INFO status has been updated to accepted
2025-08-27 21:16:02,400 INFO status has been updated to running
2025-08-27 21:19:59,817 INFO status has been updated to successful


fba514c18230ffb7239f607b14c05891.grib:   0%|          | 0.00/489k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2022_11.grib

--- Requesting data for 2022-12 ---


2025-08-27 21:20:00,892 INFO Request ID is ed47ab7b-e539-491c-94aa-051c0ef74d6d
2025-08-27 21:20:01,023 INFO status has been updated to accepted
2025-08-27 21:20:09,563 INFO status has been updated to running
2025-08-27 21:24:20,141 INFO status has been updated to successful


8478a40063d09d88dd56ca85d1bb6f5.grib:   0%|          | 0.00/506k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2022_12.grib

--- Requesting data for 2023-01 ---


2025-08-27 21:24:21,232 INFO Request ID is 52653b6e-b79a-4340-b398-5bab8e30411a
2025-08-27 21:24:21,299 INFO status has been updated to accepted
2025-08-27 21:24:34,954 INFO status has been updated to running
2025-08-27 21:28:39,982 INFO status has been updated to successful


5a5a17505dad96f69b76a1f4ecb7b562.grib:   0%|          | 0.00/506k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2023_01.grib

--- Requesting data for 2023-02 ---


2025-08-27 21:28:41,012 INFO Request ID is b0cd505c-6d14-4167-a7cb-195d61e0d359
2025-08-27 21:28:41,104 INFO status has been updated to accepted
2025-08-27 21:29:14,120 INFO status has been updated to running
2025-08-27 21:33:00,182 INFO status has been updated to successful


c054f7a44113edace2c2b14c86933eda.grib:   0%|          | 0.00/457k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2023_02.grib

--- Requesting data for 2023-03 ---


2025-08-27 21:33:01,247 INFO Request ID is a8ea6f8f-8c72-459b-bb2d-58f397aa59ca
2025-08-27 21:33:01,330 INFO status has been updated to accepted
2025-08-27 21:33:22,576 INFO status has been updated to running
2025-08-27 21:37:20,018 INFO status has been updated to successful


28bc5532911922d86e85d5c01d278ffb.grib:   0%|          | 0.00/506k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2023_03.grib

--- Requesting data for 2023-04 ---


2025-08-27 21:37:21,139 INFO Request ID is e7f682f9-4f7a-4c64-9b47-059d79738008
2025-08-27 21:37:21,265 INFO status has been updated to accepted
2025-08-27 21:37:29,712 INFO status has been updated to running
2025-08-27 21:41:40,185 INFO status has been updated to successful


1f54c87d7f1cc317f395716793cc3040.grib:   0%|          | 0.00/489k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2023_04.grib

--- Requesting data for 2023-05 ---


2025-08-27 21:41:42,345 INFO Request ID is f544fee3-1ba9-463b-afc5-dcdc0816e772
2025-08-27 21:41:42,463 INFO status has been updated to accepted
2025-08-27 21:42:03,704 INFO status has been updated to running
2025-08-27 21:46:01,322 INFO status has been updated to successful


90bf1dfe4960dc634332d82f90642a94.grib:   0%|          | 0.00/506k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2023_05.grib

--- Requesting data for 2023-06 ---


2025-08-27 21:46:04,730 INFO Request ID is 35135c02-1ab2-4cf6-ba7f-0a786c8f7b47
2025-08-27 21:46:04,804 INFO status has been updated to accepted
2025-08-27 21:46:26,179 INFO status has been updated to running
2025-08-27 21:52:23,855 INFO status has been updated to successful


67317dc97d39302f0f5b6795b96fb050.grib:   0%|          | 0.00/489k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2023_06.grib

--- Requesting data for 2023-07 ---


2025-08-27 21:52:25,404 INFO Request ID is 0e09d624-2b0e-44dc-a2b2-c857a0eb60b3
2025-08-27 21:52:25,477 INFO status has been updated to accepted
2025-08-27 21:52:39,296 INFO status has been updated to running
2025-08-27 21:56:44,381 INFO status has been updated to successful


ed43b8fe98e4b69be6c571ebc1df169c.grib:   0%|          | 0.00/506k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2023_07.grib

--- Requesting data for 2023-08 ---


2025-08-27 21:56:46,273 INFO Request ID is b24fc4a4-7f43-4dae-8664-c5984429a7d7
2025-08-27 21:56:46,483 INFO status has been updated to accepted
2025-08-27 21:57:07,766 INFO status has been updated to running
2025-08-27 22:01:05,148 INFO status has been updated to successful


9eb695fe8a7923dc4701f07546fc838a.grib:   0%|          | 0.00/506k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2023_08.grib

--- Requesting data for 2023-09 ---


2025-08-27 22:01:07,088 INFO Request ID is 1eff0f4f-dc7d-4739-8793-0a0084ff945c
2025-08-27 22:01:07,167 INFO status has been updated to accepted
2025-08-27 22:01:21,047 INFO status has been updated to running
2025-08-27 22:09:26,731 INFO status has been updated to successful


2154696bdfd142b31707328b7c1ab7f7.grib:   0%|          | 0.00/489k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2023_09.grib

--- Requesting data for 2023-10 ---


2025-08-27 22:09:27,935 INFO Request ID is 1ea9c3c9-9dce-40ad-b215-81d7b67e4e32
2025-08-27 22:09:28,080 INFO status has been updated to accepted
2025-08-27 22:09:36,623 INFO status has been updated to running
2025-08-27 22:13:47,057 INFO status has been updated to successful


40f2c5b408b74bd2d194a641f3bfe514.grib:   0%|          | 0.00/506k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2023_10.grib

--- Requesting data for 2023-11 ---


2025-08-27 22:13:48,292 INFO Request ID is a8b5cb36-1247-45b2-b3d8-4664485b11d7
2025-08-27 22:13:48,372 INFO status has been updated to accepted
2025-08-27 22:14:21,232 INFO status has been updated to running
2025-08-27 22:18:07,191 INFO status has been updated to successful


5784a7ac32fd9f3f260bf9510101fe7e.grib:   0%|          | 0.00/489k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2023_11.grib

--- Requesting data for 2023-12 ---


2025-08-27 22:18:08,688 INFO Request ID is 613cd549-1b1b-4397-af6c-48020f11eb80
2025-08-27 22:18:08,768 INFO status has been updated to accepted
2025-08-27 22:18:17,176 INFO status has been updated to running
2025-08-27 22:22:27,783 INFO status has been updated to successful


970aa430326957e0f98c5738166cec14.grib:   0%|          | 0.00/506k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2023_12.grib

--- Requesting data for 2024-01 ---


2025-08-27 22:22:28,814 INFO Request ID is 28910d08-b102-40cb-a38f-3295a0142dc9
2025-08-27 22:22:28,887 INFO status has been updated to accepted
2025-08-27 22:22:42,403 INFO status has been updated to running
2025-08-27 22:24:22,918 INFO status has been updated to accepted
2025-08-27 22:25:20,775 INFO status has been updated to running
2025-08-27 22:28:47,661 INFO status has been updated to successful


e61d700719f2e9cd18871e2998bbd561.grib:   0%|          | 0.00/506k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2024_01.grib

--- Requesting data for 2024-02 ---


2025-08-27 22:28:48,896 INFO Request ID is 8f44c870-19da-4b1e-aaa8-0baca0978de0
2025-08-27 22:28:48,977 INFO status has been updated to accepted
2025-08-27 22:29:02,821 INFO status has been updated to running
2025-08-27 22:33:07,948 INFO status has been updated to successful


fa8e37d81369e647a12f261d288cc34f.grib:   0%|          | 0.00/473k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2024_02.grib

--- Requesting data for 2024-03 ---


2025-08-27 22:33:09,032 INFO Request ID is 02598bfe-ca48-40a4-bae2-32a463dfe766
2025-08-27 22:33:09,109 INFO status has been updated to accepted
2025-08-27 22:33:22,646 INFO status has been updated to running
2025-08-27 22:37:27,797 INFO status has been updated to successful


f75cd3a0cdd3b0251ec814b643e236d9.grib:   0%|          | 0.00/506k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2024_03.grib

--- Requesting data for 2024-04 ---


2025-08-27 22:37:28,976 INFO Request ID is 62f5f208-c51c-4031-8225-c3b5a970ac9e
2025-08-27 22:37:29,061 INFO status has been updated to accepted
2025-08-27 22:37:42,616 INFO status has been updated to running
2025-08-27 22:41:47,873 INFO status has been updated to successful


92fae8de809a6c4e825a3b68c2007f55.grib:   0%|          | 0.00/489k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2024_04.grib

--- Requesting data for 2024-05 ---


2025-08-27 22:41:49,393 INFO Request ID is bf54f6b1-ab90-46bd-b3b8-92e4001bc9fa
2025-08-27 22:41:49,591 INFO status has been updated to accepted
2025-08-27 22:42:03,131 INFO status has been updated to running
2025-08-27 22:46:08,210 INFO status has been updated to successful


e846f694064c9130fe6df5ba18e6e9ba.grib:   0%|          | 0.00/506k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2024_05.grib

--- Requesting data for 2024-06 ---


2025-08-27 22:46:09,521 INFO Request ID is cf6e77ac-f69d-4047-b43c-63d535aec3bb
2025-08-27 22:46:09,593 INFO status has been updated to accepted
2025-08-27 22:46:23,233 INFO status has been updated to running
2025-08-27 22:49:01,733 INFO status has been updated to successful


b5445790fd62b4420c225dec32ba3a2c.grib:   0%|          | 0.00/489k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2024_06.grib

--- Requesting data for 2024-07 ---


2025-08-27 22:49:02,773 INFO Request ID is a0c63e2e-b59d-4383-8068-28b68afd5f6d
2025-08-27 22:49:02,841 INFO status has been updated to accepted
2025-08-27 22:49:16,472 INFO status has been updated to running
2025-08-27 22:55:22,336 INFO status has been updated to successful


c81efd46fe0e62e67ae5ad6698664f58.grib:   0%|          | 0.00/506k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2024_07.grib

--- Requesting data for 2024-08 ---


2025-08-27 22:55:23,632 INFO Request ID is 8125a5bb-9ff1-40b4-a412-fd2ce07f012c
2025-08-27 22:55:23,772 INFO status has been updated to accepted
2025-08-27 22:55:37,783 INFO status has been updated to running
2025-08-27 23:01:43,194 INFO status has been updated to successful


8d5e33532d8eb0087446b18c1e2adc08.grib:   0%|          | 0.00/506k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2024_08.grib

--- Requesting data for 2024-09 ---


2025-08-27 23:01:44,448 INFO Request ID is 53be4a05-70d4-49c7-81b7-ac64e8136ae4
2025-08-27 23:01:44,559 INFO status has been updated to accepted
2025-08-27 23:01:58,235 INFO status has been updated to running
2025-08-27 23:06:03,437 INFO status has been updated to successful


b117ec3944e67c3ffec65d00e11a52c3.grib:   0%|          | 0.00/489k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2024_09.grib

--- Requesting data for 2024-10 ---


2025-08-27 23:06:04,797 INFO Request ID is cfd648dd-e437-4b79-b055-88da7f4afadd
2025-08-27 23:06:04,880 INFO status has been updated to accepted
2025-08-27 23:06:13,417 INFO status has been updated to running
2025-08-27 23:10:23,922 INFO status has been updated to successful


dee85533da2114c9dff151a249ee34d3.grib:   0%|          | 0.00/506k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2024_10.grib

--- Requesting data for 2024-11 ---


2025-08-27 23:10:24,971 INFO Request ID is 8867bfe7-df46-4cd9-a457-6a3538ac5f24
2025-08-27 23:10:25,046 INFO status has been updated to accepted
2025-08-27 23:10:38,760 INFO status has been updated to running
2025-08-27 23:16:44,606 INFO status has been updated to successful


196dd5fc463e9fa49bfdcbc6bb3529e0.grib:   0%|          | 0.00/489k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2024_11.grib

--- Requesting data for 2024-12 ---


2025-08-27 23:16:45,828 INFO Request ID is bbc96136-e668-4f01-a47f-99702f665874
2025-08-27 23:16:45,908 INFO status has been updated to accepted
2025-08-27 23:16:54,504 INFO status has been updated to running
2025-08-27 23:21:04,774 INFO status has been updated to successful


ca32636ee0cabed60e699db87df76900.grib:   0%|          | 0.00/506k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2024_12.grib

--- Requesting data for 2025-01 ---


2025-08-27 23:21:05,924 INFO Request ID is 4b201872-28cf-4ab6-86e0-7ea94e13d3d1
2025-08-27 23:21:06,025 INFO status has been updated to accepted
2025-08-27 23:21:19,861 INFO status has been updated to running
2025-08-27 23:25:25,214 INFO status has been updated to successful


5b5a60c9bc169358f7e61ba31dc78c04.grib:   0%|          | 0.00/506k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2025_01.grib

--- Requesting data for 2025-02 ---


2025-08-27 23:25:26,323 INFO Request ID is a44a6c2e-ceb2-4828-928f-1857959bdb17
2025-08-27 23:25:26,755 INFO status has been updated to accepted
2025-08-27 23:25:35,226 INFO status has been updated to running
2025-08-27 23:29:45,470 INFO status has been updated to successful


373f4f3a56157c678ade8731d4b17c1e.grib:   0%|          | 0.00/457k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2025_02.grib

--- Requesting data for 2025-03 ---


2025-08-27 23:29:46,731 INFO Request ID is 8000b1cd-9d3b-4e8f-a24e-a929b58951fc
2025-08-27 23:29:46,896 INFO status has been updated to accepted
2025-08-27 23:30:08,396 INFO status has been updated to running
2025-08-27 23:34:06,083 INFO status has been updated to successful


ce04ca650cdfc560d0a577046f0b85b2.grib:   0%|          | 0.00/506k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2025_03.grib

--- Requesting data for 2025-04 ---


2025-08-27 23:34:07,387 INFO Request ID is 947825de-31b6-41ab-8e40-41bce492b1cb
2025-08-27 23:34:07,561 INFO status has been updated to accepted
2025-08-27 23:34:15,975 INFO status has been updated to running
2025-08-27 23:40:26,423 INFO status has been updated to successful


f3fe4314d1687e7d05ec769a12f4c4ca.grib:   0%|          | 0.00/489k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2025_04.grib

--- Requesting data for 2025-05 ---


2025-08-27 23:40:27,588 INFO Request ID is 4d08b942-4ac6-4ad4-a042-a9cb5565a35b
2025-08-27 23:40:27,724 INFO status has been updated to accepted
2025-08-27 23:40:36,445 INFO status has been updated to running
2025-08-27 23:44:46,801 INFO status has been updated to successful


a17e0d63149b90af909a3c3a81ab0b6.grib:   0%|          | 0.00/506k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\2025_05.grib


In [18]:
# Second downloader for correct precip, SI, and P
import cdsapi
import pandas as pd
import os

def download_era5_land_data(stations_filepath, output_dir):
    """
    Downloads ERA5-Land hourly data (precipitation, pressure, solar irradiance)
    in monthly GRIB files.

    Args:
        stations_filepath (str): Path to the CSV file with station info.
        output_dir (str): The directory where the downloaded .grib files will be saved.
    """
    # --- 1. Setup and Configuration ---
    print("Reading station locations to define download area...")
    try:
        stations_df = pd.read_csv(stations_filepath)
    except FileNotFoundError:
        print(f"Error: Station file not found at '{stations_filepath}'")
        return

    # Create the output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created output directory: {output_dir}")

    # Define the geographical area of interest with a small buffer
    lat_min, lat_max = stations_df['latitude'].min() - 0.1, stations_df['latitude'].max() + 0.1
    lon_min, lon_max = stations_df['longitude'].min() - 0.1, stations_df['longitude'].max() + 0.1
    
    # Define the date range for batch processing
    date_range = pd.date_range(start='2022-09-01', end='2025-06-01', freq='MS')

    # Initialize the CDS API client
    c = cdsapi.Client()

    # --- 2. Loop Through Months and Download Data ---
    for i in range(len(date_range) - 1):
        start_date = date_range[i]
        end_date = date_range[i+1] - pd.Timedelta(days=1)
        year = str(start_date.year)
        month = f"{start_date.month:02d}"
        
        download_target_file = os.path.join(output_dir, f"era5_land_data_{year}_{month}.grib")
        
        if os.path.exists(download_target_file):
            print(f"Data file for {year}-{month} already exists. Skipping.")
            continue

        print(f"\n--- Requesting ERA5-Land data for {year}-{month} ---")
        
        try:
            c.retrieve(
                'reanalysis-era5-land', # Using the ERA5-Land dataset
                {
                    'format': 'grib',
                    'variable': [
                        'surface_pressure', 'surface_solar_radiation_downwards', 'total_precipitation',
                    ],
                    'year': year,
                    'month': month,
                    'day': [f"{day:02d}" for day in range(1, end_date.day + 1)],
                    'time': [f"{h:02d}:00" for h in range(24)],
                    'area': [lat_max, lon_min, lat_min, lon_max],
                },
                download_target_file)
            print(f"Successfully downloaded {download_target_file}")

        except Exception as e:
            print(f"An error occurred while downloading data for {year}-{month}: {e}")
            if os.path.exists(download_target_file):
                os.remove(download_target_file)

# --- Example Usage ---
if __name__ == '__main__':
    station_file = r"C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\station_info3_wgs84.csv"
    raw_data_dir = r"C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5"
    
    download_era5_land_data(station_file, raw_data_dir)
    print("\nERA5-Land data download process complete.")


Reading station locations to define download area...


2025-08-28 01:01:41,384 INFO [2024-09-26T00:00:00] Watch our [Forum](https://forum.ecmwf.int/) for Announcements, news and other discussed topics.



--- Requesting ERA5-Land data for 2022-09 ---


2025-08-28 01:01:42,089 INFO Request ID is b472f6cc-6586-48bf-816c-3a9d9215559c
2025-08-28 01:01:42,197 INFO status has been updated to accepted
2025-08-28 01:02:15,179 INFO status has been updated to running
2025-08-28 01:06:01,135 INFO status has been updated to successful


3d55b3d9c971d2eed8d4e4341da23f5b.zip:   0%|          | 0.00/81.0k [00:00<?, ?B/s]

2025-08-28 01:06:02,044 INFO Request ID is 3e6d9ab1-bc3d-4570-8bc7-b51695808584


Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2022_09.grib

--- Requesting ERA5-Land data for 2022-10 ---


2025-08-28 01:06:02,118 INFO status has been updated to accepted
2025-08-28 01:06:10,531 INFO status has been updated to running
2025-08-28 01:16:21,469 INFO status has been updated to successful


b5ac643a10a21a25d13b1ff61ec0335e.zip:   0%|          | 0.00/71.6k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2022_10.grib

--- Requesting ERA5-Land data for 2022-11 ---


2025-08-28 01:16:22,241 INFO Request ID is 4cbde8b8-2e34-44e9-9ac4-86e95d586546
2025-08-28 01:16:22,319 INFO status has been updated to accepted
2025-08-28 01:16:27,469 INFO status has been updated to running
2025-08-28 01:22:41,506 INFO status has been updated to successful


af7d7b1297de106250cf19dc68b14e48.zip:   0%|          | 0.00/79.1k [00:00<?, ?B/s]

2025-08-28 01:22:42,358 INFO Request ID is 7f867d10-097b-42d6-96f6-75e732feddac


Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2022_11.grib

--- Requesting ERA5-Land data for 2022-12 ---


2025-08-28 01:22:42,432 INFO status has been updated to accepted
2025-08-28 01:23:15,253 INFO status has been updated to running
2025-08-28 01:29:01,714 INFO status has been updated to successful


192c81becfacaa3434701e33a1d2eeaa.zip:   0%|          | 0.00/79.5k [00:00<?, ?B/s]

2025-08-28 01:29:02,562 INFO Request ID is 08d12b00-99c1-46b4-aa03-37c89ab0faec


Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2022_12.grib

--- Requesting ERA5-Land data for 2023-01 ---


2025-08-28 01:29:02,633 INFO status has been updated to accepted
2025-08-28 01:29:11,021 INFO status has been updated to running
2025-08-28 01:35:21,540 INFO status has been updated to successful


52cb4b66638bec8c4d5e3582bd242371.zip:   0%|          | 0.00/86.4k [00:00<?, ?B/s]

2025-08-28 01:35:22,524 INFO Request ID is 82fa0596-1655-4b0e-9adc-496fa52c49c1


Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2023_01.grib

--- Requesting ERA5-Land data for 2023-02 ---


2025-08-28 01:35:22,611 INFO status has been updated to accepted
2025-08-28 01:35:36,158 INFO status has been updated to running
2025-08-28 01:39:41,450 INFO status has been updated to successful


4f27d83d9c8edafc9a49351b7ba3d278.zip:   0%|          | 0.00/73.0k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2023_02.grib

--- Requesting ERA5-Land data for 2023-03 ---


2025-08-28 01:39:42,499 INFO Request ID is af17d461-28f6-4a11-b6ec-6abe47cf86b9
2025-08-28 01:39:42,583 INFO status has been updated to accepted
2025-08-28 01:40:03,862 INFO status has been updated to running
2025-08-28 01:46:01,562 INFO status has been updated to successful


1d8b35332ed9363ecf1b17da2eca710d.zip:   0%|          | 0.00/86.1k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2023_03.grib

--- Requesting ERA5-Land data for 2023-04 ---


2025-08-28 01:46:02,586 INFO Request ID is 91def682-8161-4f4f-b498-e367c6bfbf90
2025-08-28 01:46:02,664 INFO status has been updated to accepted
2025-08-28 01:46:16,195 INFO status has been updated to running
2025-08-28 01:52:21,533 INFO status has been updated to successful


670b8d0173443b1443a3284fa6f1bf0e.zip:   0%|          | 0.00/91.6k [00:00<?, ?B/s]

2025-08-28 01:52:22,295 INFO Request ID is b960fbf9-8626-4a5f-ae13-66a7658a09c8


Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2023_04.grib

--- Requesting ERA5-Land data for 2023-05 ---


2025-08-28 01:52:22,364 INFO status has been updated to accepted
2025-08-28 01:52:35,910 INFO status has been updated to running
2025-08-28 01:58:41,478 INFO status has been updated to successful


ca1cbefe24eac722d52180b2babdd043.zip:   0%|          | 0.00/94.7k [00:00<?, ?B/s]

2025-08-28 01:58:42,385 INFO Request ID is 32ded1c2-c457-461f-be28-6ede411e69df


Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2023_05.grib

--- Requesting ERA5-Land data for 2023-06 ---


2025-08-28 01:58:42,504 INFO status has been updated to accepted
2025-08-28 01:58:56,315 INFO status has been updated to running
2025-08-28 02:05:01,776 INFO status has been updated to successful


9f8bc05cad6e111a834e1f3093bf48b.zip:   0%|          | 0.00/91.9k [00:00<?, ?B/s]

2025-08-28 02:05:02,548 INFO Request ID is 8ac97c1f-e66c-49ff-9f6c-0a147ca1bc85


Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2023_06.grib

--- Requesting ERA5-Land data for 2023-07 ---


2025-08-28 02:05:02,955 INFO status has been updated to accepted
2025-08-28 02:05:16,497 INFO status has been updated to running
2025-08-28 02:11:21,966 INFO status has been updated to successful


95a408dcfa347807904d1abdc99ae6ba.zip:   0%|          | 0.00/83.6k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2023_07.grib

--- Requesting ERA5-Land data for 2023-08 ---


2025-08-28 02:11:22,772 INFO Request ID is 2b90125b-627d-48fd-bee5-ae5279feb283
2025-08-28 02:11:22,849 INFO status has been updated to accepted
2025-08-28 02:11:36,458 INFO status has been updated to running
2025-08-28 02:17:41,767 INFO status has been updated to successful


f48e3a013a527cedab5d94740db7fd9b.zip:   0%|          | 0.00/80.1k [00:00<?, ?B/s]

2025-08-28 02:17:42,592 INFO Request ID is 17c1c447-baf5-4264-8c02-b370776116b3


Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2023_08.grib

--- Requesting ERA5-Land data for 2023-09 ---


2025-08-28 02:17:42,667 INFO status has been updated to accepted
2025-08-28 02:17:51,122 INFO status has been updated to running
2025-08-28 02:26:02,247 INFO status has been updated to successful


37e0c4f1363e48a40b91b551a1d89b0b.zip:   0%|          | 0.00/78.1k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2023_09.grib

--- Requesting ERA5-Land data for 2023-10 ---


2025-08-28 02:26:03,121 INFO Request ID is af053efa-c95c-4e75-856b-efa52ca1ff45
2025-08-28 02:26:03,191 INFO status has been updated to accepted
2025-08-28 02:26:11,831 INFO status has been updated to running
2025-08-28 02:32:22,255 INFO status has been updated to successful


75a0d71a169e0dfc14193475bcb9916b.zip:   0%|          | 0.00/71.6k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2023_10.grib

--- Requesting ERA5-Land data for 2023-11 ---


2025-08-28 02:32:23,141 INFO Request ID is edacf202-f72d-4019-84b3-a3274c29d337
2025-08-28 02:32:23,220 INFO status has been updated to accepted
2025-08-28 02:32:36,917 INFO status has been updated to running
2025-08-28 02:38:42,455 INFO status has been updated to successful


3a689a3298ffb5be15c0752d4a2d52a1.zip:   0%|          | 0.00/80.2k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2023_11.grib

--- Requesting ERA5-Land data for 2023-12 ---


2025-08-28 02:38:43,363 INFO Request ID is 40c66300-2e52-4858-863e-27741b9f6ae3
2025-08-28 02:38:43,452 INFO status has been updated to accepted
2025-08-28 02:38:57,322 INFO status has been updated to running
2025-08-28 02:45:02,785 INFO status has been updated to successful


a9dc4357aa52c4655ea82ca4a720c92a.zip:   0%|          | 0.00/73.9k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2023_12.grib

--- Requesting ERA5-Land data for 2024-01 ---


2025-08-28 02:45:03,801 INFO Request ID is 00947ca8-c917-4667-a4da-0419cd8eb66c
2025-08-28 02:45:03,893 INFO status has been updated to accepted
2025-08-28 02:45:17,514 INFO status has been updated to running
2025-08-28 02:51:22,970 INFO status has been updated to successful


180e06ebb5833d4eb6219a72831aa92b.zip:   0%|          | 0.00/82.4k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2024_01.grib

--- Requesting ERA5-Land data for 2024-02 ---


2025-08-28 02:51:23,781 INFO Request ID is 8535eb39-8ed0-4b7c-b359-677ed70b5496
2025-08-28 02:51:23,935 INFO status has been updated to accepted
2025-08-28 02:51:37,547 INFO status has been updated to running
2025-08-28 02:57:43,056 INFO status has been updated to successful


6ba9600d0d6e4c88c2c25c6f107d2235.zip:   0%|          | 0.00/71.6k [00:00<?, ?B/s]

2025-08-28 02:57:43,911 INFO Request ID is 96847a8b-e714-4094-b6df-497579337d58


Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2024_02.grib

--- Requesting ERA5-Land data for 2024-03 ---


2025-08-28 02:57:43,987 INFO status has been updated to accepted
2025-08-28 02:58:16,873 INFO status has been updated to running
2025-08-28 03:04:03,008 INFO status has been updated to successful


838d6e43dbd65ddaec041951a00b0af1.zip:   0%|          | 0.00/88.3k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2024_03.grib

--- Requesting ERA5-Land data for 2024-04 ---


2025-08-28 03:04:04,180 INFO Request ID is 0f3c3600-0112-49cf-8267-b04c32fcb225
2025-08-28 03:04:04,259 INFO status has been updated to accepted
2025-08-28 03:04:37,402 INFO status has been updated to running
2025-08-28 03:10:23,886 INFO status has been updated to successful


67d489ce7e5d8127800283577de8b429.zip:   0%|          | 0.00/77.4k [00:00<?, ?B/s]

2025-08-28 03:10:24,670 INFO Request ID is 710d4d8c-4cd0-4063-850a-255637c92e80


Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2024_04.grib

--- Requesting ERA5-Land data for 2024-05 ---


2025-08-28 03:10:24,760 INFO status has been updated to accepted
2025-08-28 03:10:33,211 INFO status has been updated to running
2025-08-28 03:16:44,124 INFO status has been updated to successful


9bb8d7afa285ec11b4c1c2b3c627732b.zip:   0%|          | 0.00/93.4k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2024_05.grib

--- Requesting ERA5-Land data for 2024-06 ---


2025-08-28 03:16:45,162 INFO Request ID is 15ba8223-6812-4b0e-a3e0-0cf8e338a872
2025-08-28 03:16:45,242 INFO status has been updated to accepted
2025-08-28 03:17:06,778 INFO status has been updated to running
2025-08-28 03:23:04,530 INFO status has been updated to successful


5f25078ffb0be117bfbbf634a2258bfb.zip:   0%|          | 0.00/80.8k [00:00<?, ?B/s]

2025-08-28 03:23:05,405 INFO Request ID is 5633df61-a876-4bb4-81b6-583ff1e308d8


Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2024_06.grib

--- Requesting ERA5-Land data for 2024-07 ---


2025-08-28 03:23:05,489 INFO status has been updated to accepted
2025-08-28 03:23:19,055 INFO status has been updated to running
2025-08-28 03:29:24,380 INFO status has been updated to successful


b935c86932ce63377d7bada929ac58df.zip:   0%|          | 0.00/75.3k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2024_07.grib

--- Requesting ERA5-Land data for 2024-08 ---


2025-08-28 03:29:25,237 INFO Request ID is b41a21b1-8c7a-4348-acf0-386fd17126ab
2025-08-28 03:29:25,315 INFO status has been updated to accepted
2025-08-28 03:29:33,805 INFO status has been updated to running
2025-08-28 03:35:44,394 INFO status has been updated to successful


a4690d5cd64b3da31294e7097201d21a.zip:   0%|          | 0.00/76.2k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2024_08.grib

--- Requesting ERA5-Land data for 2024-09 ---


2025-08-28 03:35:45,332 INFO Request ID is 9b37ecdc-aae7-4c37-bc8a-0ece6565283e
2025-08-28 03:35:45,410 INFO status has been updated to accepted
2025-08-28 03:35:58,940 INFO status has been updated to running
2025-08-28 03:36:06,599 INFO status has been updated to accepted
2025-08-28 03:36:18,055 INFO status has been updated to running
2025-08-28 03:42:04,182 INFO status has been updated to successful


ceb54225f4a34c2f017f202d16d33955.zip:   0%|          | 0.00/77.9k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2024_09.grib

--- Requesting ERA5-Land data for 2024-10 ---


2025-08-28 03:42:05,051 INFO Request ID is 632198b7-f730-4e8b-8bc1-490d0222ec99
2025-08-28 03:42:05,222 INFO status has been updated to accepted
2025-08-28 03:42:18,819 INFO status has been updated to running
2025-08-28 03:48:24,254 INFO status has been updated to successful


f214788ea558cad9787dea77ec3dbc9e.zip:   0%|          | 0.00/68.6k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2024_10.grib

--- Requesting ERA5-Land data for 2024-11 ---


2025-08-28 03:48:25,302 INFO Request ID is d9563758-bd04-4dfc-b7f9-001edfac72b1
2025-08-28 03:48:25,377 INFO status has been updated to accepted
2025-08-28 03:48:38,953 INFO status has been updated to running
2025-08-28 03:54:44,380 INFO status has been updated to successful


1152eaaff877b15c973e9378e6608532.zip:   0%|          | 0.00/74.5k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2024_11.grib

--- Requesting ERA5-Land data for 2024-12 ---


2025-08-28 03:54:45,303 INFO Request ID is c6b2742c-7b87-4b36-aec4-04d7dbd463ba
2025-08-28 03:54:45,394 INFO status has been updated to accepted
2025-08-28 03:54:59,143 INFO status has been updated to running
2025-08-28 04:01:04,535 INFO status has been updated to successful


4e56a7e0452f886674d9c4bb5df8479a.zip:   0%|          | 0.00/84.6k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2024_12.grib

--- Requesting ERA5-Land data for 2025-01 ---


2025-08-28 04:01:05,335 INFO Request ID is 9749ccbd-513c-42fb-a44c-5777d9ae2df4
2025-08-28 04:01:05,413 INFO status has been updated to accepted
2025-08-28 04:01:19,162 INFO status has been updated to running
2025-08-28 04:07:24,458 INFO status has been updated to successful


23cda9264710f9c98bb1dc787f13e7.zip:   0%|          | 0.00/74.6k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2025_01.grib

--- Requesting ERA5-Land data for 2025-02 ---


2025-08-28 04:07:25,328 INFO Request ID is fd15e8c1-cc47-4003-81a6-00682c25aef9
2025-08-28 04:07:25,446 INFO status has been updated to accepted
2025-08-28 04:07:39,045 INFO status has been updated to running
2025-08-28 04:13:44,800 INFO status has been updated to successful


e1e1da680d2c74564bcee8b66a19345.zip:   0%|          | 0.00/75.9k [00:00<?, ?B/s]

2025-08-28 04:13:45,580 INFO Request ID is fc8fdd8e-f4b7-4280-8f00-182850759293


Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2025_02.grib

--- Requesting ERA5-Land data for 2025-03 ---


2025-08-28 04:13:45,822 INFO status has been updated to accepted
2025-08-28 04:14:06,979 INFO status has been updated to running
2025-08-28 04:20:04,764 INFO status has been updated to successful


909c01da80af421da430c8ebdaaffd09.zip:   0%|          | 0.00/82.1k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2025_03.grib

--- Requesting ERA5-Land data for 2025-04 ---


2025-08-28 04:20:05,918 INFO Request ID is 400d208c-3e13-496b-824a-ac9b5d19d9a3
2025-08-28 04:20:06,001 INFO status has been updated to accepted
2025-08-28 04:20:38,708 INFO status has been updated to running
2025-08-28 04:26:24,964 INFO status has been updated to successful


fd1756f5ea2fab8714e9fb633777a1d2.zip:   0%|          | 0.00/85.2k [00:00<?, ?B/s]

2025-08-28 04:26:25,715 INFO Request ID is 111fd814-95dc-4e9e-bbb2-be2b3ba6405a


Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2025_04.grib

--- Requesting ERA5-Land data for 2025-05 ---


2025-08-28 04:26:25,786 INFO status has been updated to accepted
2025-08-28 04:26:39,334 INFO status has been updated to running
2025-08-28 04:32:44,888 INFO status has been updated to successful


d9639e32d23b2f3ad0125afb8d652a04.zip:   0%|          | 0.00/89.5k [00:00<?, ?B/s]

Successfully downloaded C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5\era5_land_data_2025_05.grib

ERA5-Land data download process complete.


In [None]:
# Process meteorology data (must be in conda env)
'''
-open anaconda prompt
set PYTHONNOUSERSITE=1
conda activate era5_env
cd C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr
python era5_processing.py
'''

import pandas as pd
import xarray as xr
import numpy as np
import os
from glob import glob
from scipy.interpolate import interpn

def process_era5_files(stations_filepath, grib_files_dir, output_filepath):
    # --- 1. Load Station Data ---
    print("Reading station locations...")
    try:
        stations_df = pd.read_csv(stations_filepath)
        if 'id' in stations_df.columns:
            stations_df = stations_df.rename(columns={'id': 'station_id'})
    except FileNotFoundError:
        print(f"Error: Station file not found at '{stations_filepath}'")
        return

    # Find all GRIB files in the specified directory
    grib_files = sorted(glob(os.path.join(grib_files_dir, "*.grib")))
    if not grib_files:
        print(f"Error: No .grib files found in '{grib_files_dir}'")
        return
    
    print(f"Found {len(grib_files)} GRIB files to process.")
    
    all_months_data = []

    # --- 2. Loop Through GRIB Files and Process ---
    for grib_file in grib_files:
        print(f"\n--- Processing {os.path.basename(grib_file)} ---")
        try:
            # Use xarray with cfgrib engine to open GRIB files
            ds = xr.open_dataset(grib_file, engine='cfgrib')

            # --- 3. Calculate Derived Variables ---
            t2m_c = ds['t2m'] - 273.15
            d2m_c = ds['d2m'] - 273.15
            es = 0.61094 * np.exp((17.625 * t2m_c) / (243.04 + t2m_c))
            e = 0.61094 * np.exp((17.625 * d2m_c) / (243.04 + d2m_c))
            rh = (e / es) * 100
            rh = rh.clip(0, 100)
            ws = np.sqrt(ds['u10']**2 + ds['v10']**2)
            wd = 180 + (180 / np.pi) * np.arctan2(ds['u10'], ds['v10'])

            # --- 4. Perform Bilinear Interpolation ---
            data_lats = ds.latitude.values
            data_lons = ds.longitude.values
            station_coords = list(zip(stations_df['latitude'], stations_df['longitude']))

            interpolated_data = {'time': ds.time.values}
            variables_to_interpolate = {
                'temperature_c': ds['t2m'] - 273.15, 'relative_humidity': rh,
                'precipitation_m': ds['tp'], 'wind_speed_ms': ws,
                'wind_direction_deg': wd, 'boundary_layer_height_m': ds['blh']
            }

            for var_name, var_data in variables_to_interpolate.items():
                print(f"Interpolating {var_name}...")
                # We need to get the coordinate values for interpn
                points = (var_data.coords['latitude'].values, var_data.coords['longitude'].values)
                # Loop through each time step to perform interpolation
                interp_results = []
                for t_step in range(len(var_data.time)):
                    values_at_time = var_data.isel(time=t_step).values
                    interp_values = interpn(points, values_at_time, station_coords, method='linear', bounds_error=False, fill_value=None)
                    interp_results.append(interp_values)
                interpolated_data[var_name] = np.array(interp_results)
            
            # --- 5. Format and Store Month's Data ---
            month_df = pd.DataFrame()
            for i, row in stations_df.iterrows():
                station_id = row['station_id']
                station_data = {'time': interpolated_data['time'], 'station_id': station_id}
                for var_name in variables_to_interpolate.keys():
                    station_data[var_name] = interpolated_data[var_name][:, i]
                
                temp_df = pd.DataFrame(station_data)
                month_df = pd.concat([month_df, temp_df], ignore_index=True)

            all_months_data.append(month_df)
            print(f"Finished processing {os.path.basename(grib_file)}.")

        except Exception as e:
            print(f"An error occurred while processing {grib_file}: {e}")

    # --- 6. Finalize and Save ---
    if all_months_data:
        print("\nCombining all monthly data and saving to CSV...")
        final_df = pd.concat(all_months_data, ignore_index=True)
        final_df = final_df.rename(columns={'time': 'measured_time'})
        final_df.to_csv(output_filepath, index=False)
        print(f"Successfully saved processed ERA5 data to {output_filepath}")
    else:
        print("No data was processed. Output file not created.")

# --- Example Usage ---
if __name__ == '__main__':
    station_file = r"C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\station_info3_wgs84.csv"
    raw_data_directory = r"C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5"
    output_file = r"C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5_meteorology_processed.csv"
    
    # Run this script AFTER the download script has finished (or has at least downloaded some files)
    process_era5_files(station_file, raw_data_directory, output_file)

In [14]:
# Add temporal and meteorological data for modelling

def add_temporal_features(df, timestamp_col='measured_time'):
    """
    Adds insightful temporal features to a DataFrame based on a timestamp column.
    """
    print("Adding temporal features...")
    df[timestamp_col] = pd.to_datetime(df[timestamp_col])

    # Basic Temporal Features
    df['month'] = df[timestamp_col].dt.month
    df['dayofweek'] = df[timestamp_col].dt.dayofweek
    df['dayofyear'] = df[timestamp_col].dt.dayofyear
    df['hour'] = df[timestamp_col].dt.hour
    df['weekend'] = (df[timestamp_col].dt.dayofweek >= 5).astype(int)

    # Cyclical Feature Engineering
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24.0)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24.0)
    df['dayofweek_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 7.0)
    df['dayofweek_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 7.0)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12.0)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12.0)
    df['dayofyear_sin'] = np.sin(2 * np.pi * df['dayofyear'] / 365.25)
    df['dayofyear_cos'] = np.cos(2 * np.pi * df['dayofyear'] / 365.25)

    print("Temporal features added successfully.")
    return df

def merge_data_for_modeling(residuals_filepath, meteorology_filepath, output_filepath):
    # --- 1. Load Input Datasets ---
    print(f"Loading pollutant residuals from: {residuals_filepath}")
    try:
        residuals_df = pd.read_csv(residuals_filepath)
        residuals_df['measured_time'] = pd.to_datetime(residuals_df['measured_time'], utc=True)
    except FileNotFoundError:
        print(f"Error: File not found at '{residuals_filepath}'. Please create this file first.")
        return

    print(f"Loading meteorology data from: {meteorology_filepath}")
    try:
        met_df = pd.read_csv(meteorology_filepath)
    except FileNotFoundError:
        print(f"Error: File not found at '{meteorology_filepath}'. Please create this file first.")
        return

    # --- 2. Add Temporal Features to Residuals ---
    # The add_temporal_features function also handles converting the time column
    residuals_with_time_df = add_temporal_features(residuals_df, 'measured_time')
    
    # --- 3. Prepare Meteorology Data for Merging ---
    # Ensure the time column is in datetime format for a reliable merge
    met_df['measured_time'] = pd.to_datetime(met_df['measured_time'], utc=True)
    # Remove bad columns
    met_df_filter = met_df.drop(columns = ['precipitation_m','boundary_layer_height_m'])

    # --- 4. Merge the DataFrames ---
    print("Merging temporalized residuals with meteorology data...")
    # We use a 'left' merge to ensure we keep all pollutant residual records.
    # Any timestamp in the residuals table that doesn't have a matching
    # meteorology record will have NaN for the met columns.
    final_df = pd.merge(
        residuals_with_time_df,
        met_df_filter,
        on=['measured_time', 'station_id'],
        how='left'
    )

    # --- 5. Final Checks and Save ---
    print(f"Merge complete. The final dataset has {len(final_df)} rows.")
    
    # Check for any rows that didn't get a successful merge
    missing_met_rows = final_df['temperature_c'].isnull().sum()
    if missing_met_rows > 0:
        print(f"Warning: {missing_met_rows} rows did not have matching meteorology data.")
    
    final_df.dropna(subset=['temperature_c'], inplace=True)

    # Final conversion
    final_df['wind_dir_sin'] = np.sin(2 * np.pi * final_df['wind_direction_deg'] / 360.0)
    final_df['wind_dir_cos'] = np.cos(2 * np.pi * final_df['wind_direction_deg'] / 360.0)
    columns_to_drop = [
    'month',
    'dayofweek',
    'dayofyear',
    'hour',
    'wind_direction_deg'
    ]
    df_cleaned = final_df.drop(columns=columns_to_drop)

    df_cleaned.to_csv(output_filepath, index=False)
    print(f"Successfully saved the final modeling dataset to: {output_filepath}")
    
    return df_cleaned

residuals_file = r"C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\long_sensor_resids.csv" 
meteorology_file = r"C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5_meteorology_processed.csv" 
final_output_file = r"C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\final_modeling_dataset.csv"

final_dataset = merge_data_for_modeling(residuals_file, meteorology_file, final_output_file)

print(final_dataset.info(memory_usage='deep'))

Loading pollutant residuals from: C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\long_sensor_resids.csv
Loading meteorology data from: C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\era5_meteorology_processed.csv
Adding temporal features...
Temporal features added successfully.
Merging temporalized residuals with meteorology data...
Merge complete. The final dataset has 629013 rows.
Successfully saved the final modeling dataset to: C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\fr\final_modeling_dataset.csv
<class 'pandas.core.frame.DataFrame'>
Index: 628983 entries, 0 to 628982
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   measured_time      628983 non-null  datetime64[ns, UTC]
 1   station_id         628983 non-null  int64              
 2   NO2_resid          612098 non-null  float64            
 3   O3

<center>Step 6: Calculate final LOOCV and save results<center>

In [29]:
# Get final data from old df

old_df = pd.read_csv(r"C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\long_sensor_measurements5.csv")

def query_dataframe(df):
    """
    Creates a sample DataFrame and then filters and sorts it
    to match the logic of the provided SQL query.
    """
    # Create the sample DataFrame
    
    # --- Query Parameters ---
    start_date = '2024-06-01 00:00:00'
    end_date = '2025-05-31 23:59:59'
    measurements_to_fetch = ('O3', 'NO2', 'SO2', 'PM10', 'PM25')
    max_station_id = 27
    
    # --- Filter the DataFrame ---
    # The filtering logic is split into separate boolean masks for clarity.
    
    # 1. Filter by 'measurement' using the .isin() method
    measurement_mask = df['measurement'].isin(measurements_to_fetch)
    
    # 2. Filter by 'measured_time' using a date range
    time_mask = (df['measured_time'] >= start_date) & (df['measured_time'] <= end_date)
    
    # 3. Filter by 'station_id'
    station_mask = df['station_id'] <= max_station_id
    
    # Combine all masks to apply all filters at once
    filtered_df = df[measurement_mask & time_mask & station_mask]
    
    # --- Sort the DataFrame ---
    # The sort_values() method is used to order the data,
    # matching the SQL query's ORDER BY clause.
    sorted_df = filtered_df.sort_values(by=['measured_time', 'station_id'])
    
    # Reset the index after filtering and sorting for a clean result
    final_df = sorted_df.reset_index(drop=True)
    
    # --- Display the Result ---
    print("\nFiltered and Sorted DataFrame head:")
    print(final_df.head())
    print(f"\nTotal rows in final DataFrame: {len(final_df)}")
    return final_df

sensor_df = query_dataframe(old_df)


Filtered and Sorted DataFrame head:
              measured_time  station_id measurement  reading_value
0  2024-06-01 00:00:00+0000           1         NO2      29.406250
1  2024-06-01 00:00:00+0000           1          O3      34.500000
2  2024-06-01 00:00:00+0000           1        PM10      27.296875
3  2024-06-01 00:00:00+0000           1         SO2       4.101562
4  2024-06-01 00:00:00+0000           2         NO2      21.500000

Total rows in final DataFrame: 836979


In [27]:
# Get (final) data from my real-time db

import psycopg2
import pandas as pd
from psycopg2 import sql

def get_sensor_data():
    """
    Connects to the PostgreSQL database, retrieves sensor data for specific
    measurements within a date range, and returns it as a pandas DataFrame.

    Returns:
        pandas.DataFrame: A DataFrame containing the queried sensor data.
                          Returns None if an error occurs.
    """
    # --- Database Connection Details ---
    # !!! IMPORTANT: Replace these with your actual database credentials.
    DB_CONFIG = {
    "host": "localhost",
    "dbname": "platform_db",
    "user": "postgres",
    "password": "postgres"
    }

    conn = None
    try:
        # Establish the connection to the database
        print("Connecting to the PostgreSQL database...")
        conn = psycopg2.connect(**DB_CONFIG)
        print("Connection successful.")

        # --- Query Parameters ---
        start_date = '2024-06-01 00:00:00'
        end_date = '2025-05-31 23:59:59'
        measurements_to_fetch = ('O3', 'NO2', 'SO2', 'PM10', 'PM25')
        max_station_id = 27

        # --- Construct the SQL Query ---
        # Using sql.SQL for safe identifier quoting and placeholders for values
        # to prevent SQL injection.
        query = sql.SQL("""
            SELECT measured_time, station_id, measurement, reading_value
            FROM sensor_data
            WHERE measurement IN %s
            AND measured_time BETWEEN %s AND %s
            AND station_id <= %s
            ORDER BY measured_time, station_id;
        """)

        # Execute the query
        print("Executing query to fetch sensor data...")
        # Use a context manager for the cursor
        with conn.cursor() as cur:
            cur.execute(query, (measurements_to_fetch, start_date, end_date, max_station_id))

            # Fetch all the results
            results = cur.fetchall()

            # Get column names from the cursor description
            colnames = [desc[0] for desc in cur.description]

        print(f"Successfully fetched {len(results)} records.")

        # Create a pandas DataFrame from the results
        df = pd.DataFrame(results, columns=colnames)
        return df

    except psycopg2.Error as e:
        print(f"Database error: {e}")
        return None

    finally:
        # Make sure to close the connection
        if conn is not None:
            conn.close()
            print("Database connection closed.")


if __name__ == "__main__":
    # Call the function and get the data
    sensor_df = get_sensor_data()

    if sensor_df is not None:
        print("\n--- Sensor Data ---")
        # Display the first 5 rows of the DataFrame
        print("First 5 rows of the extracted data:")
        print(sensor_df.head())

        # Display some information about the DataFrame
        print("\nDataFrame Info:")
        sensor_df.info()
        #sensor_df.to_csv(r"C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\last_loocv\station_data.csv", index=False)

Connecting to the PostgreSQL database...
Connection successful.
Executing query to fetch sensor data...
Database connection closed.


KeyboardInterrupt: 

In [32]:
# View completeness
import plotly.graph_objects as go

def plot_completeness_by_parameter(sensor_df):
    """
    Pivots the data and plots the monthly data completeness for each station for a selected parameter.

    Args:
        sensor_df (pd.DataFrame): DataFrame containing the time series data from the database
                                      with 'measured_time', 'station_id', 'measurement', and 'reading_value'.
    """
    # Pivot the dataframe from long to wide format
    cleaned_data = sensor_df.pivot_table(
        index=['measured_time', 'station_id'],
        columns='measurement',
        values='reading_value'
    ).reset_index()

    # Rename columns to match original function's expectations
    cleaned_data.rename(columns={'measured_time': 'time', 'station_id': 'name'}, inplace=True)

    df_processed = cleaned_data.copy()
    df_processed['time'] = pd.to_datetime(df_processed['time'], utc=True)
    df_processed['month_year'] = df_processed['time'].dt.to_period('M')

    # Use the measurement types we fetched
    total_params = ['O3', 'NO2', 'SO2', 'PM10', 'PM25','WD']
    unique_stations = sorted(df_processed['name'].unique())

    fig = go.Figure()

    num_traces_per_param = len(unique_stations) + 2  # Stations + Mean + Median

    for i, param in enumerate(total_params):
        if param not in df_processed.columns:
            # If a parameter has no data at all, skip it.
            # And add empty traces to keep dropdown indexing correct
            for _ in range(num_traces_per_param):
                 fig.add_trace(go.Scatter(x=[], y=[], mode='lines', visible=(i==0)))
            continue

        # Calculate monthly completeness for the parameter
        monthly_completeness = df_processed.groupby(['month_year', 'name'], observed=False)[param].apply(lambda x: x.notnull().mean() * 100).reset_index()
        pivot_df = monthly_completeness.pivot(index='month_year', columns='name', values=param)
        pivot_df = pivot_df.reindex(columns=unique_stations, fill_value=0) # Ensure all stations are included
        pivot_df.index = pivot_df.index.astype(str)

        # Rank stations by overall completeness for the parameter
        station_overall_completeness = pivot_df.mean().sort_values()

        # Add traces for each station, ranked
        for station_name in station_overall_completeness.index:
            completeness_pct = station_overall_completeness[station_name]
            fig.add_trace(go.Scatter(
                x=pivot_df.index,
                y=pivot_df[station_name],
                name=f"Station {station_name} ({completeness_pct:.1f}%)",
                mode='lines',
                visible=(i == 0),
                line=dict(width=1.5),
                hoverinfo='x+y+name',
                showlegend=True
            ))

        # Calculate and add mean and median traces
        monthly_mean = pivot_df.mean(axis=1)
        monthly_median = pivot_df.median(axis=1)

        fig.add_trace(go.Scatter(
            x=monthly_mean.index, y=monthly_mean.values, name='Mean (All Stations)',
            mode='lines', line=dict(color='firebrick', width=3, dash='dash'),
            visible=(i == 0), hoverinfo='x+y'
        ))
        fig.add_trace(go.Scatter(
            x=monthly_median.index, y=monthly_median.values, name='Median (All Stations)',
            mode='lines', line=dict(color='black', width=4),
            visible=(i == 0), hoverinfo='x+y'
        ))

    # Create dropdown menu
    buttons = []
    for i, param in enumerate(total_params):
        visibility = [False] * (len(total_params) * num_traces_per_param)
        start_index = i * num_traces_per_param
        end_index = start_index + num_traces_per_param
        for j in range(start_index, end_index):
            visibility[j] = True

        button = dict(
            label=param,
            method='update',
            args=[{'visible': visibility},
                  {'title': f'Monthly Data Completeness for {param}'}]
        )
        buttons.append(button)

    fig.update_layout(
        updatemenus=[dict(
            active=0,
            buttons=buttons,
            direction="down",
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.01,
            xanchor="left",
            y=1.15,
            yanchor="top"
        )],
        title_text=f'Monthly Data Completeness for {total_params[0]}',
        xaxis_title='Month-Year',
        yaxis_title='Completeness (%)',
        yaxis_range=[-5, 105],
        legend_title_text='Stations (Ranked by Completeness)',
        hovermode='x unified',
        template='plotly_white',
        xaxis=dict(
            dtick="M1",
            tickformat="%b\n%Y",
            tickangle=0
        )
    )

    fig.show()

plot_completeness_by_parameter(sensor_df)



Converting to PeriodArray/Index representation will drop timezone information.



In [None]:
# Run evaluation script

import pandas as pd
import numpy as np
import re
import ast
import statsmodels.api as sm
from scipy.stats import skew
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import ElasticNetCV
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.spatial import cKDTree
from joblib import Parallel, delayed
import time
import traceback
from dateutil.relativedelta import relativedelta

# --- Configuration ---
STATION_INFO_FILE = "data/station_info3.csv"
STATION_DATA_FILE = "data/long_sensor_measurements5.csv"
SPATIAL_IV_PATH = "loocv/iv_values_process4.csv"
METEO_DATA_FILE = "loocv/era5_meteorology_processed.csv"
OUTPUT_FILE = "cv_results_f.csv"

# --- DEBUG MODE CONFIGURATION ---
# Set to True to run a quick test on a single month (June 2024).
# Set to False for the full cross-validation run.
DEBUG_MODE = False

# --- Time-Series Cross-Validation Configuration ---
TRAIN_WINDOW_MONTHS = 12
TEST_PERIOD_MONTHS = 1
CV_START_DATE = pd.to_datetime("2024-06-01", utc=True)

# --- Parallel Processing Configuration ---
NUM_CPUS = 31

# --- Model Configuration (Unchanged) ---
POLLUTANTS = ['O3', 'NO2', 'SO2', 'PM10', 'PM25']
LOG_CONSTANT = 1e-9
SKEW_THRESHOLD_DV = 0.75
SKEW_THRESHOLD_IV_TEMPORAL = 1.0
MIN_MODEL_VARS = 4
IDW_POWER = 2
SIGN_CONSTRAINTS = {
    'O3': {'lu_hdr': -1, 'lu_ldr': -1, 'lu_ind': -1, 'lu_ug': 1, 'lu_art': -1, 'lu_for': 1, 'lu_rur': 1, 'build_fp': 0, 'build_vol': 0, 'build_var': 0, 'ndvi': 1, 'elevation': 0, 'pop': 0, 'rd_mt': -1, 'rd_prim': -1, 'rd_res': -1, 'rd_sec': -1, 'rd_ter': -1, 'rd_dis': 1, 'bus_dis': 0},
    'NO2': {'lu_hdr': 1, 'lu_ldr': 1, 'lu_ind': 1, 'lu_ug': -1, 'lu_art': 1, 'lu_for': -1, 'lu_rur': -1, 'build_fp': 1, 'build_vol': 1, 'build_var': -1, 'ndvi': -1, 'elevation': -1, 'pop': 1, 'rd_mt': 1, 'rd_prim': 1, 'rd_res': 1, 'rd_sec': 1, 'rd_ter': 1, 'rd_dis': -1, 'bus_dis': -1},
    'PM': {'lu_hdr': 1, 'lu_ldr': 1, 'lu_ind': 1, 'lu_ug': -1, 'lu_art': 1, 'lu_for': -1, 'lu_rur': -1, 'build_fp': 1, 'build_vol': 1, 'build_var': -1, 'ndvi': -1, 'elevation': -1, 'pop': 1, 'rd_mt': 1, 'rd_prim': 1, 'rd_res': 1, 'rd_sec': 1, 'rd_ter': 1, 'rd_dis': -1, 'bus_dis': -1},
    'SO2': {'lu_hdr': 0, 'lu_ldr': 0, 'lu_ind': 1, 'lu_ug': -1, 'lu_art': 0, 'lu_for': -1, 'lu_rur': -1, 'build_fp': 1, 'build_vol': 1, 'build_var': -1, 'ndvi': -1, 'elevation': 0, 'pop': 1, 'rd_mt': 1, 'rd_prim': 1, 'rd_res': 1, 'rd_sec': 1, 'rd_ter': 1, 'rd_dis': -1, 'bus_dis': -1}
}


# ==============================================================================
# Helper & Step Functions (Unchanged)
# ==============================================================================
def transform_temporal_ivs(df, cols_to_check, skew_threshold):
    df_transformed = df.copy()
    transform_params = {}
    for col in cols_to_check:
        if col in df_transformed.columns and pd.api.types.is_numeric_dtype(df_transformed[col]):
            skewness = skew(df_transformed[col].dropna())
            if abs(skewness) > skew_threshold:
                min_val = df_transformed[col].min()
                transform_params[col] = {'min_val': min_val}
                df_transformed[col] = np.log1p(df_transformed[col] - min_val)
    return df_transformed, transform_params

def apply_saved_temporal_transformations(df, transform_params):
    df_transformed = df.copy()
    for col, params in transform_params.items():
        if col in df_transformed.columns:
            min_val = params['min_val']
            df_transformed[col] = np.log1p(df_transformed[col] - min_val)
    return df_transformed

def prepare_dependent_variable(pollutant_df, pollutant):
    df = pollutant_df.copy()
    station_averages = df.groupby('station_id')['reading_value'].mean().reset_index()
    dv_col_name = f'avg_{pollutant}'
    station_averages.rename(columns={'reading_value': dv_col_name}, inplace=True)
    pollutant_skewness = skew(station_averages[dv_col_name].dropna())
    log_transform_decision = False
    if abs(pollutant_skewness) > SKEW_THRESHOLD_DV:
        station_averages[dv_col_name] = np.log(station_averages[dv_col_name] + LOG_CONSTANT)
        log_transform_decision = True
    return station_averages, dv_col_name, log_transform_decision

def prepare_spatial_dataset(station_identifiers_df, all_station_ivs_df):
    iv_cols_to_keep = ['id']
    rename_map = {}
    for col in all_station_ivs_df.columns:
        if col == 'id': continue
        if col.endswith('_cir'):
            new_col_name = col[:-4]
            iv_cols_to_keep.append(col)
            rename_map[col] = new_col_name
        elif not re.search(r"_\d+m_", col):
            iv_cols_to_keep.append(col)
    circular_ivs_df = all_station_ivs_df[iv_cols_to_keep].copy()
    circular_ivs_df.rename(columns=rename_map, inplace=True)
    merged_df = pd.merge(station_identifiers_df, circular_ivs_df, left_on='station_id', right_on='id')
    return merged_df

def step1_model_calibration(df, dv_name, a_priori_signs, min_vars):
    VIF_OVERRIDE_THRESHOLD = 10
    iv_names = [col for col in df.columns if col not in [dv_name, 'id', 'station_id']]
    df_clean = df.dropna(subset=[dv_name] + iv_names).copy()
    num_stations = len(df_clean)
    if num_stations < min_vars + 1: return None
    y, X_all = df_clean[dv_name], df_clean[iv_names]
    def get_X_with_const(X_df): return sm.add_constant(X_df, has_constant='add')
    def parse_base_var(col_name): return col_name.split('_')[0]
    def check_signs(coeffs, a_priori):
        for var, coeff in coeffs.drop('const', errors='ignore').items():
            base_var = parse_base_var(var)
            if (a_priori.get(base_var, 0) == 1 and coeff < 0) or \
               (a_priori.get(base_var, 0) == -1 and coeff > 0): return False
        return True
    best_initial_var, max_r2 = None, -1.0
    for iv in iv_names:
        model = sm.OLS(y, get_X_with_const(X_all[[iv]])).fit()
        if check_signs(model.params, a_priori_signs) and model.rsquared > max_r2: max_r2, best_initial_var = model.rsquared, iv
    if not best_initial_var: return None
    included_vars, current_r2 = [best_initial_var], max_r2
    while len(included_vars) < max(MIN_MODEL_VARS, num_stations // 3):
        best_new_var, best_r2_increase = None, 0.0
        for iv in [v for v in iv_names if v not in included_vars]:
            model = sm.OLS(y, get_X_with_const(X_all[included_vars + [iv]])).fit()
            r2_increase = model.rsquared - current_r2
            if r2_increase >= 0.01 and check_signs(model.params, a_priori_signs) and r2_increase > best_r2_increase:
                best_r2_increase, best_new_var = r2_increase, iv
        if best_new_var:
            included_vars.append(best_new_var)
            current_r2 += best_r2_increase
        else: break
    final_vars = included_vars.copy()
    while len(final_vars) > min_vars:
        X_current = get_X_with_const(X_all[final_vars])
        model_current = sm.OLS(y, X_current).fit()
        pvalues = model_current.pvalues.drop('const', errors='ignore')
        vif = pd.Series([variance_inflation_factor(X_current.values, i) for i in range(X_current.shape[1])], index=X_current.columns).drop('const', errors='ignore')
        candidates = pd.DataFrame({'vif': vif, 'pvalue': pvalues})
        candidates_to_remove = candidates[(candidates['vif'] > 10) | (candidates['pvalue'] > 0.05)]
        if candidates_to_remove.empty: break
        candidates_to_remove = candidates_to_remove.sort_values(by=['vif', 'pvalue'], ascending=False)
        variable_removed_this_pass = False
        for var_to_remove, stats in candidates_to_remove.iterrows():
            if stats.vif > VIF_OVERRIDE_THRESHOLD:
                final_vars.remove(var_to_remove)
                variable_removed_this_pass = True
                break
            temp_vars = [v for v in final_vars if v != var_to_remove]
            temp_model = sm.OLS(y, get_X_with_const(X_all[temp_vars])).fit()
            if check_signs(temp_model.params, a_priori_signs):
                final_vars.remove(var_to_remove)
                variable_removed_this_pass = True
                break
        if not variable_removed_this_pass: break
    final_model = sm.OLS(y, get_X_with_const(X_all[final_vars])).fit()
    return final_model

def step1_predict(model, iv_data, log_transformed_dv, log_constant):
    predictor_vars = [v for v in model.params.index if v != 'const']
    X_predict = iv_data[predictor_vars]
    X_predict = sm.add_constant(X_predict, has_constant='add', prepend=True)
    predictions_transformed = model.predict(X_predict)
    if log_transformed_dv:
        return np.exp(predictions_transformed) - log_constant
    return predictions_transformed

def calculate_residual_table(hourly_data_df, station_predictions_series):
    df = hourly_data_df.copy()
    df['step1_pred'] = df['station_id'].map(station_predictions_series)
    df['residual'] = df['reading_value'] - df['step1_pred']
    return df[['station_id', 'measured_time', 'residual']].dropna()

def prepare_temporal_dataset(data_df, meteorology_df):
    df = data_df.copy()
    df['hour_sin'] = np.sin(2 * np.pi * df['measured_time'].dt.hour / 24.0)
    df['hour_cos'] = np.cos(2 * np.pi * df['measured_time'].dt.hour / 24.0)
    df['dayofyear_sin'] = np.sin(2 * np.pi * df['measured_time'].dt.dayofyear / 365.25)
    df['dayofyear_cos'] = np.cos(2 * np.pi * df['measured_time'].dt.dayofyear / 365.25)
    df['weekend'] = (df['measured_time'].dt.dayofweek >= 5).astype(int)
    final_df = pd.merge(df, meteorology_df, on=['measured_time', 'station_id'], how='left')
    final_df['wind_dir_sin'] = np.sin(2 * np.pi * final_df['wind_direction_deg'] / 360.0)
    final_df['wind_dir_cos'] = np.cos(2 * np.pi * final_df['wind_direction_deg'] / 360.0)
    return final_df.drop(columns=['wind_direction_deg']).dropna()

def step2_model_calibration(df, dv_name, continuous_predictors, binary_predictors):
    all_predictors = continuous_predictors + binary_predictors
    X = df[all_predictors]
    y = df[dv_name]
    scaler = StandardScaler()
    X_scaled_continuous = scaler.fit_transform(X[continuous_predictors])
    X_scaled_continuous_df = pd.DataFrame(X_scaled_continuous, index=X.index, columns=continuous_predictors)
    X_processed = pd.concat([X_scaled_continuous_df, X[binary_predictors]], axis=1)
    cv = KFold(n_splits=10, shuffle=True, random_state=42)
    l1_ratios = np.linspace(0.1, 1.0, 10)
    model = ElasticNetCV(l1_ratio=l1_ratios, cv=cv, random_state=42, n_jobs=1, max_iter=10000)
    model.fit(X_processed, y)
    return model, scaler, all_predictors

def step2_predict(model, scaler, df, all_predictors, continuous_predictors, binary_predictors):
    if df.empty or not all(p in df.columns for p in all_predictors): return pd.Series(dtype=float)
    X = df[all_predictors]
    X_scaled_continuous = scaler.transform(X[continuous_predictors])
    X_scaled_continuous_df = pd.DataFrame(X_scaled_continuous, index=X.index, columns=continuous_predictors)
    X_processed = pd.concat([X_scaled_continuous_df, X[binary_predictors]], axis=1)
    return pd.Series(model.predict(X_processed), index=X.index)

def cv_point_idw(values, coords, target_coord, power, n_neighbors=15, tree=None):
    if len(coords) < 1: return np.nan
    if tree is None: tree = cKDTree(coords)
    k = min(n_neighbors, len(coords))
    distances, indices = tree.query(np.array(target_coord).reshape(1, -1), k=k)
    distances, indices = distances.flatten(), indices.flatten()
    if distances.min() < 1e-9:
        return values[indices[np.argmin(distances)]]
    weights = 1.0 / (distances ** power)
    weighted_sum = np.sum(weights * values[indices])
    sum_of_weights = np.sum(weights)
    return weighted_sum / sum_of_weights if sum_of_weights > 0 else np.nan

# ==============================================================================
# MAIN WORKER FUNCTION (VECTORIZED AND OPTIMIZED)
# ==============================================================================
def process_fold(fold_id, pollutant, test_station_id, train_data, test_data, station_info_df,
    spatial_iv_df, meteo_df, all_stations_ivs, station_coords_map, station_data_df):
    try:
        # --- Step 1 & 2 Setup (Unchanged) ---
        pollutant_signs = SIGN_CONSTRAINTS.get('PM') if pollutant.startswith('PM') else SIGN_CONSTRAINTS.get(pollutant, {})
        li_station_ids = station_info_df[station_info_df['id'] != test_station_id]['id'].tolist()
        train_li_data = train_data[train_data['station_id'].isin(li_station_ids)]
        if train_li_data.empty: return []

        dv_df, dv_col, is_log_dv = prepare_dependent_variable(train_li_data, pollutant)
        step1_training_df = prepare_spatial_dataset(dv_df, spatial_iv_df)
        step1_model = step1_model_calibration(step1_training_df, dv_col, pollutant_signs, MIN_MODEL_VARS)
        if step1_model is None: return []

        step1_pred_for_test_station = step1_predict(step1_model, all_stations_ivs[all_stations_ivs['station_id'] == test_station_id], is_log_dv, LOG_CONSTANT).iloc[0]
        step1_preds_for_li_stations = step1_predict(step1_model, all_stations_ivs[all_stations_ivs['station_id'].isin(li_station_ids)], is_log_dv, LOG_CONSTANT)
        step1_preds_for_li_stations.index = all_stations_ivs[all_stations_ivs['station_id'].isin(li_station_ids)]['station_id']
        
        resid_df = calculate_residual_table(train_li_data, step1_preds_for_li_stations)
        if resid_df.empty: return []
        
        step2_training_data = prepare_temporal_dataset(resid_df, meteo_df)
        if step2_training_data.empty: return []
        
        meteo_vars_to_check = ['temperature_c', 'relative_humidity', 'wind_speed_ms']
        step2_training_data_transformed, transform_params = transform_temporal_ivs(step2_training_data, meteo_vars_to_check, SKEW_THRESHOLD_IV_TEMPORAL)
        continuous_vars = ['hour_sin', 'hour_cos', 'dayofyear_sin', 'dayofyear_cos'] + meteo_vars_to_check + ['wind_dir_sin', 'wind_dir_cos']
        binary_vars = ['weekend']
        step2_model, scaler, predictors = step2_model_calibration(step2_training_data_transformed, 'residual', continuous_vars, binary_vars)

        # --- Vectorized Step 2 Prediction for Test Station ---
        results_df = test_data.copy()
        step2_test_data_raw = prepare_temporal_dataset(results_df, meteo_df)
        step2_test_data_transformed = apply_saved_temporal_transformations(step2_test_data_raw, transform_params)
        step2_predicted_residuals = step2_predict(step2_model, scaler, step2_test_data_transformed, predictors, continuous_vars, binary_vars)
        step2_predicted_residuals.index = step2_test_data_transformed['measured_time']
        
        results_df = pd.merge(results_df, step2_predicted_residuals.rename('step2_modelled_residual'), left_on='measured_time', right_index=True, how='left')
        results_df['step2_modelled_residual'].fillna(0, inplace=True)
        results_df['step1_prediction'] = step1_pred_for_test_station
        results_df['step1_residual'] = results_df['reading_value'] - results_df['step1_prediction']
        
        mask = (np.sign(results_df['step2_modelled_residual']) != np.sign(results_df['step1_residual'])) & (results_df['step1_residual'] != 0)
        results_df.loc[mask, 'step2_modelled_residual'] = 0
        
        results_df['step2_prediction'] = results_df['step1_prediction'] + results_df['step2_modelled_residual']
        results_df['step2_residual'] = results_df['reading_value'] - results_df['step2_prediction']

        # --- Vectorized Step 3 IDW Calculation ---
        test_period_start, test_period_end = results_df['measured_time'].min(), results_df['measured_time'].max()
        li_stations_test_period_data = station_data_df[(station_data_df['measured_time'].between(test_period_start, test_period_end)) & (station_data_df['station_id'].isin(li_station_ids)) & (station_data_df['measurement'] == pollutant)]

        if not li_stations_test_period_data.empty:
            li_temporal_raw = prepare_temporal_dataset(li_stations_test_period_data, meteo_df)
            li_temporal_transformed = apply_saved_temporal_transformations(li_temporal_raw, transform_params)
            li_step2_preds = step2_predict(step2_model, scaler, li_temporal_transformed, predictors, continuous_vars, binary_vars)
            li_temporal_raw = li_temporal_raw.loc[li_step2_preds.index].copy()
            li_temporal_raw['s2_resid_pred'] = li_step2_preds
            li_temporal_raw['s1_pred'] = li_temporal_raw['station_id'].map(step1_preds_for_li_stations)
            li_temporal_raw['step2_error'] = li_temporal_raw['reading_value'] - (li_temporal_raw['s1_pred'] + li_temporal_raw['s2_resid_pred'])
            
            target_coords = station_coords_map[test_station_id]
            
            def calculate_idw_for_group(group):
                if len(group) < 2: return pd.Series({'idw_observed_prediction': np.nan, 'idw_error_correction': 0})
                coords = [station_coords_map[sid] for sid in group['station_id']]
                idw_obs = cv_point_idw(group['reading_value'].values, coords, target_coords, power=IDW_POWER)
                idw_err = cv_point_idw(group['step2_error'].values, coords, target_coords, power=IDW_POWER)
                return pd.Series({'idw_observed_prediction': idw_obs, 'idw_error_correction': idw_err})

            idw_results = li_temporal_raw.groupby('measured_time').apply(calculate_idw_for_group)
            results_df = pd.merge(results_df, idw_results, left_on='measured_time', right_index=True, how='left')
        else:
            results_df['idw_observed_prediction'] = np.nan
            results_df['idw_error_correction'] = 0

        results_df.fillna({'idw_error_correction': 0, 'idw_observed_prediction': np.nan}, inplace=True)
        
        mask_idw = (np.sign(results_df['idw_error_correction']) != np.sign(results_df['step2_residual'])) & (results_df['step2_residual'] != 0)
        results_df.loc[mask_idw, 'idw_error_correction'] = 0
        
        results_df['final_prediction'] = results_df['step2_prediction'] + results_df['idw_error_correction']
        results_df['final_residual'] = results_df['reading_value'] - results_df['final_prediction']
        
        results_df['fold_id'] = fold_id
        results_df['pollutant'] = pollutant
        results_df.rename(columns={'reading_value': 'observed_value'}, inplace=True)
        results_df['timestamp'] = results_df['measured_time']
        
        final_cols = ['fold_id', 'pollutant', 'station_id', 'timestamp', 'observed_value', 'idw_observed_prediction', 'step1_prediction', 'step1_residual', 'step2_modelled_residual', 'step2_prediction', 'step2_residual', 'idw_error_correction', 'final_prediction', 'final_residual']
        return results_df[final_cols].to_dict('records')

    except Exception:
        print(f"!!! ERROR in Fold {fold_id} for {pollutant} at station {test_station_id} !!!", flush=True)
        traceback.print_exc()
        return []

# ==============================================================================
# MAIN SCRIPT EXECUTION
# ==============================================================================
def main():
    print(f"--- Starting Three-Step Time-Series Cross-Validation (Hybrid Window) ---", flush=True)
    start_time = time.time()
    
    cv_start_date_to_use = CV_START_DATE
    debug_end_date = None
    if DEBUG_MODE:
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", flush=True)
        print("!!!               RUNNING IN DEBUG MODE                     !!!", flush=True)
        print("!!!    Evaluation is limited to the first test month only.    !!!", flush=True)
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", flush=True)
        cv_start_date_to_use = pd.to_datetime("2024-06-01", utc=True)
        debug_end_date = cv_start_date_to_use + relativedelta(months=TEST_PERIOD_MONTHS)
        
    print("Loading data files...", flush=True)
    try:
        station_info_df = pd.read_csv(STATION_INFO_FILE)
        station_info_df['coords'] = station_info_df['location'].apply(ast.literal_eval)
        station_data_df = pd.read_csv(STATION_DATA_FILE)
        station_data_df['measured_time'] = pd.to_datetime(station_data_df['measured_time'], utc=True)
        spatial_iv_df = pd.read_csv(SPATIAL_IV_PATH)
        meteo_df = pd.read_csv(METEO_DATA_FILE)
        meteo_df['measured_time'] = pd.to_datetime(meteo_df['measured_time'], utc=True)
    except FileNotFoundError as e:
        print(f"Error loading data: {e}.", flush=True)
        return
        
    print("Pre-computing shared data objects...", flush=True)
    all_stations_ivs = prepare_spatial_dataset(station_info_df[['id']].rename(columns={'id':'station_id'}), spatial_iv_df)
    station_coords_map = station_info_df.set_index('id')['coords'].to_dict()
    
    tasks = []
    for pollutant in POLLUTANTS:
        pollutant_data = station_data_df[station_data_df['measurement'] == pollutant].copy().sort_values('measured_time')
        if pollutant_data.empty: continue
        print(f"\nGenerating hybrid window folds for {pollutant}...", flush=True)
        pollutant_start_date = pollutant_data['measured_time'].min()
        overall_end_date = pollutant_data['measured_time'].max()
        first_possible_test_start = pollutant_start_date + relativedelta(months=1)
        actual_cv_start_date = max(cv_start_date_to_use, first_possible_test_start)
        current_test_start = actual_cv_start_date
        fold_id = 1
        if current_test_start >= overall_end_date:
            print(f"Not enough data for {pollutant} to create any test folds after {cv_start_date_to_use.date()}. Skipping.", flush=True)
            continue
        while True:
            if current_test_start >= overall_end_date:
                print(f"Stopping fold generation for {pollutant}. Test period start ({current_test_start.date()}) is past available data.", flush=True)
                break
            if DEBUG_MODE and debug_end_date and current_test_start >= debug_end_date:
                print(f"DEBUG MODE: Reached debug end date ({debug_end_date.date()}). Stopping fold generation for {pollutant}.", flush=True)
                break
            current_test_end = current_test_start + relativedelta(months=TEST_PERIOD_MONTHS)
            theoretical_train_start = current_test_start - relativedelta(months=TRAIN_WINDOW_MONTHS)
            actual_train_start = max(theoretical_train_start, pollutant_start_date)
            train_fold_data = pollutant_data[(pollutant_data['measured_time'] >= actual_train_start) & (pollutant_data['measured_time'] < current_test_start)]
            if train_fold_data.empty:
                 print(f"Skipping Fold {fold_id} for {pollutant}: No training data found for period starting {actual_train_start.date()}.", flush=True)
                 current_test_start += relativedelta(months=TEST_PERIOD_MONTHS)
                 fold_id += 1
                 continue
            print(f"Fold {fold_id}: Train Period = {actual_train_start.date()} to {current_test_start.date()}, Test Period = {current_test_start.date()} to {current_test_end.date()}", flush=True)
            for station_id in station_info_df['id']:
                test_fold_data = pollutant_data[(pollutant_data['measured_time'] >= current_test_start) & (pollutant_data['measured_time'] < current_test_end) & (pollutant_data['station_id'] == station_id)]
                if not test_fold_data.empty:
                    task_args = (fold_id, pollutant, station_id, train_fold_data, test_fold_data, station_info_df, spatial_iv_df, meteo_df, all_stations_ivs, station_coords_map, station_data_df)
                    tasks.append(task_args)
            current_test_start += relativedelta(months=TEST_PERIOD_MONTHS)
            fold_id += 1
            
    if not tasks:
        print("No tasks created. Check date ranges and data availability.", flush=True)
        return
        
    print(f"\nCreated {len(tasks)} total tasks to run in parallel on {NUM_CPUS} CPUs.", flush=True)
    results_list = Parallel(n_jobs=NUM_CPUS, verbose=10)(delayed(process_fold)(*task) for task in tasks)
    
    print("\nParallel processing complete. Aggregating results...", flush=True)
    final_results = [item for sublist in results_list if sublist is not None for item in sublist]
    
    if final_results:
        results_df = pd.DataFrame(final_results)
        print(f"Sorting {len(results_df)} results before saving...", flush=True)
        sort_order = ['pollutant', 'fold_id', 'timestamp', 'station_id']
        results_df.sort_values(by=sort_order, inplace=True)
        results_df.to_csv(OUTPUT_FILE, index=False)
        print(f"Successfully saved sorted results to {OUTPUT_FILE}", flush=True)
    else:
        print("No results were generated.", flush=True)
        
    end_time = time.time()
    print(f"Total execution time: {(end_time - start_time) / 60:.2f} minutes", flush=True)

if __name__ == '__main__':
    main()

--- Starting Air Quality Model Analysis ---
Successfully loaded 'cv_results_f.csv'

Generating: Concentration-based Accuracy Reports...


  baseline_metrics = df.groupby('pollutant').apply(safe_calculate_group_metrics, 'observed_value', 'idw_observed_prediction').reset_index()
  final_prediction_metrics = df.groupby('pollutant').apply(safe_calculate_group_metrics, 'observed_value', 'final_prediction').reset_index()
  hourly_metrics_final = df.groupby(['pollutant', 'hour']).apply(safe_calculate_group_metrics, 'observed_value', 'final_prediction').reset_index()
  hourly_metrics_baseline = df.groupby(['pollutant', 'hour']).apply(safe_calculate_group_metrics, 'observed_value', 'idw_observed_prediction').reset_index()
  monthly_metrics_final = df.groupby(['pollutant', 'month']).apply(safe_calculate_group_metrics, 'observed_value', 'final_prediction').reset_index()
  monthly_metrics_baseline = df.groupby(['pollutant', 'month']).apply(safe_calculate_group_metrics, 'observed_value', 'idw_observed_prediction').reset_index()
  station_metrics_final = df.groupby(['pollutant', 'station_id']).apply(safe_calculate_group_metrics, 'obse

...All concentration-based reports Done!

Generating: Composite EAQI & Driving Pollutant Reports...
...EAQI reports Done!

--- Analysis Complete ---
All report files saved in: C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\last_loocv\processed_res


In [None]:
# Visualize results

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
from sklearn.metrics import confusion_matrix

def create_plots(base_path):
    """
    Main function to load summary CSVs and generate plots using Matplotlib and Seaborn.

    Args:
        base_path (str): The absolute path to the directory containing the summary CSVs.
    """
    print("--- Starting Plot Generation (Matplotlib) ---")

    # --- Setup ---
    plots_path = os.path.join(base_path, 'plots')
    if not os.path.exists(plots_path):
        os.makedirs(plots_path)
        print(f"Created directory: {plots_path}")

    # --- File Paths ---
    hourly_accuracy_file = os.path.join(base_path, 'hourly_accuracy.csv')
    monthly_accuracy_file = os.path.join(base_path, 'monthly_accuracy.csv')
    stepwise_accuracy_file = os.path.join(base_path, 'stepwise_accuracy.csv')
    station_accuracy_file = os.path.join(base_path, 'station_accuracy.csv')
    eaqi_details_file = os.path.join(base_path, 'eaqi_calculation_details.csv')

    # Helper function to save plots
    def save_plot(figure, filename):
        filepath = os.path.join(plots_path, filename)
        figure.savefig(filepath, dpi=300, bbox_inches='tight')
        print(f"Saved: {filename}")
        plt.close(figure) # Close the figure to free up memory

    # --- 1. Boxplots for Hourly Metrics ---
    try:
        df_hourly = pd.read_csv(hourly_accuracy_file)
        df_hourly_melted = df_hourly.melt(
            id_vars=['pollutant', 'hour'],
            value_vars=['r2_baseline', 'r2_final', 'mae_baseline', 'mae_final', 'rmse_baseline', 'rmse_final'],
            var_name='metric_model', value_name='value'
        )
        df_hourly_melted[['metric', 'model']] = df_hourly_melted['metric_model'].str.split('_', expand=True)
        df_hourly_melted['model'] = df_hourly_melted['model'].replace({'baseline': 'Baseline', 'final': 'Final Prediction'})

        for metric in ['r2', 'mae', 'rmse']:
            fig, ax = plt.subplots(figsize=(12, 7))
            sns.boxplot(
                data=df_hourly_melted[df_hourly_melted['metric'] == metric],
                x='pollutant', y='value', hue='model', ax=ax
            )
            ax.set_title(f'Hourly {metric.upper()} Distribution by Pollutant', fontsize=16)
            ax.set_xlabel('Pollutant', fontsize=12)
            ax.set_ylabel(f'{metric.upper()} Value', fontsize=12)
            ax.grid(axis='y', linestyle='--', alpha=0.7)
            save_plot(fig, f'hourly_boxplot_{metric}.png')

    except FileNotFoundError:
        print(f"Warning: '{hourly_accuracy_file}' not found. Skipping hourly boxplots.")

    # --- 2. Line Charts for Hourly and Monthly Accuracy ---
    def create_line_charts(df, time_col, title_prefix, filename_prefix):
        pollutants = sorted(df['pollutant'].unique())
        for metric in ['r2', 'mae', 'rmse']:
            fig, axes = plt.subplots(len(pollutants), 1, figsize=(12, 5 * len(pollutants)), sharex=True)
            if len(pollutants) == 1: axes = [axes] # Ensure axes is iterable for a single pollutant
            
            for ax, pollutant in zip(axes, pollutants):
                df_pollutant = df[df['pollutant'] == pollutant]
                ax.plot(df_pollutant[time_col], df_pollutant[f'{metric}_baseline'], marker='o', linestyle='-', label='Baseline')
                ax.plot(df_pollutant[time_col], df_pollutant[f'{metric}_final'], marker='x', linestyle='--', label='Final Prediction')
                ax.set_ylabel(metric.upper())
                ax.set_title(pollutant)
                ax.grid(True, linestyle='--', alpha=0.6)
            
            handles, labels = axes[0].get_legend_handles_labels()
            fig.legend(handles, labels, loc='upper right')
            fig.suptitle(f'{title_prefix} {metric.upper()} by Pollutant', fontsize=18, y=1.02)
            plt.xlabel(time_col.capitalize())
            plt.tight_layout(rect=[0, 0, 1, 0.98])
            save_plot(fig, f'{filename_prefix}_{metric}.png')

    try:
        df_hourly = pd.read_csv(hourly_accuracy_file)
        create_line_charts(df_hourly, 'hour', 'Hourly Mean', 'hourly_line')
    except FileNotFoundError:
        print(f"Warning: '{hourly_accuracy_file}' not found. Skipping hourly line charts.")

    try:
        df_monthly = pd.read_csv(monthly_accuracy_file)
        create_line_charts(df_monthly, 'month', 'Monthly Mean', 'monthly_line')
    except FileNotFoundError:
        print(f"Warning: '{monthly_accuracy_file}' not found. Skipping monthly line charts.")

    # --- 3. Bar Charts for Stepwise Accuracy ---
    try:
        df_stepwise = pd.read_csv(stepwise_accuracy_file)
        for metric in ['r2', 'mae', 'rmse']:
            fig, ax = plt.subplots(figsize=(12, 7))
            sns.barplot(data=df_stepwise, x='step', y=metric, hue='pollutant', ax=ax, order=['step1', 'step2', 'final'])
            ax.set_title(f'Stepwise Model Performance: {metric.upper()}', fontsize=16)
            ax.set_xlabel('Prediction Step', fontsize=12)
            ax.set_ylabel(metric.upper(), fontsize=12)
            ax.grid(axis='y', linestyle='--', alpha=0.7)
            save_plot(fig, f'stepwise_accuracy_{metric}.png')
    except FileNotFoundError:
        print(f"Warning: '{stepwise_accuracy_file}' not found. Skipping stepwise bar charts.")

    # --- 4. Bar Charts for Station Accuracy ---
    try:
        df_station = pd.read_csv(station_accuracy_file)
        df_station_melted = df_station.melt(
            id_vars=['pollutant', 'station_id'], value_vars=['r2_baseline', 'r2_final'],
            var_name='metric_model', value_name='r2_value'
        )
        df_station_melted['model'] = df_station_melted['metric_model'].str.replace('r2_', '').str.capitalize().replace({'Baseline': 'Baseline', 'Final': 'Final Prediction'})

        for pollutant in df_station_melted['pollutant'].unique():
            df_pollutant = df_station_melted[df_station_melted['pollutant'] == pollutant].sort_values('r2_value')
            top_5 = df_pollutant.tail(10); bottom_5 = df_pollutant.head(10)
            df_plot = pd.concat([bottom_5, top_5])

            fig, ax = plt.subplots(figsize=(15, 8))
            sns.barplot(data=df_plot, x='station_id', y='r2_value', hue='model', ax=ax)
            ax.set_title(f'Best and Worst Station R² for {pollutant}', fontsize=16)
            ax.set_xlabel('Station ID', fontsize=12)
            ax.set_ylabel('R² Score', fontsize=12)
            ax.tick_params(axis='x', rotation=45)
            ax.grid(axis='y', linestyle='--', alpha=0.7)
            save_plot(fig, f'station_accuracy_r2_{pollutant}.png')
    except FileNotFoundError:
        print(f"Warning: '{station_accuracy_file}' not found. Skipping station accuracy bar charts.")

    # --- 5. Confusion Matrices for EAQI ---
    try:
        df_eaqi = pd.read_csv(eaqi_details_file).dropna(subset=['observed_EAQI', 'predicted_EAQI_final', 'predicted_EAQI_baseline'])
        eaqi_levels = sorted(df_eaqi['observed_EAQI'].unique())
        eaqi_labels = [f'Level {int(l)}' for l in eaqi_levels]
        models = {'final': 'predicted_EAQI_final', 'baseline': 'predicted_EAQI_baseline'}

        for model_name, pred_col in models.items():
            cm = confusion_matrix(df_eaqi['observed_EAQI'], df_eaqi[pred_col], labels=eaqi_levels)
            with np.errstate(divide='ignore', invalid='ignore'):
                cm_normalized = np.nan_to_num(cm.astype('float') / cm.sum(axis=1)[:, np.newaxis])
            
            fig, ax = plt.subplots(figsize=(10, 8))
            sns.heatmap(cm_normalized, annot=True, fmt=".2f", cmap="Blues", ax=ax, xticklabels=eaqi_labels, yticklabels=eaqi_labels)
            ax.set_title(f'EAQI Normalized Confusion Matrix ({model_name.capitalize()} Model)', fontsize=16)
            ax.set_xlabel('Predicted EAQI Level', fontsize=12)
            ax.set_ylabel('Observed EAQI Level', fontsize=12)
            save_plot(fig, f'eaqi_confusion_matrix_{model_name}.png')

    except FileNotFoundError:
        print(f"Warning: '{eaqi_details_file}' not found. Skipping EAQI confusion matrix plots.")
    except Exception as e:
        print(f"An error occurred during EAQI confusion matrix generation: {e}")

    print("\n--- Plot Generation Complete ---")
    print(f"All plots saved in: {plots_path}")

if __name__ == '__main__':
    processed_results_path = r'C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\last_loocv\processed_res'
    try:
        import matplotlib
        import seaborn
        import sklearn
    except ImportError:
        print("\n---\nWarning: Required libraries are not installed. Please install them to run this script:")
        print("pip install pandas matplotlib seaborn scikit-learn\n---")
        exit()
    create_plots(processed_results_path)



--- Starting Plot Generation (Matplotlib) ---
Saved: hourly_boxplot_r2.png
Saved: hourly_boxplot_mae.png
Saved: hourly_boxplot_rmse.png
Saved: hourly_line_r2.png
Saved: hourly_line_mae.png
Saved: hourly_line_rmse.png
Saved: monthly_line_r2.png
Saved: monthly_line_mae.png
Saved: monthly_line_rmse.png
Saved: stepwise_accuracy_r2.png
Saved: stepwise_accuracy_mae.png
Saved: stepwise_accuracy_rmse.png
Saved: station_accuracy_r2_NO2.png
Saved: station_accuracy_r2_O3.png
Saved: station_accuracy_r2_PM10.png
Saved: station_accuracy_r2_PM2.5.png
Saved: station_accuracy_r2_SO2.png
Saved: eaqi_confusion_matrix_final.png
Saved: eaqi_confusion_matrix_baseline.png

--- Plot Generation Complete ---
All plots saved in: C:\Users\Austin\Documents\DATABANK\Masters\Thesis\Code\final_method\last_loocv\processed_res\plots
