In [1]:
import xarray as xr
import pandas as pd
import numpy as np
import sys
sys.path.insert(0, '/home/jupyter/')
from hourly_pollution_prediction.process_and_join.bbox import * 

# Latitude / longitude limits (presumably the bounding box, in decimal
# degrees, used with the bbox helpers imported above -- TODO confirm).
lat_min, lat_max = 28.6, 33.4
lon_min, lon_max = -98.9, -88.3


In [9]:
import os
from concurrent.futures import ProcessPoolExecutor

# Directory scanned for input `.nc` files.
path = '../../data/full_month_data'
# os.listdir returns entries in arbitrary order; sort so the file list (and
# any positional slice of it, such as nc_files[15:]) is reproducible.
nc_files = sorted(os.path.join(path, f) for f in os.listdir(path) if f.endswith('.nc'))
# Destination for the per-timestep files. The directory itself is created in
# the `__main__` block below, right before the workers start writing.
output_dir = '../../data/model_data/EPA_sensor_only'

# To resume a partially finished run, restrict the list, e.g.:
# nc_files = nc_files[15:]

def clean_bounds_attrs(ds):
    """Remove every non-string 'bounds' attribute from the data variables of `ds`.

    The attribute dictionaries are edited in place; string-valued 'bounds'
    attributes and all other attributes are left alone.

    Parameters
    ----------
    ds : dataset-like
        Object exposing `data_vars` and item access returning objects with an
        `attrs` mapping.

    Returns
    -------
    The same `ds` object that was passed in.
    """
    for name in ds.data_vars:
        attrs = ds[name].attrs
        if 'bounds' in attrs and not isinstance(attrs['bounds'], str):
            del attrs['bounds']
    return ds

# Names of the dataset variables kept in every exported time slice;
# process_file subsets each opened file with ds[var].
var = [
    # variables carrying the `_tempo` suffix
    'vertical_column_troposphere_tempo',
    'vertical_column_troposphere_uncertainty_tempo',
    'eff_cloud_fraction_tempo',
    # variables carrying the `_weather` suffix
    'PRES_weather',
    'TMP_weather',
    'DPT_weather',
    'SPFH_weather',
    'WDIR_weather',
    'WIND_weather',
    'GUST_weather',
    'VIS_weather',
    'TCDC_weather',
    'ACPC01_weather',
    'LWGNT_weather',
    'SWGNT_weather',
    # variables carrying the `_geoscf` suffix
    'NO2_geoscf',
    'NO_geoscf',
    # remaining variables without a shared suffix
    'nox_mass_lbs_emissions',
    'impervious_area',
    'population',
    'road_density',
    'water_map_LW',
    'y_sensor_no2'
]

def process_file(file):
    """Split one multi-time dataset file into one NetCDF file per time step.

    Opens `file`, keeps only the variables named in the module-level `var`
    list, and writes each time slice to `output_dir` as
    `time_<timestamp>.nc`, printing the path of every file written.

    Parameters
    ----------
    file : str
        Path of the input file to split.

    Returns
    -------
    None
        Any exception is caught and reported with `print`, so a failing file
        does not propagate an error to the caller.
    """
    try:
        # The context manager releases the file handle even when a write fails
        # part-way through; without it every processed file stays open for the
        # lifetime of the worker process.
        with xr.open_dataset(file) as ds:
            subset = ds[var]

            for t in subset.time.values:
                ds_time = clean_bounds_attrs(subset.sel(time=t))

                # Keep the first 19 characters of the timestamp and strip the
                # separators, giving names like 'time_20250301_090000.nc'.
                timestamp_str = str(t)[:19].replace(":", "").replace("-", "").replace("T", "_")
                filename = f"time_{timestamp_str}.nc"
                filepath = os.path.join(output_dir, filename)

                ds_time.to_netcdf(filepath)
                print(f"Saved: {filepath}")
    except Exception as e:
        print(f"Error processing {file}: {e}")

if __name__ == "__main__":
    # Create the destination once, before any worker tries to write into it.
    os.makedirs(output_dir, exist_ok=True)
    with ProcessPoolExecutor() as executor:
        # Executor.map only re-raises a worker failure when the corresponding
        # result is retrieved, so drain the iterator; otherwise failures that
        # escape process_file (e.g. a crashed worker) would pass unnoticed.
        for _ in executor.map(process_file, nc_files):
            pass


Saved: ../../data/model_data/EPA_sensor_only/time_20250301_090000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20241201_070000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20250101_070000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20250401_080000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20250201_070000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20250501_070000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20241101_080000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20250101_080000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20250201_080000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20241201_080000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20250401_090000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20241101_090000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20250501_080000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20250301_100000.nc
Saved: ../../data/model_data/EPA_s

In [8]:
# Preview the input files from position 15 onward; which files these are
# depends on the order in which nc_files was built.
nc_files[15:]

['../../data/full_month_data/model_2024-11.nc',
 '../../data/full_month_data/model_2024-12.nc',
 '../../data/full_month_data/model_2025-01.nc',
 '../../data/full_month_data/model_2025-02.nc',
 '../../data/full_month_data/model_2025-03.nc',
 '../../data/full_month_data/model_2025-04.nc',
 '../../data/full_month_data/model_2025-05.nc']

In [None]:
def clean_bounds_attrs(ds):
    """Strip 'bounds' attributes whose value is not a string.

    Only the data variables of `ds` are inspected, and their attribute
    dictionaries are modified in place. Returns the same `ds` object.
    """
    # A missing 'bounds' entry falls back to '' (a string), so it is never selected.
    offending = [
        name for name in ds.data_vars
        if not isinstance(ds[name].attrs.get('bounds', ''), str)
    ]
    for name in offending:
        del ds[name].attrs['bounds']
    return ds

# Variables kept in every exported time slice. Built once, before the loop,
# because the selection is the same for every input file.
var = [
    'vertical_column_troposphere_tempo',
    'vertical_column_troposphere_uncertainty_tempo',
    'eff_cloud_fraction_tempo',
    'PRES_weather',
    'TMP_weather',
    'DPT_weather',
    'SPFH_weather',
    'WDIR_weather',
    'WIND_weather',
    'GUST_weather',
    'VIS_weather',
    'TCDC_weather',
    'ACPC01_weather',
    'LWGNT_weather',
    'SWGNT_weather',
    'NO2_geoscf',
    'NO_geoscf',
    'nox_mass_lbs_emissions',
    'impervious_area',
    'population',
    'road_density',
    'water_map_LW',
    'y_sensor_no2'
]

# Make sure the target directory exists before the first write.
os.makedirs(output_dir, exist_ok=True)

# Sequential export: every time step of every input file becomes its own file.
for file in nc_files:
    # NOTE(review): the dataset is left open and bound to `ds` on purpose,
    # because other cells in this notebook read `ds` after this loop. Confirm
    # that dependency before wrapping the open in a `with` block.
    ds = xr.open_dataset(file)
    ds = ds[var]

    for t in ds.time.values:
        ds_time = ds.sel(time=t)

        # Drop non-string 'bounds' attributes before writing the slice.
        ds_time = clean_bounds_attrs(ds_time)

        # Keep the first 19 characters of the timestamp and strip the
        # separators, giving names like 'time_20230802_100000.nc'.
        timestamp_str = str(t)[:19].replace(":", "").replace("-", "").replace("T", "_")
        filename = f"time_{timestamp_str}.nc"
        filepath = os.path.join(output_dir, filename)

        ds_time.to_netcdf(filepath)
        print(f"Saved: {filepath}")

Saved: ../../data/model_data/EPA_sensor_only/time_20230802_100000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20230802_110000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20230802_120000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20230802_130000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20230802_140000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20230802_150000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20230804_070000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20230804_080000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20230804_090000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20230804_100000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20230804_120000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20230804_130000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20230804_140000.nc
Saved: ../../data/model_data/EPA_sensor_only/time_20230804_150000.nc
Saved: ../../data/model_data/EPA_s

In [20]:
# Display the most recently assigned single-time slice; this relies on
# `ds_time` having been left in the kernel by one of the export loops.
ds_time

In [None]:

# Destination directory for this cell's output, created on demand.
output_dir = '../../data/split_by_time'
os.makedirs(output_dir, exist_ok=True)

# Export each time step of `ds` as its own NetCDF file, numbered by position.
for i, t in enumerate(ds.time.values):
    # First 19 characters of the timestamp with ':' and '-' removed and 'T'
    # turned into '_'.
    timestamp_str = (
        str(t)[:19]
        .replace(":", "")
        .replace("-", "")
        .replace("T", "_")
    )
    filename = f"time_{i:03d}_{timestamp_str}.nc"
    filepath = os.path.join(output_dir, filename)

    # Pick the slice for this time step and write it out.
    ds_time = ds.sel(time=t)
    ds_time.to_netcdf(filepath)
