In [1]:
import xarray as xr
import pandas as pd
import numpy as np
import sys
import os
import gc
sys.path.insert(0, '/home/jupyter/')
from hourly_pollution_prediction.process_and_join.bbox import * 

# Latitude/longitude bounds.
# NOTE(review): these four values are not read anywhere in this cell; kept
# in case later cells depend on them -- confirm before removing.
lat_min = 28.6
lat_max = 33.4
lon_min = -98.9
lon_max = -88.3

path = '../../data/full_month_data'
# os.listdir returns entries in arbitrary order; sort so the concatenation
# order (and therefore the run) is reproducible.
nc_files = [os.path.join(path, f) for f in sorted(os.listdir(path)) if f.endswith('.nc')]

# Variables for which normalization statistics are computed.
no2_var = [
    'vertical_column_troposphere_tempo',
    'vertical_column_troposphere_uncertainty_tempo',
    'eff_cloud_fraction_tempo',
    'PRES_weather',
    'TMP_weather',
    'DPT_weather',
    'SPFH_weather',
    'WDIR_weather',
    'WIND_weather',
    'GUST_weather',
    'VIS_weather',
    'TCDC_weather',
    'ACPC01_weather',
    'LWGNT_weather',
    'SWGNT_weather',
    'NO2_geoscf',
    'NO_geoscf',
    'nox_mass_lbs_emissions',
    'impervious_area',
    'population',
    'road_density',
    'water_map_LW'
]

# One {"variable", "mean", "std"} record per variable found in the files.
stats_list = []

for selected_var in no2_var:
    datasets = []
    for file in nc_files:
        # Open each file exactly once and close it deterministically; the
        # previous version opened every file twice and never closed it,
        # leaking file handles across the whole variable loop.
        with xr.open_dataset(file) as ds:
            if selected_var in ds.variables:
                # .load() pulls the values into memory so the array stays
                # usable after the file is closed.
                datasets.append(ds[selected_var].load())

    if not datasets:
        continue  # skip if variable not found in any file

    # Concatenate along "time" (adjust if dimension differs)
    combined = xr.concat(datasets, dim="time")

    # Compute statistics (scalar values) over all dimensions
    mean_val = float(combined.mean().values)
    std_val = float(combined.std().values)

    stats_list.append({
        "variable": selected_var,
        "mean": mean_val,
        "std": std_val
    })

    # Release the per-variable arrays before loading the next variable.
    del combined
    del datasets
    gc.collect()

# Convert to DataFrame
df_stats = pd.DataFrame(stats_list)

# Save to CSV
df_stats.to_csv("normalization_stats_no2.csv", index=False)


In [None]:
# xarray import restored: it was commented out, so this cell only worked
# when an earlier cell had already imported `xr` into the kernel.
import xarray as xr
import pandas as pd
import numpy as np
import sys
import os
import gc
sys.path.insert(0, '/home/jupyter/')
from hourly_pollution_prediction.process_and_join.bbox import *

# Latitude/longitude bounds.
# NOTE(review): these four values are not read anywhere in this cell; kept
# in case later cells depend on them -- confirm before removing.
lat_min = 28.6
lat_max = 33.4
lon_min = -98.9
lon_max = -88.3

path = '../../data/full_month_data_o3'
# os.listdir returns entries in arbitrary order; sort so the concatenation
# order (and therefore the run) is reproducible.
nc_files = [os.path.join(path, f) for f in sorted(os.listdir(path)) if f.endswith('.nc')]

# Variables for which normalization statistics are computed.
o3_var = ['column_amount_o3_tempo',
 'fc_tempo',
 'PRES_weather',
 'TMP_weather',
 'DPT_weather',
 'SPFH_weather',
 'WDIR_weather',
 'WIND_weather',
 'GUST_weather',
 'VIS_weather',
 'TCDC_weather',
 'ACPC01_weather',
 'LWGNT_weather',
 'SWGNT_weather',
 'O3_geoscf',
 'nox_mass_lbs_emissions',
 'so2_mass_lbs_emissions',
 'impervious_area',
 'population',
 'road_density',
 'water_map_LW']

# One {"variable", "mean", "std"} record per variable found in the files.
stats_list = []

for selected_var in o3_var:
    datasets = []
    for file in nc_files:
        # Open each file exactly once and close it deterministically; the
        # previous version opened every file twice and never closed it,
        # leaking file handles across the whole variable loop.
        with xr.open_dataset(file) as ds:
            if selected_var in ds.variables:
                # .load() pulls the values into memory so the array stays
                # usable after the file is closed.
                datasets.append(ds[selected_var].load())

    if not datasets:
        continue  # skip if variable not found in any file

    # Concatenate along "time" (adjust if dimension differs)
    combined = xr.concat(datasets, dim="time")

    # Compute statistics (scalar values) over all dimensions
    mean_val = float(combined.mean().values)
    std_val = float(combined.std().values)

    stats_list.append({
        "variable": selected_var,
        "mean": mean_val,
        "std": std_val
    })

    # Release the per-variable arrays before loading the next variable.
    del combined
    del datasets
    gc.collect()

# Convert to DataFrame
df_stats = pd.DataFrame(stats_list)

# Save to CSV
df_stats.to_csv("normalization_stats_o3.csv", index=False)
