In [4]:
import xarray as xr
import pandas as pd
import numpy as np
import sys
import os
from joblib import Parallel, delayed


In [2]:
# NO2: collect the input NetCDF files and load the per-variable normalization stats.
path = '../../data/model_data/EPA_sensor_only_no2'
# Sorted so the processing order does not depend on os.listdir's arbitrary ordering.
nc_files = sorted(os.path.join(path, f) for f in os.listdir(path) if f.endswith('.nc'))
# Create output directory if it doesn't exist (exist_ok keeps the cell re-runnable)
output_dir = '../../data/model_data/EPA_sensor_only_no2_norm'
os.makedirs(output_dir, exist_ok=True)

# CSV is read with columns "variable", "mean" and "std" (one row per variable).
df = pd.read_csv('normalization_stats_no2.csv')

# Make lookup dictionaries once (outside the loop): variable name -> statistic
mean_dict = df.set_index("variable")["mean"].to_dict()
std_dict = df.set_index("variable")["std"].to_dict()

def normalize_file(file, mean_dict, std_dict, out_dir=output_dir):
    """Normalize one NetCDF file and write the result into ``out_dir``.

    Each data variable that has an entry in both ``mean_dict`` and
    ``std_dict`` is replaced by ``(value - mean) / std``. If the std is zero
    or NaN the variable is only centered (mean subtracted), which avoids a
    division by zero / NaN propagation. Variables without statistics are
    written unchanged.

    Parameters
    ----------
    file : str
        Path of the input NetCDF file.
    mean_dict : dict
        Mapping of variable name to mean.
    std_dict : dict
        Mapping of variable name to standard deviation.
    out_dir : str
        Existing directory that receives the normalized file; the output
        keeps the input's base name.

    Returns
    -------
    str
        Path of the written file.
    """
    # The context manager closes the underlying file handle even when
    # normalization or writing raises; previously the dataset was never closed.
    with xr.open_dataset(file) as ds:
        # Snapshot the names: variables are reassigned inside the loop.
        for var in list(ds.data_vars):
            if var in mean_dict and var in std_dict:
                mean = mean_dict[var]
                std = std_dict[var]
                if std == 0 or pd.isna(std):
                    ds[var] = ds[var] - mean
                else:
                    ds[var] = (ds[var] - mean) / std

        # basename() instead of split('/') so the name is extracted correctly
        # regardless of the platform's path separator.
        out_path = os.path.join(out_dir, os.path.basename(file))
        # Write while the source is still open: untouched variables may still
        # be lazily backed by the input file.
        ds.to_netcdf(out_path)
    return out_path


# Fan the per-file normalization out over joblib workers.
# n_jobs=-1 asks joblib to use all cores; the result collects the
# output path returned by each normalize_file call.
normalized_files = Parallel(n_jobs=-1)(
    [
        delayed(normalize_file)(nc_path, mean_dict, std_dict, out_dir=output_dir)
        for nc_path in nc_files
    ]
)

In [6]:
# Create the O3 output directory. makedirs(..., exist_ok=True) also creates
# missing parents and does not raise when the cell is re-run.
os.makedirs('../../data/model_data/EPA_sensor_only_o3_norm', exist_ok=True)

In [None]:
# O3: collect the input NetCDF files and load the per-variable normalization stats.
path = '../../data/model_data/EPA_sensor_only_o3'
# Sorted so the processing order does not depend on os.listdir's arbitrary ordering.
nc_files = sorted(os.path.join(path, f) for f in os.listdir(path) if f.endswith('.nc'))
# Create output directory if it doesn't exist (exist_ok keeps the cell re-runnable)
output_dir = '../../data/model_data/EPA_sensor_only_o3_norm'
os.makedirs(output_dir, exist_ok=True)

# CSV is read with columns "variable", "mean" and "std" (one row per variable).
df = pd.read_csv('normalization_stats_o3.csv')

# Make lookup dictionaries once (outside the loop): variable name -> statistic
mean_dict = df.set_index("variable")["mean"].to_dict()
std_dict = df.set_index("variable")["std"].to_dict()

def normalize_file(file, mean_dict, std_dict, out_dir=output_dir):
    """Normalize one NetCDF file and write the result into ``out_dir``.

    Each data variable that has an entry in both ``mean_dict`` and
    ``std_dict`` is replaced by ``(value - mean) / std``. If the std is zero
    or NaN the variable is only centered (mean subtracted), which avoids a
    division by zero / NaN propagation. Variables without statistics are
    written unchanged.

    NOTE(review): this repeats the definition from the NO2 cell; consider
    defining the function once and reusing it for both pollutants.

    Parameters
    ----------
    file : str
        Path of the input NetCDF file.
    mean_dict : dict
        Mapping of variable name to mean.
    std_dict : dict
        Mapping of variable name to standard deviation.
    out_dir : str
        Existing directory that receives the normalized file; the output
        keeps the input's base name.

    Returns
    -------
    str
        Path of the written file.
    """
    # The context manager closes the underlying file handle even when
    # normalization or writing raises; previously the dataset was never closed.
    with xr.open_dataset(file) as ds:
        # Snapshot the names: variables are reassigned inside the loop.
        for var in list(ds.data_vars):
            if var in mean_dict and var in std_dict:
                mean = mean_dict[var]
                std = std_dict[var]
                if std == 0 or pd.isna(std):
                    ds[var] = ds[var] - mean
                else:
                    ds[var] = (ds[var] - mean) / std

        # basename() instead of split('/') so the name is extracted correctly
        # regardless of the platform's path separator.
        out_path = os.path.join(out_dir, os.path.basename(file))
        # Write while the source is still open: untouched variables may still
        # be lazily backed by the input file.
        ds.to_netcdf(out_path)
    return out_path


# Fan the per-file normalization out over joblib workers.
# n_jobs=-1 asks joblib to use all cores; the result collects the
# output path returned by each normalize_file call.
normalized_files = Parallel(n_jobs=-1)(
    [
        delayed(normalize_file)(nc_path, mean_dict, std_dict, out_dir=output_dir)
        for nc_path in nc_files
    ]
)

In [None]:
# NOTE(review): looks like a leftover scratch cell (prints a fixed string and
# touches no data) - consider deleting it before sharing the notebook.
print('hi')