In [1]:
import glob
import os
import numpy as np
import xarray as xr
import netCDF4 as nc
import pandas as pd

In [2]:
NAME_TO_VAR = {
    "geopotential": "zg",
    "u_component_of_wind": "u",
    "v_component_of_wind": "v",
    "temperature": "ta",
    "specific_humidity": "hus",
    "air_temperature": "tas",
}

variables = [
    "air_temperature",
    "geopotential",
    "specific_humidity",
    "temperature",
    "u_component_of_wind",
    "v_component_of_wind"
]
res = "5.625deg"
path = f"/data0/datasets/weatherbench/data/esgf/cmip6/{res}"

HOURS_PER_YEAR = 1456
DEFAULT_PRESSURE_LEVELS = [50, 250, 500, 600, 700, 850, 925]
num_shards_per_year = 4
assert HOURS_PER_YEAR % num_shards_per_year == 0
num_hrs_per_shard = HOURS_PER_YEAR // num_shards_per_year

In [3]:
save_dir = f"/data0/datasets/weatherbench/data/esgf/cmip6/new_cmip/{res}_npz"
years_it = range(1975, 2014, 5)
for year_it in years_it:
    print(year_it)
    year_string = f"{year_it}01010600-{year_it + 5}01010000"
    np_vars = [{} for _ in range(5)]
    normalize_mean = [{} for _ in range(5)]
    normalize_std = [{} for _ in range(5)]
    climatology = [{} for _ in range(5)]
    for var in variables:
        ps = glob.glob(os.path.join(path, var, f"*{year_string}_{res}.nc"))
        ds = xr.load_dataset(ps[0])
        code = NAME_TO_VAR[var]
        years = pd.DatetimeIndex(ds["time"].values).year
        indices_by_year = [[] for _ in range(5)]
        for index, year in enumerate(years):
            if year >= year_it and year < year_it + 5:
                indices_by_year[year-year_it].append(index)
        for index in range(5):
            indices_by_year[index] = indices_by_year[index][-HOURS_PER_YEAR:]

        for year in range(year_it, year_it + 5):
            if len(ds[code].shape) == 3:
                np_vars[year-year_it][var] = np.expand_dims((ds[code][indices_by_year[year-year_it]]).to_numpy(), axis=1)
            else:
                assert len(ds[code].shape) == 4
                for level in DEFAULT_PRESSURE_LEVELS:
                    lev = float(level * 100)
                    ds_level = ds[code][indices_by_year[year-year_it]].sel(plev=[lev])
                    np_vars[year-year_it][f"{var}_{level}"] = ds_level.to_numpy()
            
    for index in range(5):
        for key in np_vars[index].keys():
            normalize_mean[index][key] = np_vars[index][key].mean(axis=(0, 2, 3))
            normalize_std[index][key] = np_vars[index][key].std(axis=(0, 2, 3))
            climatology[index][key] = np_vars[index][key].mean(axis=0)
    
    for year in range(year_it, year_it + 5):
        for shard_id in range(num_shards_per_year):
            start_id = shard_id * num_hrs_per_shard
            end_id = start_id + num_hrs_per_shard
            sharded_data = {k: np_vars[year - year_it][k][start_id:end_id] for k in np_vars[year - year_it].keys()}
            np.savez(
                os.path.join(save_dir, f"{year}_{shard_id}.npz"),
                **sharded_data,
            )
        normalized_mean_data = {k: normalize_mean[year - year_it][k] for k in normalize_mean[year - year_it].keys()}
        np.savez(
            os.path.join(save_dir, f"{year}_mean.npz"),
            **normalized_mean_data,
        )
        normalized_std_data = {k: normalize_std[year - year_it][k] for k in normalize_std[year - year_it].keys()}
        np.savez(
            os.path.join(save_dir, f"{year}_std.npz"),
            **normalized_std_data,
        )
        climatology_data = {k: climatology[year - year_it][k] for k in climatology[year - year_it].keys()}
        np.savez(
            os.path.join(save_dir, f"{year}_climatology.npz"),
            **climatology_data,
        )

1975
1980
1985
1990
1995
2000
2005
2010


In [4]:
def func_mean(ps):
    d = dict()
    for file in ps:
        data = np.load(file)
        if d == dict():
            d = {k: data[k] for k in data.keys()}
        else:
            d = {k: np.concatenate((d[k], data[k]), axis=0) for k in data.keys()}
    print({k: d[k].shape for k in d.keys()})
    d = {k: np.expand_dims(np.mean(d[k], axis=0), axis=0) for k in d.keys()}
    print({k: d[k].shape for k in d.keys()})
    print("end")
    return d

In [5]:
res = "5.625deg"
save_dir_path = f"/data0/datasets/weatherbench/data/esgf/cmip6/new_cmip/{res}_npz"

ps = glob.glob(os.path.join(save_dir_path, f"train/*_mean.npz"))
train_mean = func_mean(ps)

ps = glob.glob(os.path.join(save_dir_path, f"val/*_mean.npz"))
val_mean = func_mean(ps)

ps = glob.glob(os.path.join(save_dir_path, f"test/*_mean.npz"))
test_mean = func_mean(ps)

ps = glob.glob(os.path.join(save_dir_path, f"train/*_climatology.npz"))
train_clim = func_mean(ps)

ps = glob.glob(os.path.join(save_dir_path, f"val/*_climatology.npz"))
val_clim = func_mean(ps)

ps = glob.glob(os.path.join(save_dir_path, f"test/*_climatology.npz"))
test_clim = func_mean(ps)

{'air_temperature': (32,), 'geopotential_50': (32,), 'geopotential_250': (32,), 'geopotential_500': (32,), 'geopotential_600': (32,), 'geopotential_700': (32,), 'geopotential_850': (32,), 'geopotential_925': (32,), 'specific_humidity_50': (32,), 'specific_humidity_250': (32,), 'specific_humidity_500': (32,), 'specific_humidity_600': (32,), 'specific_humidity_700': (32,), 'specific_humidity_850': (32,), 'specific_humidity_925': (32,), 'temperature_50': (32,), 'temperature_250': (32,), 'temperature_500': (32,), 'temperature_600': (32,), 'temperature_700': (32,), 'temperature_850': (32,), 'temperature_925': (32,), 'u_component_of_wind_50': (32,), 'u_component_of_wind_250': (32,), 'u_component_of_wind_500': (32,), 'u_component_of_wind_600': (32,), 'u_component_of_wind_700': (32,), 'u_component_of_wind_850': (32,), 'u_component_of_wind_925': (32,), 'v_component_of_wind_50': (32,), 'v_component_of_wind_250': (32,), 'v_component_of_wind_500': (32,), 'v_component_of_wind_600': (32,), 'v_compon

In [6]:
def f1(ps):
    d = dict()
    for file in ps:
        data = np.load(file)
        if d == dict():
            d = {k: data[k] for k in data.keys()}
        else:
            d = {k: np.concatenate((d[k], data[k]), axis=0) for k in data.keys()}
    print({k: d[k].shape for k in d.keys()})
    print("end")
    return d

In [7]:
ps = glob.glob(os.path.join(save_dir_path, f"train/*_mean.npz"))
train_dmean = f1(ps)

ps = glob.glob(os.path.join(save_dir_path, f"val/*_mean.npz"))
val_dmean = f1(ps)

ps = glob.glob(os.path.join(save_dir_path, f"test/*_mean.npz"))
test_dmean = f1(ps)

ps = glob.glob(os.path.join(save_dir_path, f"train/*_std.npz"))
train_dstd = f1(ps)

ps = glob.glob(os.path.join(save_dir_path, f"val/*_std.npz"))
val_dstd = f1(ps)

ps = glob.glob(os.path.join(save_dir_path, f"test/*_std.npz"))
test_dstd = f1(ps)

{'air_temperature': (32,), 'geopotential_50': (32,), 'geopotential_250': (32,), 'geopotential_500': (32,), 'geopotential_600': (32,), 'geopotential_700': (32,), 'geopotential_850': (32,), 'geopotential_925': (32,), 'specific_humidity_50': (32,), 'specific_humidity_250': (32,), 'specific_humidity_500': (32,), 'specific_humidity_600': (32,), 'specific_humidity_700': (32,), 'specific_humidity_850': (32,), 'specific_humidity_925': (32,), 'temperature_50': (32,), 'temperature_250': (32,), 'temperature_500': (32,), 'temperature_600': (32,), 'temperature_700': (32,), 'temperature_850': (32,), 'temperature_925': (32,), 'u_component_of_wind_50': (32,), 'u_component_of_wind_250': (32,), 'u_component_of_wind_500': (32,), 'u_component_of_wind_600': (32,), 'u_component_of_wind_700': (32,), 'u_component_of_wind_850': (32,), 'u_component_of_wind_925': (32,), 'v_component_of_wind_50': (32,), 'v_component_of_wind_250': (32,), 'v_component_of_wind_500': (32,), 'v_component_of_wind_600': (32,), 'v_compon

In [8]:
def get_std(dmean, dstd):
    ds ={}
    for var in dmean.keys():
        mean, std = dmean[var], dstd[var]
        variance = (
            (std**2).mean(axis=0)
            + (mean**2).mean(axis=0)
            - mean.mean(axis=0) ** 2
        )
        std = np.sqrt(variance)
        ds[var] = np.expand_dims(std, axis=0)
    return ds


In [9]:
train_std = get_std(train_dmean, train_dstd)
val_std = get_std(val_dmean, val_dstd)
test_std = get_std(test_dmean, test_dstd)

In [10]:
np.savez(
    os.path.join(save_dir_path, f"train/climatology.npz"),
    **train_clim,
)

np.savez(
    os.path.join(save_dir_path, f"val/climatology.npz"),
    **val_clim,
)

np.savez(
    os.path.join(save_dir_path, f"test/climatology.npz"),
    **test_clim,
)

In [11]:
np.savez(
    os.path.join(save_dir_path, f"train/normalize_std.npz"),
    **train_std,
)

np.savez(
    os.path.join(save_dir_path, f"val/normalize_std.npz"),
    **val_std,
)

np.savez(
    os.path.join(save_dir_path, f"test/normalize_std.npz"),
    **test_std,
)

np.savez(
    os.path.join(save_dir_path, f"train/normalize_mean.npz"),
    **train_mean,
)

np.savez(
    os.path.join(save_dir_path, f"val/normalize_mean.npz"),
    **val_mean,
)

np.savez(
    os.path.join(save_dir_path, f"test/normalize_mean.npz"),
    **test_mean,
)

In [12]:
xyz = np.load(save_dir_path + "/normalize_std.npz")

print(xyz)
{k: xyz[k].shape for k in xyz.keys()}

FileNotFoundError: [Errno 2] No such file or directory: '/data0/datasets/weatherbench/data/esgf/cmip6/new_cmip/5.625deg_npz/normalize_std.npz'

: 

In [None]:
save_dir_path

In [None]:
res = "5.625deg"
path = f"/data0/datasets/weatherbench/data/esgf/cmip6/{res}"
save_dir = f"/data0/datasets/weatherbench/data/esgf/cmip6/{res}_npz"
year_it = 1850
var = variables[0]
year_string = f"{year_it}01010600-{year_it + 5}01010000"
ps = glob.glob(os.path.join(path, var, f"*{year_string}_{res}.nc"))
ds = xr.load_dataset(ps[0])

In [None]:
lat = np.array(ds["lat"])
lon = np.array(ds["lon"])
np.save(os.path.join(save_dir, "lat.npy"), lat)
np.save(os.path.join(save_dir, "lon.npy"), lon)

In [None]:
lat

In [None]:
xyz = np.load(os.path.join(save_dir, "lat.npy"))
print(xyz)

In [None]:
import numpy as np
res = "5.625deg"
era5_path = f"/data0/datasets/weatherbench/data/weatherbench/era5/{res}_npz/"
cmip6_path = f"/data0/datasets/weatherbench/data/esgf/cmip6/{res}_npz/"

xyz = np.load(era5_path + "/train/climatology.npz")
abc = np.load(cmip6_path + "/train/climatology.npz")

for key in xyz.keys():
    print(xyz[key].shape)

for key in abc.keys():
    print(abc[key].shape)

In [None]:
xyz = np.load(era5_path + "/normalize_mean.npz")
abc = np.load(cmip6_path + "/normalize_mean.npz")
for key in xyz.keys():
    print(xyz[key].shape)

for key in abc.keys():
    print(abc[key].shape)