In [1]:
from pathlib import Path
import os
import shutil
from datetime import date, timedelta
import xarray as xr
import netCDF4 as nc
import numpy as np

In [2]:
# Start indices of the spatial crop window.
IDX_MIN_LON = 596  # i.e. 595+1 -- NOTE: different from URMA grid!
IDX_MIN_LAT = 645

# Length of the crop window along each axis.
IMG_SIZE_LON = 180
IMG_SIZE_LAT = 180

# Zero-padded hour strings "00" through "23" (a reduced alternative is ["00", "12"]).
TIME_LIST = [f"{hour:02d}" for hour in range(24)]

In [3]:
# Name of the variable subdirectory to process; it is interpolated into every path below.
# NOTE(review): the saved output of the restrict_files call in this notebook shows a
# destination ending in "/d2m", so that cell appears to have last run with a different
# value here -- confirm before relying on the recorded output.
VAR_OF_INTEREST = "pressurf"

# Source directory for this variable (passed to restrict_files as PATH_ORIGINAL).
PATH_HRRR_ORIGINAL = f"/data1/projects/RTMA/alex.schein/Regridded_HRRR/{VAR_OF_INTEREST}"
# Destination directories for the cropped training and test files of this variable.
PATH_TRAIN = f"/data1/projects/RTMA/alex.schein/Regridded_HRRR_train_test/LOOSE_FILES/train_spatiallyrestricted_f01/{VAR_OF_INTEREST}"
PATH_TEST = f"/data1/projects/RTMA/alex.schein/Regridded_HRRR_train_test/LOOSE_FILES/test_spatiallyrestricted_f01/{VAR_OF_INTEREST}"

In [4]:
# START_DATE_TRAIN = date(2021,1,1) #should be jan 1, 2021
# END_DATE_TRAIN = date(2023,12,31) #should be dec 31, 2023
# NUM_DAYS_TRAIN = END_DATE_TRAIN-START_DATE_TRAIN

# START_DATE_TEST = date(2024,1,1) #should be jan 1, 2024
# END_DATE_TEST = date(2024,12,31) #should be dec 31, 2024
# NUM_DAYS_TEST = END_DATE_TEST-START_DATE_TEST

In [5]:
def restrict_files(START_DATE, END_DATE, TIME_LIST, PATH_ORIGINAL, PATH_NEW, IDX_MIN_LON=596, IDX_MIN_LAT=645, IMG_SIZE_LON=180, IMG_SIZE_LAT=180):
    """Crop per-hour GRIB files to a fixed index window and save them as netCDF.

    For every date from START_DATE to END_DATE (inclusive) and every hour
    string in TIME_LIST, the file in ``{PATH_ORIGINAL}/{YYYYMMDD}`` whose name
    contains ``t{hour}z`` (and does not contain ``.idx``) is opened with the
    cfgrib engine, sliced along ``y`` and ``x``, and written to
    ``{PATH_NEW}/hrrr_regridded_spatiallyrestricted_{YYYYMMDD}_t{hour}z.nc``.

    Outputs that already exist are left untouched, so an interrupted run can
    simply be restarted. Missing date directories and missing hour files are
    reported and skipped instead of aborting the whole batch.

    Parameters
    ----------
    START_DATE, END_DATE : datetime.date
        First and last date to process (both inclusive). If END_DATE is
        earlier than START_DATE nothing is processed.
    TIME_LIST : iterable of str
        Hour strings as they appear in the file names, e.g. ``"00"``, ``"23"``.
    PATH_ORIGINAL : str
        Directory containing one ``YYYYMMDD`` subdirectory per date.
    PATH_NEW : str
        Output directory; created if it does not exist.
    IDX_MIN_LON, IDX_MIN_LAT : int
        Start index of the crop window along ``x`` and ``y`` respectively.
    IMG_SIZE_LON, IMG_SIZE_LAT : int
        Number of indices kept along ``x`` and ``y`` respectively.

    Returns
    -------
    None
    """
    # NOTE: default idx mins are for HRRR - need to change if using this for URMA!
    os.makedirs(PATH_NEW, exist_ok=True)

    num_days = (END_DATE - START_DATE).days
    for day_offset in range(num_days + 1):
        date_str = (START_DATE + timedelta(days=day_offset)).strftime("%Y%m%d")
        day_dir = f"{PATH_ORIGINAL}/{date_str}"
        if not os.path.isdir(day_dir):
            print(f"{day_dir} does not exist. Skipping {date_str}")
            continue
        filenames = os.listdir(day_dir)

        for hour in TIME_LIST:
            new_filename = f"hrrr_regridded_spatiallyrestricted_{date_str}_t{hour}z.nc"
            out_path = f"{PATH_NEW}/{new_filename}"
            if os.path.exists(out_path):
                print(f"{new_filename} already exists in {PATH_NEW}. No action taken")
                continue

            # Some days legitimately lack hours (degenerate cases like f01 on
            # 2020/12/31 23z), so a missing source file is skipped, not fatal.
            matches = [x for x in filenames if f"t{hour}z" in x and ".idx" not in x]
            if not matches:
                print(f"No t{hour}z file found in {day_dir}. Skipping {new_filename}")
                continue
            filename = matches[0]

            # Write to a temporary name and rename afterwards: an interrupted
            # write must not leave a truncated file under the final name,
            # because the exists-check above would then skip it on every rerun.
            tmp_path = f"{out_path}.tmp"
            try:
                # The context manager closes the GRIB file after each iteration
                # instead of accumulating open handles across the whole run.
                with xr.open_dataset(f"{day_dir}/{filename}", engine="cfgrib", decode_timedelta=True) as var:
                    var_subset = var.isel(y=slice(IDX_MIN_LAT, IDX_MIN_LAT+IMG_SIZE_LAT),
                                          x=slice(IDX_MIN_LON, IDX_MIN_LON+IMG_SIZE_LON))
                    var_subset.to_netcdf(tmp_path)
                os.replace(tmp_path, out_path)
            finally:
                # Only present if the write or rename failed part-way.
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)
            print(f"{new_filename} written to {PATH_NEW}")

    return

In [7]:
# Crop only the 2020-12-31 t23z file into the training directory.
restrict_files(
    START_DATE=date(2020, 12, 31),
    END_DATE=date(2020, 12, 31),
    TIME_LIST=["23"],
    PATH_ORIGINAL=PATH_HRRR_ORIGINAL,
    PATH_NEW=PATH_TRAIN,
)

hrrr_regridded_spatiallyrestricted_20201231_t23z.nc written to /data1/projects/RTMA/alex.schein/Regridded_HRRR_train_test/LOOSE_FILES/train_spatiallyrestricted_f01/d2m


In [None]:
# restrict_files(START_DATE=date(2021,1,1),
#                END_DATE=date(2023,12,31), 
#                TIME_LIST=TIME_LIST,
#                PATH_ORIGINAL=PATH_HRRR_ORIGINAL,
#                PATH_NEW=PATH_TRAIN)

# restrict_files(START_DATE=date(2024,1,1),
#                END_DATE=date(2024,12,31), 
#                TIME_LIST=TIME_LIST,
#                PATH_ORIGINAL=PATH_HRRR_ORIGINAL,
#                PATH_NEW=PATH_TEST)

In [None]:
# # All 2021-2023 files to PATH_TRAIN directory
# for i in range(NUM_DAYS_TRAIN.days + 1):
#     DATE_STR_TRAIN = date.strftime(START_DATE_TRAIN + timedelta(days=i), "%Y%m%d")
#     filenames = os.listdir(PATH_HRRR_ORIGINAL+"/"+DATE_STR_TRAIN)
#     for time in TIME_LIST:
#         filename = [x for x in filenames if f"t{time}z" in x and ".idx" not in x][0] #unlike URMA, these have forecast times in the filenames, so can mess up if just using digits
#         new_filename = f"hrrr_regridded_spatiallyrestricted_{DATE_STR_TRAIN}_t{time}z.nc"
#         if not os.path.exists(PATH_TRAIN+f"/{new_filename}"): #skip files that already exist in destination folder
#             t2m = xr.open_dataset(PATH_HRRR_ORIGINAL+"/"+DATE_STR_TRAIN+"/"+filename, engine='cfgrib', decode_timedelta=True)
#             t2m_subset = t2m.isel(y=slice(IDX_MIN_LAT, IDX_MIN_LAT+IMG_SIZE_LAT),
#                                   x=slice(IDX_MIN_LON, IDX_MIN_LON+IMG_SIZE_LON))
#             t2m_subset.to_netcdf(PATH_TRAIN+"/"+new_filename)#, encoding={"t2m":{"zlib":True, "complevel":9}}) #(6/16) DON'T USE ENCODING - though it saves space, it automatically enables chunking which is SUPER slow!
#             print(f"{new_filename} written to {PATH_TRAIN}")

In [None]:
# # # All 2024 files to PATH_TEST directory
# for i in range(NUM_DAYS_TEST.days + 1):
#     DATE_STR_TEST = date.strftime(START_DATE_TEST + timedelta(days=i), "%Y%m%d")
#     filenames = os.listdir(PATH_HRRR_ORIGINAL+"/"+DATE_STR_TEST)
#     for time in TIME_LIST:
#         filename = [x for x in filenames if f"t{time}z" in x and ".idx" not in x][0] #unlike URMA, these have forecast times in the filenames, so can mess up if just using digits
#         new_filename = f"hrrr_regridded_spatiallyrestricted_{DATE_STR_TEST}_t{time}z.nc"
#         if not os.path.exists(PATH_TEST+f"/{new_filename}"): #skip files that already exist in destination folder
#             t2m = xr.open_dataset(PATH_HRRR_ORIGINAL+"/"+DATE_STR_TEST+"/"+filename, engine='cfgrib', decode_timedelta=True)
#             t2m_subset = t2m.isel(y=slice(IDX_MIN_LAT, IDX_MIN_LAT+IMG_SIZE_LAT),
#                                   x=slice(IDX_MIN_LON, IDX_MIN_LON+IMG_SIZE_LON))
#             t2m_subset.to_netcdf(PATH_TEST+"/"+new_filename)#, encoding={"t2m":{"zlib":True, "complevel":9}})
#             print(f"{new_filename} written to {PATH_TEST}")

In [None]:
### Do the 2020/12/31 23z file on its own here

In [None]:
# t2m = xr.open_dataset(PATH_HRRR_ORIGINAL+"/"+"20201231/hrrr_regridded_20201231_t23z_f01.grib2", engine='cfgrib', decode_timedelta=True)
# t2m_subset = t2m.isel(y=slice(IDX_MIN_LAT, IDX_MIN_LAT+IMG_SIZE_LAT),
#                       x=slice(IDX_MIN_LON, IDX_MIN_LON+IMG_SIZE_LON))
# t2m_subset.to_netcdf(PATH_TRAIN+"/"+"hrrr_regridded_spatiallyrestricted_20201231_t23z.nc")#, encoding={"t2m":{"zlib":True, "complevel":9}})

In [None]:
## Lat/lon index testing
# t2m = xr.open_dataset("/scratch/RTMA/alex.schein/test_hrrr_newgrid.grib2", engine='cfgrib', decode_timedelta=True)
# t2m_subset = t2m.isel(y=slice(IDX_MIN_LAT, IDX_MIN_LAT+IMG_SIZE_LAT),
#                       x=slice(IDX_MIN_LON, IDX_MIN_LON+IMG_SIZE_LON))
# t2m_subset.to_netcdf("/scratch/RTMA/alex.schein/test_hrrr_newgrid_regridded.grib2", encoding={"t2m":{"zlib":True, "complevel":9}})