In [1]:
from pathlib import Path
import os
import shutil
from datetime import date, timedelta
import xarray as xr
import netCDF4 as nc
import numpy as np
import glob
import dask

import time

In [2]:
# Shared prefixes for the train/test input directories, so each root is spelled out only once.
_SCRATCH_ROOT = "/scratch/RTMA/alex.schein"
_URMA_ROOT = f"{_SCRATCH_ROOT}/URMA_train_test"
_HRRR_ROOT = f"{_SCRATCH_ROOT}/Regridded_HRRR_train_test"

# URMA train/test input directories.
PATH_URMA_TRAIN = f"{_URMA_ROOT}/train"
PATH_URMA_TEST = f"{_URMA_ROOT}/test"

# Regridded HRRR train/test input directories.
# NOTE(review): the "_f01" suffix presumably identifies the forecast time - confirm.
PATH_HRRR_TRAIN = f"{_HRRR_ROOT}/train_spatiallyrestricted_f01"
PATH_HRRR_TEST = f"{_HRRR_ROOT}/test_spatiallyrestricted_f01"

In [3]:
# Sorted lists of every .nc file directly inside each train/test directory.
# Sorting fixes the order in which files are later checked and concatenated.
files_train_urma = sorted(glob.iglob(f"{PATH_URMA_TRAIN}/*.nc"))
files_test_urma = sorted(glob.iglob(f"{PATH_URMA_TEST}/*.nc"))

files_train_hrrr = sorted(glob.iglob(f"{PATH_HRRR_TRAIN}/*.nc"))
files_test_hrrr = sorted(glob.iglob(f"{PATH_HRRR_TEST}/*.nc"))

In [4]:
def check_files_for_problems(input_files):
    """Check that every file has the same coordinates as the first file.

    The first file is assumed to be good; each file's list of coordinate
    names is compared against it. Far from comprehensive, but usually catches
    a regridded file that failed for whatever reason.

    Parameters
    ----------
    input_files : sequence of str
        Paths of the NetCDF files to check.

    Returns
    -------
    int
        0 if every file's coordinate names match those of the first file,
        1 if at least one file differs or if ``input_files`` is empty.
    """
    n_files = len(input_files)
    if n_files == 0:
        # Nothing to compare against (e.g. a glob that matched no files);
        # report it as a problem instead of failing with an IndexError.
        print("No input files to check")
        return 1

    # Print progress about 50 times. Clamp the step to 1 so that lists shorter
    # than 50 files do not trigger a modulo-by-zero.
    progress_step = max(1, n_files // 50)

    # Only the coordinate names are needed, so read them and close the
    # reference file immediately rather than keeping it open for the whole run.
    with xr.open_dataset(input_files[0], decode_timedelta=True) as ds0:
        reference_coords = list(ds0.coords)

    flag = 0
    for i, filepath in enumerate(input_files):
        # The context manager closes each file as soon as its coordinates are
        # read, so handles do not accumulate over thousands of files.
        with xr.open_dataset(filepath, decode_timedelta=True) as ds1:
            file_coords = list(ds1.coords)
        if i % progress_step == 0:
            print(f"{i}/{n_files} files checked")
        if file_coords != reference_coords:
            print(f"Failure in {filepath}")
            flag = 1
    return flag

In [5]:
def concat_netcdfs(input_files, output_filepath):
    """Concatenate loose NetCDF files into one master NetCDF file.

    The files are concatenated along "valid_time_dim" in the order given. A
    0-based "sample_idx" coordinate is then attached and swapped in as the
    dimension in place of "valid_time_dim", and the result is written to
    ``output_filepath``.

    Nothing is written if ``input_files`` is empty, if ``output_filepath``
    already exists (prevents accidental overwrites), or if
    ``check_files_for_problems`` flags any input file.

    Parameters
    ----------
    input_files : sequence of str
        Sorted glob list of filepaths to the loose NetCDF files.
    output_filepath : str
        Filepath, including name and .nc extension, of the output master NetCDF.

    Returns
    -------
    None
    """
    if not input_files:
        print("No input files given. Nothing to concatenate")
        return

    # Test for an existing output first: validating the inputs is the slow
    # part and is pointless when nothing is going to be written anyway.
    if os.path.exists(output_filepath):
        print(f"{output_filepath} already exists. Not overwriting")
        return

    #Check for problems in the files before wasting time
    flag = check_files_for_problems(input_files) #Uncomment if running this for unchecked files
    # flag = 0 #Comment if running this for unchecked files
    if flag:
        print("Problems found. Not concatenating until all problems fixed")
        return

    n_files = len(input_files)
    # Print progress about 50 times; clamp to 1 so short lists do not trigger
    # a modulo-by-zero.
    progress_step = max(1, n_files // 50)

    # Read every file into memory and close it straight away, then concatenate
    # once. Concatenating pairwise inside the loop recopies the growing result
    # on every iteration, which is quadratic in the number of files.
    datasets = []
    for i, filepath in enumerate(input_files):
        with xr.open_dataset(filepath, decode_timedelta=True) as ds:
            datasets.append(ds.load())
        if i % progress_step == 0:
            print(f"{i}/{n_files} files loaded")

    ds_concat = xr.concat(datasets, dim="valid_time_dim")

    # One index per input file. NOTE(review): this presumes each file
    # contributes exactly one entry along "valid_time_dim"; otherwise the
    # lengths will not match - confirm against the input data.
    ds_concat = ds_concat.assign_coords(sample_idx=("valid_time_dim", list(range(n_files))))
    ds_concat = ds_concat.swap_dims(dims_dict={"valid_time_dim": "sample_idx"})

    ds_concat.to_netcdf(output_filepath)
    print(f"{output_filepath} written to disk")

In [6]:
# only_00z_12z_files_hrrr_train = [file for file in files_train_hrrr if "00z" in file or "12z" in file]
# only_00z_12z_files_hrrr_test = [file for file in files_test_hrrr if "00z" in file or "12z" in file]

# only_00z_12z_files_urma_train = [file for file in files_train_urma if "00z" in file or "12z" in file]
# only_00z_12z_files_urma_test = [file for file in files_test_urma if "00z" in file or "12z" in file]

# #### PRIOR TO 5/23: these filenames only refer to 00z and 12z data packed into one file
# concat_netcdfs(only_00z_12z_files_urma_test, "/scratch/RTMA/alex.schein/URMA_train_test/test_urma_00z_12z.nc")
# concat_netcdfs(only_00z_12z_files_urma_train, "/scratch/RTMA/alex.schein/URMA_train_test/train_urma_00z_12z.nc")

# concat_netcdfs(only_00z_12z_files_hrrr_test, "/scratch/RTMA/alex.schein/Regridded_HRRR_train_test/test_hrrr_00z_12z.nc")
# concat_netcdfs(only_00z_12z_files_hrrr_train, "/scratch/RTMA/alex.schein/Regridded_HRRR_train_test/train_hrrr_00z_12z.nc")

In [7]:
#### POST 5/23: these files refer to ALL times
## NOTE (5/23): NEED TO REGRID HRRR FILES! Then spatially restrict, THEN concatenate...

# path_test_urma = "/scratch/RTMA/alex.schein/URMA_train_test/test_urma_alltimes.nc"
# path_train_urma = "/scratch/RTMA/alex.schein/URMA_train_test/train_urma_alltimes.nc"

# Output locations of the concatenated HRRR master files.
path_train_hrrr = "/scratch/RTMA/alex.schein/Regridded_HRRR_train_test/train_hrrr_alltimes_f01.nc"
path_test_hrrr = "/scratch/RTMA/alex.schein/Regridded_HRRR_train_test/test_hrrr_alltimes_f01.nc"


## (6/10) Only need to make new URMA files if big changes are made (e.g. new variables, new domain). Forecast time change only --> only new HRRR files need to be made
# if not os.path.exists(path_train_urma):
#     concat_netcdfs(files_train_urma, path_train_urma)
# else:
#     print(f"{path_train_urma} already exists")

# if not os.path.exists(path_test_urma):
#     concat_netcdfs(files_test_urma, path_test_urma)
# else:
#     print(f"{path_test_urma} already exists")


# Build each HRRR master file that is not on disk yet (train first, then test);
# an existing master file is reported and left alone.
for loose_files, master_path in (
    (files_train_hrrr, path_train_hrrr),
    (files_test_hrrr, path_test_hrrr),
):
    if os.path.exists(master_path):
        print(f"{master_path} already exists")
    else:
        concat_netcdfs(loose_files, master_path)


0/26280 files checked
525/26280 files checked
1050/26280 files checked
1575/26280 files checked
2100/26280 files checked
2625/26280 files checked
3150/26280 files checked
3675/26280 files checked
4200/26280 files checked
4725/26280 files checked
5250/26280 files checked
5775/26280 files checked
6300/26280 files checked
6825/26280 files checked
7350/26280 files checked
7875/26280 files checked
8400/26280 files checked
8925/26280 files checked
9450/26280 files checked
9975/26280 files checked
10500/26280 files checked
11025/26280 files checked
11550/26280 files checked
12075/26280 files checked
12600/26280 files checked
13125/26280 files checked
13650/26280 files checked
14175/26280 files checked
14700/26280 files checked
15225/26280 files checked
15750/26280 files checked
16275/26280 files checked
16800/26280 files checked
17325/26280 files checked
17850/26280 files checked
18375/26280 files checked
18900/26280 files checked
19425/26280 files checked
19950/26280 files checked
20475/2628