In [2]:
import os
import glob
import xarray as xr
import numpy as np

import dask
from dask_jobqueue import PBSCluster
from dask.distributed import Client

In [2]:
# Setup PBSCluster
# Memory per worker job. The same value feeds both the dask worker memory limit
# (`memory`) and the PBS request (`resource_spec`): previously dask was told 40GB
# while the PBS job only requested 25GB, so the worker limit exceeded what the
# job was actually granted. 25GB is what the jobs were requesting; raise this
# single value if workers need more.
worker_memory = '25GB'
cluster = PBSCluster(
    cores=1,                                                   # The number of cores you want
    memory=worker_memory,                                      # Amount of memory (dask worker limit)
    processes=1,                                               # How many processes
    queue='casper',                                            # The type of queue to utilize
    local_directory='/glade/work/afoster',                     # Use your local directory
    resource_spec=f'select=1:ncpus=1:mem={worker_memory}',     # Specify resources (PBS request)
    log_directory='/glade/derecho/scratch/afoster/dask_logs',  # log directory
    account='P08010000',                                       # Input your project ID here
    walltime='10:00:00',                                       # Amount of wall time
    interface='ext')

Perhaps you already have a cluster running?
Hosting the HTTP server on port 34527 instead


In [3]:
# Dashboard link template routed through the JupyterHub proxy; the {USER} and
# {port} placeholders are filled in when the link is rendered.
dashboard_url = 'https://jupyterhub.hpc.ucar.edu/stable/user/{USER}/proxy/{port}/status'
dask.config.set({'distributed.dashboard.link': dashboard_url})

# Ask the scheduler for 100 worker jobs.
cluster.scale(100)

# Attach a client to the cluster; the bare name on the last line displays it.
client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/afoster/proxy/34527/status,

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/afoster/proxy/34527/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://128.117.208.210:36689,Workers: 0
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/afoster/proxy/34527/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [3]:
# Directory that is globbed for the input NetCDF files.
input_dir = "/glade/campaign/cgd/tss/projects/TRENDY2024/TRENDY_outputs/S3/month_1"
# Directory the rewritten per-variable files are written to.
output_dir = "/glade/campaign/cgd/tss/projects/TRENDY2024/TRENDY_outputs/S3/fixed_files"
# File-name prefix; the text right after it (up to the next '.') is parsed as the
# variable name, and it also prefixes every output file name.
tag = "TRENDY2024_f09_clm60_S3.clm2.h0."

In [4]:
# All input files for this case/stream. The pattern is built from `tag` so the
# prefix is defined in exactly one place.
all_files = sorted(glob.glob(os.path.join(input_dir, f"{tag}*.nc")))
if not all_files:
    # Fail fast: otherwise `variables` is empty and later cells silently do nothing.
    raise FileNotFoundError(f"No files matching '{tag}*.nc' found in {input_dir}")

# Variable name = text between `tag` and the next '.' in the file name.
# np.unique returns the sorted, de-duplicated names.
variables = np.unique([os.path.basename(f).split(tag)[-1].split('.')[0] for f in all_files])

In [8]:
# Dump the variable names to a text file, one name per line.
with open("my_file.txt", "w") as listing:
    for name in variables:
        listing.write(f"{name}\n")

In [None]:
# For every variable: open all of its files as one dataset concatenated along
# time, replace the time coordinate with a regular month-start 'noleap' axis
# beginning in the first time stamp's year, and write a single output file.
os.makedirs(output_dir, exist_ok=True)  # to_netcdf does not create the directory

for variable in variables:
    # A bare prefix glob ("<tag>FOO*.nc") also matches files of any other
    # variable whose name starts with the same text (e.g. "FOO_BAR"), which
    # would then be concatenated into FOO's dataset. Keep only files whose
    # parsed variable name (text between `tag` and the next '.') matches exactly.
    candidates = glob.glob(os.path.join(input_dir, f"{tag}{variable}*.nc"))
    files = sorted(
        path for path in candidates
        if os.path.basename(path).split(tag)[-1].split('.')[0] == variable
    )
    print(f"{variable}: {len(files)} file(s)")

    # Context manager closes the underlying files once this variable is written,
    # instead of leaving every dataset open for the rest of the loop.
    with xr.open_mfdataset(files, combine='nested', concat_dim='time', parallel=True,
                           autoclose=True, chunks={'time': 10}) as ds:
        year_zero = int(ds['time.year'][0].values)
        # Zero-pad to four digits so years below 1000 still parse as a date string.
        ds['time'] = xr.cftime_range(f"{year_zero:04d}", periods=len(ds.time),
                                     freq='MS', calendar='noleap')
        out_file = os.path.join(output_dir, f"{tag}{variable}.nc")
        ds.to_netcdf(out_file)
    print(f"{variable}: wrote {out_file}")

ACTUAL_IMMOB
ACTUAL_IMMOB
