#### This notebook takes raw GLORYS data and performs the following operations using Dask: 
1. Concatenates the separate GLORYS files over a fixed sub-region (the latitude/longitude/depth window selected in `_preprocess`)
2. Saves the concatenated data to a single NetCDF file
3. Uses the concatenated data to compute a daily climatology


In [1]:
import os, gc; from os.path import exists
# Run the notebook from the project's scripts directory. A later cell does
# `from help_funcs import *`; presumably help_funcs lives in this directory
# and is found via the working directory -- verify.
os.chdir('/vortexfs1/home/anthony.meza/CTWPC/scripts')
# Root of the project tree; every helper below builds a path underneath it.
main_dir = "/vortexfs1/home/anthony.meza/CTWPC"


def plotsdir(x=""):
    """Return the path of `x` inside the project's plots directory.

    With no argument, returns the directory itself (with a trailing slash).
    """
    return main_dir + "/plots/" + x


def GLORYS_dir(x=""):
    """Return the path of `x` inside the project's GLORYS_data directory.

    With no argument, returns the directory itself (with a trailing slash).
    The separator after "GLORYS_data" was previously missing, so a file name
    was glued directly onto the directory name.
    """
    return main_dir + "/GLORYS_data/" + x


def GLORYS_data_dir(x=""):
    """Return the path of `x` inside the project's GLORYS_processed directory.

    With no argument, returns the directory itself (with a trailing slash).
    """
    return main_dir + "/GLORYS_processed/" + x


def ERA5_data_dir(x=""):
    """Return the path of `x` inside the project's ERA5_data directory.

    With no argument, returns the directory itself (with a trailing slash).
    """
    return main_dir + "/ERA5_data/" + x

In [2]:
from help_funcs import * 
import xarray as xr
import pandas as pd
import netCDF4 as nc
from pathlib import Path
from natsort import natsorted
import matplotlib.pyplot as plt
import dask_labextension

In [3]:
# Start a Dask cluster backed by SLURM jobs and connect a client to it
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

# Resources requested for each SLURM job; processes=1 gives a single worker
# process per job that uses all 36 cores as threads
worker_spec = dict(
    cores=36,
    processes=1,
    memory='192GB',
    walltime='02:00:00',
    queue='compute',
    interface='ib0',
)
cluster = SLURMCluster(**worker_spec)

# Show the generated submission script before any job is requested
print(cluster.job_script())

# Ask for 16 jobs with the spec above
cluster.scale(jobs=16)

client = Client(cluster)
client

#!/usr/bin/env bash

#SBATCH -J dask-worker
#SBATCH -p compute
#SBATCH -n 1
#SBATCH --cpus-per-task=36
#SBATCH --mem=179G
#SBATCH -t 02:00:00

/vortexfs1/home/anthony.meza/mambaforge/envs/atm_rivers/bin/python -m distributed.cli.dask_worker tcp://172.16.3.106:43862 --nthreads 36 --memory-limit 178.81GiB --name dummy-name --nanny --death-timeout 60 --interface ib0



In [5]:
import re  # needed only in this cell, to pull the year out of each file path

# NOTE: this rebinds GLORYS_dir. In the first cell it is a path-building
# callable under the project tree; from here on it is the plain string path of
# the shared raw-data directory.
GLORYS_dir = r"/vortexfs1/share/cdx/hseo/Data/MERCATOR/NEP_daily"

# Recursively collect every file with a .nc extension (any letter case),
# in natural sort order.
results = natsorted([str(path) for path in Path(GLORYS_dir).rglob("*.[nN][cC]")])

# Distinct years covered by the files.
# The previous fixed slice result[41:45] fell inside the root directory string
# itself (characters 41-44 of GLORYS_dir are "EP_d"), so it never saw a year.
# Instead, take the first 19xx/20xx token in the part of the path below
# GLORYS_dir. Assumes that token is the data year -- confirm against the
# actual file naming.
_year_token = re.compile(r"(?<!\d)(?:19|20)\d{2}")
_year_matches = [_year_token.search(result[len(GLORYS_dir):]) for result in results]
years = natsorted({match.group() for match in _year_matches if match})

# Two-digit month labels "01" .. "12" (already in order)
months = [f"{month:02d}" for month in range(1, 13)]

In [6]:
def _preprocess(ds):
    return ds[["thetao", "zos"]].sel(latitude = slice(-2, 60), 
                                     longitude = slice(-150, -75), depth = slice(0, 50))

In [7]:
# Open all raw files as one dataset; each file is passed through _preprocess
# before the files are combined.
raw_open_options = dict(
    data_vars="minimal",
    coords="minimal",
    compat="override",
    preprocess=_preprocess,
    parallel=True,
    # -1 keeps a dimension in a single chunk; depth is split two levels at a time
    chunks={"latitude": -1, "longitude": -1, "time": -1, "depth": 2},
    engine="scipy",
)
ds = xr.open_mfdataset(results, **raw_open_options)

In [8]:
# Write the concatenated dataset to disk before any further processing
concatenated_path = GLORYS_data_dir("GLORYS_NE_PAC.nc")
ds.to_netcdf(
    concatenated_path,
    mode="w",
    format="NETCDF4",
    engine="netcdf4",
    compute=True,
)

This may cause some slowdown.
Consider scattering data ahead of time and using futures.


In [9]:
# Reopen the concatenated file GLORYS_NE_PAC.nc.
# Do not change the chunks: this is the fastest way.
reopen_options = dict(
    data_vars="minimal",
    coords="minimal",
    compat="override",
    preprocess=_preprocess,
    parallel=True,
    chunks={"longitude": -1, "latitude": -1, "time": 720, "depth": 1},
    engine="netcdf4",
)
ds = xr.open_mfdataset(GLORYS_data_dir("GLORYS_NE_PAC.nc"), **reopen_options)

In [36]:
# Convert to the 'noleap' calendar so leap days are removed from the operations
ds = ds.convert_calendar('noleap')

# Group the time axis by day of year, then average over time within each group
grouped_by_day = ds.groupby("time.dayofyear")
ds_climatology = grouped_by_day.mean(dim="time")

In [None]:
# Compute the climatology and write it to disk in one step (compute=True)
climatology_path = GLORYS_data_dir("GLORYS_SFC_Climatology.nc")
ds_climatology.to_netcdf(
    climatology_path,
    mode="w",
    format="NETCDF4",
    engine="netcdf4",
    compute=True,
)

This may cause some slowdown.
Consider scattering data ahead of time and using futures.
