In [1]:
import glob, re, os
import matplotlib as plt
import numpy as np
import xarray as xr
import pandas as pd
import threading
import OpenVisus as ov
from datetime import datetime, timedelta
from tqdm import tqdm
from dask import delayed
from concurrent.futures import ThreadPoolExecutor, as_completed


In [2]:
# !pip install dask_jobqueue

In [3]:
import dask 
from dask_jobqueue import PBSCluster
from dask.distributed import Client
from dask.distributed import performance_report

In [4]:
######## File paths ################
# Base directory under which the Dask worker spill and log directories are
# created (see the PBSCluster arguments that concatenate onto it).
lustre_scratch    = "/glade/work/dpanta"
# Location of the ERA5 surface files. It is not referenced by any other cell
# visible in this notebook -- NOTE(review): confirm it is still needed.
era5_surface_data = "/gdex/data/special_projects/harshah/ARCO/e5.oper.an.sfc"

In [5]:
# Describe the PBS job that backs each Dask worker. The recorded cluster
# summary for this notebook shows every worker with 1 thread and 4.00 GiB,
# matching the cores / memory values requested here.
cluster = PBSCluster(
        job_name = 'dask-osdf-25',
        cores = 1,
        memory = '4GiB',
        processes = 1,
        # Spill files and worker logs both live under lustre_scratch.
        local_directory = lustre_scratch + '/dask/spill',
        log_directory = lustre_scratch + '/dask/logs/',
        # Resource request string for the scheduler; kept in step with
        # cores / memory above (1 CPU, 4 GB).
        resource_spec = 'select=1:ncpus=1:mem=4GB',
        queue = 'casper',
        account='P43713000',
        walltime = '3:00:00',
        # Network interface used for worker communication; the previous
        # choice is left commented out for reference.
        #interface = 'ib0'
        interface = 'ext'
    )

Perhaps you already have a cluster running?
Hosting the HTTP server on port 33277 instead


In [6]:
# Attach a distributed client to the PBS-backed cluster defined above.
client = Client(cluster)
# Scale the cluster and display cluster dashboard URL
n_workers =5
cluster.scale(n_workers)
# No timeout is passed here, so this call is not bounded by one.
# NOTE(review): consider a timeout if queue waits can be long.
client.wait_for_workers(n_workers = n_workers)
# Bare last expression so the notebook renders the cluster summary widget.
cluster


0,1
Dashboard: http://128.117.211.221:33277/status,Workers: 5
Total threads: 5,Total memory: 20.00 GiB

0,1
Comm: tcp://128.117.211.221:40331,Workers: 5
Dashboard: http://128.117.211.221:33277/status,Total threads: 5
Started: Just now,Total memory: 20.00 GiB

0,1
Comm: tcp://128.117.208.177:39373,Total threads: 1
Dashboard: http://128.117.208.177:34499/status,Memory: 4.00 GiB
Nanny: tcp://128.117.208.177:36925,
Local directory: /glade/work/dpanta/dask/spill/dask-scratch-space/worker-_ce94dfo,Local directory: /glade/work/dpanta/dask/spill/dask-scratch-space/worker-_ce94dfo
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 58.04 MiB,Spilled bytes: 0 B
Read bytes: 0.90 MiB,Write bytes: 10.32 MiB

0,1
Comm: tcp://128.117.208.175:38379,Total threads: 1
Dashboard: http://128.117.208.175:38357/status,Memory: 4.00 GiB
Nanny: tcp://128.117.208.175:32899,
Local directory: /glade/work/dpanta/dask/spill/dask-scratch-space/worker-hta0wfay,Local directory: /glade/work/dpanta/dask/spill/dask-scratch-space/worker-hta0wfay
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 58.03 MiB,Spilled bytes: 0 B
Read bytes: 1.81 GiB,Write bytes: 2.01 GiB

0,1
Comm: tcp://128.117.208.178:38267,Total threads: 1
Dashboard: http://128.117.208.178:38531/status,Memory: 4.00 GiB
Nanny: tcp://128.117.208.178:43949,
Local directory: /glade/work/dpanta/dask/spill/dask-scratch-space/worker-qcwfr2_e,Local directory: /glade/work/dpanta/dask/spill/dask-scratch-space/worker-qcwfr2_e
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 33.9%,Last seen: Just now
Memory usage: 133.98 MiB,Spilled bytes: 0 B
Read bytes: 5.31 MiB,Write bytes: 2.25 MiB

0,1
Comm: tcp://128.117.208.174:41315,Total threads: 1
Dashboard: http://128.117.208.174:38963/status,Memory: 4.00 GiB
Nanny: tcp://128.117.208.174:38651,
Local directory: /glade/work/dpanta/dask/spill/dask-scratch-space/worker-dlqrbb70,Local directory: /glade/work/dpanta/dask/spill/dask-scratch-space/worker-dlqrbb70
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 58.03 MiB,Spilled bytes: 0 B
Read bytes: 451.96 kiB,Write bytes: 451.96 kiB

0,1
Comm: tcp://128.117.208.178:35643,Total threads: 1
Dashboard: http://128.117.208.178:38465/status,Memory: 4.00 GiB
Nanny: tcp://128.117.208.178:35279,
Local directory: /glade/work/dpanta/dask/spill/dask-scratch-space/worker-8jqaxu8a,Local directory: /glade/work/dpanta/dask/spill/dask-scratch-space/worker-8jqaxu8a
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 35.1%,Last seen: Just now
Memory usage: 133.95 MiB,Spilled bytes: 0 B
Read bytes: 5.30 MiB,Write bytes: 2.30 MiB


In [None]:
%%time
import os, gc
import numpy as np
import OpenVisus as ov
from dask.distributed import get_client, as_completed

# Dataset opened locally below and also passed to every worker task.
PATH   = "/glade/derecho/scratch/dpanta/idx/era5/2T/1950_bitmasks_1758838374/bm_time_major/era5_sfc_2T_1950.idx"
# Forwarded to every read() call as `time=`.
# NOTE(review): the output recorded for this cell is
# "Exception: begin query failed wrong time" -- confirm that 1950 is a valid
# timestep of this dataset (as opposed to the hours being stored along z only).
YEAR   = 1950

# Gregorian leap-year rule; Z_LEN is the number of hours in YEAR
# (366 * 24 = 8784, 365 * 24 = 8760) and is used as the length of the z axis.
IS_LEAP = (YEAR % 4 == 0) and (YEAR % 100 != 0 or YEAR % 400 == 0)
Z_LEN = 8784 if IS_LEAP else 8760

SLAB = 720           # number of z indices requested by each submitted task
RETRIES = 2          # passed to client.submit as `retries=`

# Forwarded to read() as `quality=`.
# NOTE(review): presumably selects a reduced-resolution level -- confirm.
QUALITY = -4
db = ov.LoadDataset(PATH)
# Read a single z slice only to discover the per-slice shape and the dtype.
probe = db.read(time=YEAR, quality=QUALITY, z=[0,1])
# The two-name unpacking requires `probe` to be 3-D: (z, ny, nx).
ny, nx = probe.shape[1:]
dtype  = probe.dtype
# Destination for the whole z range; filled slab by slab as tasks complete.
out    = np.empty((Z_LEN, ny, nx), dtype=dtype)

# Worker task: open each dataset once per function object, create an Access per read
def _read_slab(z0, z1, path, year):
    """Read one z-range of a dataset and return it together with its bounds.

    Parameters
    ----------
    z0, z1 : int
        Bounds forwarded to ``read`` as ``z=[z0, z1]``.
    path : str
        Dataset location handed to ``OpenVisus.LoadDataset``.
    year : int
        Value forwarded to ``read`` as ``time=``.

    Returns
    -------
    tuple
        ``(z0, z1, arr)`` where ``arr`` is whatever ``read`` returned; the
        bounds are echoed back so the caller can place the slab without
        tracking which future produced it.

    Notes
    -----
    The resolution level comes from the notebook-level ``QUALITY`` constant.
    Opened datasets are cached on the function object, keyed by ``path``.
    NOTE(review): whether that cache survives from one task to the next
    depends on how the scheduler ships this function to workers -- confirm.
    """
    import gc as _gc
    import OpenVisus as _ov

    # Keyed by path: a single shared handle would silently serve data from
    # the first dataset ever opened if a later call passed a different path.
    handles = _read_slab.__dict__.setdefault("_db_by_path", {})
    if path not in handles:
        handles[path] = _ov.LoadDataset(path)
    dataset = handles[path]

    acc = dataset.createAccess()
    try:
        arr = dataset.read(time=year, quality=QUALITY, z=[z0, z1], access=acc)
    finally:
        # Drop the per-read access handle even when the read raises, so a
        # retried task does not start with a stale handle still referenced.
        del acc
        _gc.collect()
    return z0, z1, arr

# Half-open (start, stop) pairs covering 0..Z_LEN in steps of SLAB; the last
# pair is clipped to Z_LEN.
ranges = [(start, min(start + SLAB, Z_LEN)) for start in range(0, Z_LEN, SLAB)]

# One task per slab. pure=False gives every submission its own key.
futs = []
for z0, z1 in ranges:
    futs.append(
        client.submit(_read_slab, z0, z1, PATH, YEAR, retries=RETRIES, pure=False)
    )

# Copy each slab into place as soon as its task finishes, then drop the local
# reference before fetching the next result.
for done in as_completed(futs):
    z0, z1, arr = done.result()
    out[z0:z1] = arr
    del arr
    gc.collect()

print(out.shape, out.dtype)


Exception: begin query failed wrong time

In [None]:

# Area-weighted mean over the last two axes of `out`, one value per index
# along the first axis. Non-finite cells are excluded from both the weighted
# sum and the weight total.
Z, H, W = out.shape

# Row weights are cos(latitude); the linspace assumes the H rows are evenly
# spaced from +90 to -90 degrees.
lat = np.linspace(90.0, -90.0, H, dtype=np.float32)
w_lat = np.cos(np.deg2rad(lat)).astype(np.float32)
w2d = w_lat[:, None]  # retained so this cell keeps defining the same names

valid = np.isfinite(out)

# Reduce along the last axis first, with the mask applied inside the
# reduction. This avoids materialising extra (Z, H, W) temporaries (one of
# which was float64) and accumulates in float64 instead of the input dtype.
w_lat64 = w_lat.astype(np.float64)
row_sum = np.sum(out, axis=2, dtype=np.float64, where=valid)   # (Z, H)
row_cnt = np.count_nonzero(valid, axis=2)                      # (Z, H)

num = row_sum @ w_lat64
den = row_cnt @ w_lat64

# A zero weight total means no finite cell contributed; report NaN for that
# step without emitting a divide warning.
gmst = np.divide(num, den, out=np.full_like(num, np.nan), where=den != 0)
print(gmst)


In [None]:
# Dataset opened lazily by _get_db_and_access().
IDX_PATH = "/glade/work/dpanta/era5/idx/2T/era5_sfc_2T_zip.idx"
# NOTE(review): `quality` is assigned here but none of the read() calls in
# this cell pass it -- confirm whether it is meant to be used.
quality=0
# NOTE(review): x_bound / y_bound are not defined in any cell visible in this
# notebook; they must exist in the kernel before this cell runs. Each is
# indexed as a (start, stop) pair, so W and H are the extents of the region.
W,H = x_bound[1]-x_bound[0], y_bound[1]-y_bound[0]
# Assumes the H rows of the region are evenly spaced from +90 to -90 degrees
# -- TODO confirm this still holds when y_bound selects a sub-region.
lat = np.linspace(90.0, -90.0, H, dtype=np.float32)
# Per-row weights: cos(latitude).
W1D = np.cos(np.deg2rad(lat)).astype(np.float32)
# Total weight of a field with no missing cells: every row weight counted
# once per column.
FULL_DEN = float(W1D.sum() * W)
# Process-wide handles, filled in on the first call to _get_db_and_access().
_DB = None
_ACCESS = None

def _get_db_and_access():
    """Return the cached ``(dataset, access)`` pair, opening it on first use.

    The dataset at ``IDX_PATH`` is loaded once and kept in module globals.
    Creating the access object is best-effort: if it raises, ``None`` is
    stored in its place and callers are expected to read without it.
    """
    global _DB, _ACCESS
    if _DB is not None:
        return _DB, _ACCESS

    _DB = ov.LoadDataset(IDX_PATH)
    try:
        _ACCESS = _DB.createAccess()
    except Exception:
        _ACCESS = None
    return _DB, _ACCESS

def weighted_global_mean_fast(a: np.ndarray) -> float:
    """Row-weighted mean of a 2-D field, ignoring non-finite cells.

    Parameters
    ----------
    a : np.ndarray
        2-D array whose rows line up with the module-level weight vector
        ``W1D``. It is converted to float32 before reduction.

    Returns
    -------
    float
        ``sum_i W1D[i] * sum_j a[i, j]`` divided by the total weight of the
        cells that contributed. NaN when no cell is finite.

    Notes
    -----
    When every cell is finite the precomputed ``FULL_DEN`` is used as the
    denominator. Otherwise NaN *and* +/-inf cells are dropped from both the
    numerator and the denominator, using one ``isfinite`` mask for both so
    the two always agree (a NaN-only sum would let an infinite cell leak
    into the numerator while being excluded from the count).
    """
    a = np.asarray(a, dtype=np.float32)
    finite = np.isfinite(a)
    if finite.all():
        row_sum = a.sum(axis=1, dtype=np.float32)
        return float(np.dot(W1D, row_sum) / FULL_DEN)

    # Zero out every non-finite cell so it contributes nothing to the sum.
    row_sum = np.where(finite, a, np.float32(0.0)).sum(axis=1, dtype=np.float32)
    row_cnt = finite.sum(axis=1, dtype=np.int32)
    den = float(np.dot(W1D, row_cnt.astype(np.float32)))
    return float(np.dot(W1D, row_sum) / den) if den > 0 else np.nan

def gmst_hours_block_threaded(t0: int, n_hours: int, max_threads: int = 4) -> list[float]:
    """Compute the weighted mean for ``n_hours`` consecutive time indices.

    Parameters
    ----------
    t0 : int
        First time index to read.
    n_hours : int
        Number of consecutive indices, i.e. ``t0 .. t0 + n_hours - 1``.
    max_threads : int, default 4
        Size of the thread pool used to overlap the reads.

    Returns
    -------
    list[float]
        One value per time index, in time order.

    Notes
    -----
    Results are collected with an order-preserving ``Executor.map`` rather
    than the notebook-level ``as_completed``. That global name is bound to
    ``concurrent.futures.as_completed`` by the import cell but rebound to
    ``dask.distributed.as_completed`` by a later cell, and the latter does
    not accept thread-pool futures.
    NOTE(review): a single access object is shared by all threads -- confirm
    it is safe for concurrent reads.
    """
    # Local import keeps the function self-contained when it is shipped to a
    # worker process.
    from concurrent.futures import ThreadPoolExecutor

    db, access = _get_db_and_access()

    def read_reduce(t: int) -> float:
        # Fall back to a plain read when no access object could be created.
        if access is not None:
            a = db.read(time=t, x=x_bound, y=y_bound, access=access)
        else:
            a = db.read(time=t, x=x_bound, y=y_bound)
        return weighted_global_mean_fast(a)

    with ThreadPoolExecutor(max_workers=max_threads) as ex:
        # map() yields results in submission order and re-raises the first
        # failing read when its result is consumed.
        return list(ex.map(read_reduce, range(t0, t0 + n_hours)))

# Hour indices of the first hour of YEAR and of the following year; their
# difference is the number of hourly steps to process.
start_idx = hour_index_from_iso_big(f"{YEAR}-01-01 00")
end_idx   = hour_index_from_iso_big(f"{YEAR+1}-01-01 00")
total_hours = end_idx - start_idx

# 1440 hours per task; ceiling division so a shorter final block is included.
BLOCK_HOURS = 48 * 30
n_blocks = (total_hours + BLOCK_HOURS - 1) // BLOCK_HOURS

# One lazy task per block; the last block is clipped to the hours remaining.
tasks = [
    delayed(gmst_hours_block_threaded)(
        start_idx + b * BLOCK_HOURS,
        min(BLOCK_HOURS, total_hours - b * BLOCK_HOURS),
        max_threads=4,
    )
    for b in range(n_blocks)
]


In [None]:
%%time
# Run every block task, then flatten the per-block lists (already in time
# order) into one float32 vector of exactly total_hours values.
blocks = dask.compute(*tasks)
flat_values = (value for block in blocks for value in block)
gmst_vals = np.fromiter(flat_values, dtype=np.float32, count=total_hours)

# Annual figure: mean of the hourly values, skipping NaN entries.
annual_mean = float(np.nanmean(gmst_vals))
print("Annual GMST", YEAR, ":", annual_mean)

## GMST functions

## Load data and compute GMST

In [None]:
# import numpy as np
# import pandas as pd
# import dask
# from dask import delayed
# import OpenVisus as ov

# IDX_PATH = "/glade/work/dpanta/era5/idx/2T/era5_sfc_2T_zip.idx"
# ov.LoadDataset(IDX_PATH)
# H, W = 721, 1440

# lat = np.linspace(90.0, -90.0, H, dtype=np.float64)
# W1D = np.cos(np.deg2rad(lat)).astype(np.float64)
# W2D = np.repeat(W1D[:, None], W, axis=1)  # shape (H, W)

# _DB = None
# def _get_db():
#     global _DB
#     if _DB is None:
#         _DB = ov.LoadDataset(IDX_PATH)
#     return _DB

# def weighted_global_mean(arr2d: np.ndarray) -> float:
#     a = np.asarray(arr2d, dtype=np.float64)
#     bad = (~np.isfinite(a))
#     if bad.any():
#         a = a.copy()
#         a[bad] = np.nan
#     valid = np.isfinite(a)
#     num = np.nansum(a[valid] * W2D[valid])
#     den = np.sum(W2D[valid])
#     return float(num / den) if den > 0 else np.nan

# def gmst_hours_block(t_start_inclusive: int, n_hours: int) -> list[float]:
#     db = _get_db()
#     out = []
#     for k in range(n_hours):
#         a = db.read(time=t_start_inclusive + k)
#         out.append(weighted_global_mean(a))
#     return out

# YEAR = 1945
# start_idx = hour_index_from_iso_big(f"{YEAR}-01-01 00")
# end_idx   = hour_index_from_iso_big(f"{YEAR+1}-01-01 00")
# total_hours = end_idx - start_idx

# time_index_dt = pd.date_range(f"{YEAR}-01-01 00:00:00", periods=total_hours, freq="H", tz="UTC")

# tasks = [delayed(gmst_hours_block)(start_idx + d*24, 24) for d in range(total_hours // 24)]
# blocks = dask.compute(*tasks)
# gmst_vals = np.fromiter((v for block in blocks for v in block), dtype=np.float64, count=total_hours)

# print("Annual GMST 1950:", np.nanmean(gmst_vals))


In [None]:
# client.close()