In [None]:
#Loading in Packages and Data

#Importing Packages
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.ticker as ticker
import matplotlib.cm as cm
from matplotlib.colors import Normalize
from matplotlib.ticker import MaxNLocator
from matplotlib.ticker import ScalarFormatter
import matplotlib.gridspec as gridspec
import xarray as xr
import os; import time
import pickle
import h5py
###############################################################
def coefs(coefficients,degree):
    """Format polynomial coefficients as a readable string.

    The coefficient for power ``n`` is read from position
    ``len(coefficients)-(n+1)``, i.e. the sequence is ordered from the
    highest power to the constant term. Each coefficient is printed in
    scientific notation with one decimal and the terms are joined by " + ".

    Parameters
    ----------
    coefficients : indexable sequence of numbers
    degree : int, highest power to print (terms run from degree down to 0)

    Returns
    -------
    str, e.g. "(1.0e+00)x^1 + (2.0e+00)x^0"
    """
    n_coef=len(coefficients)
    terms=[
        f"({coefficients[n_coef-(power+1)]:.1e})x^{power}"
        for power in range(degree, -1, -1)
    ]
    return " + ".join(terms)
###############################################################

#Importing Model Data
check=False
dir='/mnt/lustre/koa/koastore/torri_group/air_directory/DCI-Project/'

# Active configuration: dx = 1 km; Np = 1M; Nt = 5 min
# (these three tags are reused below to build input/output file names)
res='1km'
t_res='5min'
Np_str='1e6'

# Gridded model output and parcel output, both read relative to the project directory
data=xr.open_dataset(dir+'../cm1r20.3/run/cm1out_1km_5min.nc') #***
parcel=xr.open_dataset(dir+'../cm1r20.3/run/cm1out_pdata_1km_5min_1e6.nc') #***

# # dx = 1km; Np = 50M
# #Importing Model Data
# check=False
# dir2='/home/air673/koa_scratch/'
# data=xr.open_dataset(dir2+'cm1out_1km_1min.nc') #***
# parcel=xr.open_dataset(dir2+'cm1out_pdata_1km_1min_50M.nc') #***
# res='1km'; t_res='1min'; Np_str='50e6'

# # dx = 1km; Np = 100M
# #Importing Model Data
# check=False
# dir2='/home/air673/koa_scratch/'
# data=xr.open_dataset(dir2+'cm1out_1km_1min.nc') #***
# parcel=xr.open_dataset(dir2+'cm1out_pdata_1km_1min_100M.nc') #***
# res='1km'; t_res='1min'; Np_str='100e6'

# # dx = 250 m
# #Importing Model Data
# check=False
# dir2='/home/air673/koa_scratch/'
# data=xr.open_dataset(dir2+'cm1out_250m.nc') #***
# parcel=xr.open_dataset(dir2+'cm1out_pdata_250m.nc') #***
# res='250m'
# Np_str='150e6'

In [None]:
#INITIALIZE DATA FUNCTION
###############################################################
def initiate_array(out_file, vars, t_chunk_size, p_chunk_size, t_size=None, p_size=None):
    """Create an HDF5 file containing one empty, chunked 2-D dataset per variable.

    The file is opened in 'w' mode, so an existing file at ``out_file`` is
    truncated before the datasets are created.

    Parameters
    ----------
    out_file : path of the HDF5 file to create.
    vars : iterable of dataset names. 'Z', 'Y', 'X' are stored as uint16,
        'A_g', 'A_c' as bool, every other name as float32. A name that
        appears more than once is only created once.
    t_chunk_size, p_chunk_size : requested chunk lengths along the first and
        second axis. They are clamped to the dataset shape because h5py
        raises ValueError when a chunk is larger than a fixed-size dataset.
    t_size : length of the first axis; defaults to len(data['time']).
    p_size : length of the second axis; defaults to len(parcel['xh']).
    """
    if t_size is None:
        t_size = len(data['time'])  # Number of timesteps
    if p_size is None:
        p_size = len(parcel['xh'])  # Number of parcels (length of the parcel axis)

    # Never request a chunk larger than the dataset itself
    chunks = (min(t_chunk_size, t_size), min(p_chunk_size, p_size))

    with h5py.File(out_file, 'w') as f:
        for var_name in vars:
            if var_name not in f:
                # Set dtype conditionally
                if var_name in ['Z', 'Y', 'X']:
                    dtype = np.uint16
                elif var_name in ['A_g', 'A_c']:
                    dtype = np.bool_
                else:
                    dtype = np.float32  # default for all other variables

                f.create_dataset(
                    var_name,
                    shape=(t_size, p_size),
                    chunks=chunks,
                    dtype=dtype
                )

In [None]:
def check_memory():
    """Print the ten largest global objects, ranked by ``sys.getsizeof``.

    Names starting with an underscore, names that are also keys of
    ``sys.modules`` and a few IPython bookkeeping names are skipped.
    Sizes are reported in MB, followed by the sum over those ten entries
    divided by 1000.
    """
    import sys
    ipython_vars = ["In", "Out", "exit", "quit", "get_ipython", "ipython_vars"]
    print("Top 10 objects with highest memory usage")

    # Collect (name, size in bytes) for every eligible global
    namespace = globals()
    sizes = []
    for name in namespace:
        if name.startswith("_") or name in sys.modules or name in ipython_vars:
            continue
        sizes.append((name, sys.getsizeof(namespace.get(name))))

    # Largest first; keep the top ten and convert bytes to MB
    sizes.sort(key=lambda item: item[1], reverse=True)
    mem = {name: round(n_bytes/1e6,2) for name, n_bytes in sizes[:10]}

    print({key:f"{value} MB" for key,value in mem.items()})
    print(f"\n{round(sum(mem.values()),2)/1000} GB in use overall")

In [None]:
#JOB ARRAY SETUP
# Toggle: swap the two lines below to run everything in a single job instead
# job_array=False
job_array=True

if job_array==True:

    num_jobs=3 #120 #how many total jobs are being run? i.e. array=1-100 ==> num_jobs=100 #***
    total_elements=len(data['time']) #total number of timesteps to divide among the jobs

    if num_jobs >= total_elements:
        raise ValueError("Number of jobs cannot be greater than or equal to total elements.")


    job_range = total_elements // num_jobs  # Base size for each chunk
    remaining = total_elements % num_jobs   # Number of chunks with 1 extra

    # Function to compute the start and end for each job_id
    def get_job_range(job_id, num_jobs):
        """Return (start_job, end_job) for a 1-based job_id as a half-open index range.

        job_range, remaining and total_elements are read from the enclosing
        scope. The first `remaining` jobs receive one extra element, and the
        last job always ends at total_elements.
        """
        job_id-=1  # convert to 0-based
        # Add one extra element to the first 'remaining' chunks
        start_job = job_id * job_range + min(job_id, remaining)
        end_job = start_job + job_range + (1 if job_id < remaining else 0)

        if job_id == num_jobs - 1:
            end_job = total_elements #- 1
        return start_job, end_job
    # def job_testing():
    #     #TESTING
    #     start=[];end=[]
    #     for job_id in range(1,num_jobs+1):
    #         start_job, end_job = get_job_range(job_id, num_jobs)
    #         print(start_job,end_job)
    #         start.append(start_job)
    #         end.append(end_job)
    #     print(np.all(start!=end))
    #     print(len(np.unique(start))==len(start))
    #     print(len(np.unique(end))==len(end))
    # job_testing()

    job_id = int(os.environ.get('SLURM_ARRAY_TASK_ID', 0)) #this is the current SBATCH job id
    if job_id==0: job_id=3  # fallback used when the variable is unset (or 0)
    # An id outside 1..num_jobs would produce a negative or empty timestep range
    # that only fails much later, so reject it here.
    if not 1 <= job_id <= num_jobs:
        raise ValueError(f"job_id = {job_id} is outside the valid range 1..{num_jobs}")
    start_job, end_job = get_job_range(job_id, num_jobs)
    index_adjust=start_job
    print(f'start_job = {start_job}, end_job = {end_job}')

In [None]:
# Restrict both datasets to this job's timesteps, or define the full range
# when the notebook is not running as a job array.
# NOTE: re-running this cell slices the already-sliced datasets again.
if job_array:
    #Indexing Array with JobArray
    job_times=slice(start_job,end_job)
    data=data.isel(time=job_times)
    parcel=parcel.isel(time=job_times)
    #(for 150_000_000 parcels use 500-1000 jobs)
else:
    start_job=0
    end_job=len(data['time'])
    index_adjust=0

In [None]:
###########################################################################################################################################################################

In [None]:
#LOADING VARIABLES
###############################################################

In [None]:
# Reading Back Data Later
##############
# Z, Y, X are later used as index arrays into the gridded variables;
# only rows start_job:end_job are read.
dir2=dir+'Project_Algorithms/Lagrangian_Arrays/'
open_file=dir2+f'lagrangian_binary_array_{res}_{t_res}_{Np_str}.h5'
with h5py.File(open_file, 'r') as f:
    Z, Y, X = [f[name][start_job:end_job] for name in ('Z', 'Y', 'X')]

# #Making Time Matrix
# rows, cols = A.shape[0], A.shape[1]
# T = np.arange(rows).reshape(-1, 1) * np.ones((1, cols), dtype=int)
check_memory()

In [None]:
def call_variable(varname):
    """Return the array for ``varname``.

    'th_e', 'HMC', 'VMF_c' and 'VMF_g' are read from pre-computed HDF5 files
    (rows start_job:end_job only); any other name is taken from the
    ``data`` dataset via ``data[varname].data``.
    """
    root='/mnt/lustre/koa/koastore/torri_group/air_directory/DCI-Project/'

    # Resolve (file path, dataset name) for the variables stored in HDF5 files
    if varname=='th_e':
        file_path=dir + 'Variable_Calculation/' + 'theta_e'+f'_{res}_{t_res}'+'.h5'
        dataset_name='theta_e'
    elif varname=='HMC':
        file_path=root + 'Variable_Calculation/' + '2D_Moisture_Convergence' + f'_{res}_{t_res}' + '.h5'
        dataset_name='conv'
    elif varname=='VMF_c' or varname=='VMF_g':
        # Both mass-flux variables live in the same file under their own name
        file_path=root + 'Project_Algorithms/Entrainment/' + '3D_Eulerian_VMF'+f'_{res}_{t_res}'+'.h5'
        dataset_name='VMF_c' if varname=='VMF_c' else 'VMF_g'
    else:
        return data[varname].data

    with h5py.File(file_path, 'r') as f:
        return f[dataset_name][start_job:end_job]

In [None]:
def make_lagrangian_array(varnames, p_chunk=50_000):
    """Sample each variable at every parcel's (time, Z, Y, X) index.

    For every name in ``varnames`` the array returned by ``call_variable`` is
    indexed with four index arrays: the timestep number and the global
    Z, Y, X arrays. The result has the shape of Z and dtype float32.

    Parcels are processed in blocks of ``p_chunk`` columns with a single
    fancy-indexing call per variable, instead of one Python iteration per
    parcel; the values produced are the same.

    Parameters
    ----------
    varnames : list of variable names accepted by ``call_variable``.
    p_chunk : number of parcels handled per block (bounds the size of the
        temporary index arrays). Progress is printed at the start of each block.

    Returns
    -------
    list of arrays, in the same order as ``varnames``.
    """
    if p_chunk < 1:
        raise ValueError("p_chunk must be a positive integer")

    # Initialize dictionaries
    var_data_dict = {varname: call_variable(varname) for varname in varnames}
    VAR = {varname: np.zeros_like(Z, dtype='float32') for varname in varnames}

    Nt = len(data['time'])
    Np = len(parcel['xh'])
    # Column vector of timestep numbers; broadcasts against (Nt, n) index blocks
    ts = np.arange(Nt)[:, None]

    for p0 in range(0, Np, p_chunk):
        p1 = min(p0 + p_chunk, Np)
        print(f"{p0}/{Np}")

        # Get Indices for this block of parcels
        zs = Z[:, p0:p1]
        ys = Y[:, p0:p1]
        xs = X[:, p0:p1]

        # Loop over all variables and fill the respective VAR array
        for varname, var_data in var_data_dict.items():
            VAR[varname][:, p0:p1] = var_data[ts, zs, ys, xs]

    # Return all the arrays in a list
    return [VAR[varname] for varname in varnames]

In [None]:
#RUNNING
# The order of varnames fixes the order of the arrays unpacked below
varnames=['qv','th','th_e','buoyancy','HMC','VMF_c','VMF_g']#,'qi','qr']
(QV, TH, TH_E, BUOYANCY, HMC, VMF_c, VMF_g) = make_lagrangian_array(varnames) #,QI,QR
check_memory()

In [None]:
# Saving Data
##############
print('Saving Data\n')
dir2=dir+'Project_Algorithms/Lagrangian_Arrays/job_out/'
out_file=dir2+f'VARS_binary_array_{res}_{t_res}_{Np_str}'
# Each job of a job array writes its own file, tagged with its job id
if job_array==True:
    out_file+=f'_{job_id}.h5'
else:
    out_file+='.h5'

# Dataset name -> array to store
arrays={
    'QV':QV,
    'TH':TH,
    'TH_E':TH_E,
    'BUOYANCY':BUOYANCY,
    'HMC':HMC,
    'VMF_c':VMF_c,
    'VMF_g':VMF_g,
    # 'QI':QI,
    # 'QR':QR,
}
vars=list(arrays)

# Parcel-axis chunk size per parcel count. Previously any Np_str other than
# '1e6' or '50e6' skipped initiate_array entirely, so the writes below failed
# (or wrote into a stale file); those cases now use the 500_000 default.
p_chunk_by_Np={'1e6':100_000,'50e6':500_000}
p_chunk_size=min(p_chunk_by_Np.get(Np_str,500_000),len(parcel['xh']))  # chunk may not exceed the axis length
initiate_array(out_file,vars,t_chunk_size=1,p_chunk_size=p_chunk_size)

with h5py.File(out_file, 'a') as f:
    for var_name, values in arrays.items():
        f[var_name][:]=values

In [None]:
#########################################
#RECOMBINE SEPARATE JOB-ARRAY OUTPUTS AFTERWARDS
# Gate for the recombination cell below; nothing is recombined while this is False.
recombine=False #KEEP FALSE WHEN JOB ARRAY IS RUNNING
# recombine=True

In [None]:
# #OLD VERSION
# if recombine==True:
#     dir3=dir+'Project_Algorithms/Lagrangian_Arrays/' 
#     out_file=dir3+f'VARS_binary_array_{res}_{t_res}_{Np_str}.h5' 
    
#     vars=['QV','TH','TH_E','BUOYANCY','HMC','VMF_c','VMF_g']

#     if t_res=='5min':
#         initiate_array(out_file,vars,t_chunk_size=20,p_chunk_size=500_000)
#     elif t_res=='1min':
#         initiate_array(out_file,vars,t_chunk_size=100,p_chunk_size=500_000)

#     print('combining')
#     with h5py.File(out_file, 'r+') as f_out:
#         num_jobs=3
#         for job_id in np.arange(1,num_jobs+1):
#             if np.mod(job_id,5)==0: print(f"job_id = {job_id}")
#             [a,b] = get_job_range(job_id,num_jobs)

#             dir2=dir+'Project_Algorithms/Lagrangian_Arrays/job_out/'
#             in_file=dir2+f'VARS_binary_array_{res}_{t_res}_{Np_str}_{job_id}.h5' 
#             with h5py.File(in_file, 'r') as f_in: 
#                 for var in vars:
#                     f_out[var][a:b]=f_in[var][:]

In [None]:
if recombine==True:
    #DASK-ENABLED
    import glob
    import re
    def recombine_func(in_files,out_file):
        """Concatenate the per-job HDF5 files into a single output file.

        Parameters
        ----------
        in_files : glob pattern matching the per-job files; each file name
            must end in ``_<job id>.h5``.
        out_file : path of the combined file to write.

        Files are concatenated along 'phony_dim_0' in ascending job-id order.
        Files whose name carries no job id are skipped with a message.

        Raises
        ------
        FileNotFoundError : if no usable job file matches ``in_files``.
        """
        from dask.diagnostics import ProgressBar

        job_pattern = re.compile(r'_(\d+)\.h5$')
        numbered = []
        for path in glob.glob(in_files):
            match = job_pattern.search(path)
            if match is None:
                # Previously this crashed the sort key with AttributeError
                print(f'skipping {path}: no job id in file name')
                continue
            numbered.append((int(match.group(1)), path))
        if not numbered:
            raise FileNotFoundError(f'No job files found matching {in_files}')

        # Numeric sort so that job 10 comes after job 9, not after job 1
        numbered.sort(key=lambda item: item[0])
        job_ids = [job for job, _ in numbered]
        matching_files = [path for _, path in numbered]

        # A missing job file would silently shorten the concatenated axis
        if job_ids != list(range(1, len(job_ids) + 1)):
            print(f'WARNING: job ids are not contiguous from 1: {job_ids}')

        print('recombining')
        # Context manager closes every opened input file once writing is done
        with xr.open_mfdataset(matching_files,engine='h5netcdf',concat_dim='phony_dim_0',combine='nested',phony_dims='sort') as out:
            with ProgressBar():
                out.to_netcdf(out_file, engine='h5netcdf')

    dir2=dir+'Project_Algorithms/Lagrangian_Arrays/job_out/'
    dir3=dir+'Project_Algorithms/Lagrangian_Arrays/'
    in_files = dir2 + f'VARS_binary_array_{res}_{t_res}_{Np_str}_*.h5'
    out_file=dir3+f'VARS_binary_array_{res}_{t_res}_{Np_str}.h5'

    recombine_func(in_files,out_file)

In [None]:
#########################################

In [None]:
# # Reading Back Data Later
# ##############
# import h5py
# dir2=dir+'Project_Algorithms/Lagrangian_Arrays/'
# with h5py.File(dir2+f'VARS_binary_array_{res}_{t_res}_{Np_str}.h5', 'r') as f:
#     # Load the dataset by its name
#     QV = f['QV'][:]
#     TH = f['TH'][:]
#     TH_E = f['TH_E'][:]
#     BUOYANCY = f['BUOYANCY'][:]
#     HMC = f['HMC'][:]
#     VMF_c = f['VMF_c'][:]
# check_memory()

In [None]:
##############################
#TESTING

In [None]:
##########################

In [None]:
# #INDIVIDUAL JOBS
# num_jobs=120
# job_id=2
# [start_job,end_job]=get_job_range(job_id, num_jobs)
# data2=data.isel(time=slice(start_job,end_job))

# dir2=dir+'Project_Algorithms/Lagrangian_Arrays/job_out/'
# open_file=dir2+f'VARS_binary_array_{res}_{t_res}_{Np_str}_{job_id}.h5'
# with h5py.File(open_file, 'r') as f:
#     # Load the dataset by its name
#     QV = f['QV'][:]
#     TH = f['TH'][:]
#     TH_E = f['TH_E'][:]
#     BUOYANCY = f['BUOYANCY'][:]
#     HMC = f['HMC'][:]
#     VMF_c = f['VMF_c'][:]
# check_memory()

In [None]:
# dir2=dir+'Project_Algorithms/Lagrangian_Arrays/'
# with h5py.File(dir2+f'lagrangian_binary_array_{res}_{t_res}_{Np_str}.h5', 'r') as f:
#     # Load the dataset by its name
#     Z = f['Z'][start_job:end_job]
#     Y = f['Y'][start_job:end_job]
#     X = f['X'][start_job:end_job]

In [None]:
# t,p=0,1000
# def test(t,p,VAR,var):
#     z=Z[t,p];y=Y[t,p];x=X[t,p]
#     out=data2[var].isel(time=t,zh=z,yh=y,xh=x).data
#     print(VAR[t,p],out)

# test(t,p,QV,'qv')
# test(t,p,TH,'th')
# test(t,p,BUOYANCY,'buoyancy')

In [None]:
#TESTING
# start_job=int(500/5);end_job=int(505/5)
# NOTE: this overrides the start_job/end_job globals with a fixed test window
start_job, end_job = 500, 502
data2=data.isel(time=slice(start_job,end_job))

dir2=dir+'Project_Algorithms/Lagrangian_Arrays/'
open_file=dir2+f'VARS_binary_array_{res}_{t_res}_{Np_str}.h5'
with h5py.File(open_file, 'r') as f:
    # Load the same row window from each dataset by its name
    QV, TH, TH_E, BUOYANCY, HMC, VMF_c = [
        f[name][start_job:end_job]
        for name in ('QV', 'TH', 'TH_E', 'BUOYANCY', 'HMC', 'VMF_c')
    ]
check_memory()

In [None]:
# Index arrays for the same start_job:end_job window
dir2=dir+'Project_Algorithms/Lagrangian_Arrays/'
with h5py.File(dir2+f'lagrangian_binary_array_{res}_{t_res}_{Np_str}.h5', 'r') as f:
    Z, Y, X = [f[name][start_job:end_job] for name in ('Z', 'Y', 'X')]

In [None]:
t,p=0,1000
def test(t,p,VAR,var):
    """Print VAR[t,p] next to the value of data2[var] at the cell given by Z, Y, X for (t, p)."""
    cell=dict(time=t,zh=Z[t,p],yh=Y[t,p],xh=X[t,p])
    out=data2[var].isel(**cell).data
    print(VAR[t,p],out)

# Spot-check one (timestep, parcel) pair for several variables
for lag_values, field in ((QV,'qv'),(TH,'th'),(BUOYANCY,'buoyancy')):
    test(t,p,lag_values,field)