In [None]:
#This version is a full variable load version

In [None]:
#Loading in Packages and Data

#Importing Packages
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.ticker as ticker
import matplotlib.cm as cm
from matplotlib.colors import Normalize
from matplotlib.ticker import MaxNLocator
from matplotlib.ticker import ScalarFormatter
import matplotlib.gridspec as gridspec
import xarray as xr
import os; import time
import pickle
import h5py
###############################################################
def coefs(coefficients,degree):
    """Render polynomial coefficients as a human-readable string.

    Terms run from x^degree down to x^0 and are joined with " + ". The
    coefficient shown for x^n is read from position
    ``len(coefficients) - (n + 1)`` and formatted in scientific notation with
    one decimal place. A negative ``degree`` yields an empty string.
    """
    total = len(coefficients)
    terms = [
        f"({coefficients[total - (power + 1)]:.1e})" + f"x^{power}"
        for power in range(degree, -1, -1)
    ]
    return " + ".join(terms)
###############################################################

# Model input configuration
# `dir` is the project root; later cells build the Lagrangian-array paths from it.
# NOTE(review): `check` is only assigned in the visible cells, never read - confirm it is still needed.
check=False
dir='/mnt/lustre/koa/koastore/torri_group/air_directory/DCI-Project/'

# # dx = 1 km; Np = 1M; Nt = 5 min
# data=xr.open_dataset(dir+'../cm1r20.3/run/cm1out_1km_1e6.nc') #***
# parcel=xr.open_dataset(dir+'../cm1r20.3/run/cm1out_pdata_1km_1e6.nc') #***
# t_res='5min'; res='1km'
# res='1km'
# Np_str='1e6'

# Active configuration: dx = 1km; Np = 50M
# `data` is the gridded model output and `parcel` the parcel output; the three
# strings below are the tags used in every input/output file name.
dir2='/home/air673/koa_scratch/'
data=xr.open_dataset(os.path.join(dir2,'cm1out_1km_1min.nc')) #***
parcel=xr.open_dataset(os.path.join(dir2,'cm1out_pdata_1km_1min_50M.nc')) #***
res,t_res,Np_str='1km','1min','50e6'

# # dx = 1km; Np = 100M
# #Importing Model Data
# check=False
# dir2='/home/air673/koa_scratch/'
# data=xr.open_dataset(dir2+'cm1out_1km_1min.nc') #***
# parcel=xr.open_dataset(dir2+'cm1out_pdata_1km_1min_100M.nc') #***
# res='1km'; t_res='1min'; Np_str='100e6'


# # dx = 250 m
# #Importing Model Data
# check=False
# dir2='/home/air673/koa_scratch/'
# data=xr.open_dataset(dir2+'cm1out_250m.nc') #***
# parcel=xr.open_dataset(dir2+'cm1out_pdata_250m.nc') #***
# res='250m'
# Np_str='150e6'

In [None]:
#INITIALIZE DATA FUNCTION
###############################################################
def initiate_array(out_file, vars, t_chunk_size, p_chunk_size, t_size=None, p_size=None):
    """Create an HDF5 file containing one empty 2-D dataset per variable name.

    Parameters
    ----------
    out_file : str
        Path of the HDF5 file. It is opened in 'w' mode, so an existing file
        at that path is truncated.
    vars : iterable of str
        Dataset names to create. A name that appears more than once is only
        created the first time.
    t_chunk_size, p_chunk_size : int
        Requested chunk extent along the first and second axis. Each value is
        clamped to the dataset extent because h5py refuses a chunk shape that
        exceeds a fixed-size dataset in any dimension.
    t_size : int, optional
        Length of the first axis. Defaults to ``len(data['time'])`` of the
        module-level ``data``.
    p_size : int, optional
        Length of the second axis. Defaults to ``len(parcel['xh'])`` of the
        module-level ``parcel``.

    Notes
    -----
    Storage type per name: uint16 for 'Z', 'Y', 'X'; bool for 'A_g', 'A_c',
    'LFC', 'LCL'; float32 for anything else.
    """
    if t_size is None:
        t_size = len(data['time'])  # Number of timesteps
    if p_size is None:
        p_size = len(parcel['xh'])  # Length of the parcel file's 'xh' axis

    # Keep the requested chunking whenever it fits; shrink it otherwise so
    # short time axes (e.g. a single job's slice) do not make h5py raise.
    chunks = (min(t_chunk_size, t_size), min(p_chunk_size, p_size))

    index_vars = ('Z', 'Y', 'X')
    flag_vars = ('A_g', 'A_c', 'LFC', 'LCL')

    with h5py.File(out_file, 'w') as f:
        for var_name in vars:
            if var_name in f:
                # Already created earlier in this call (duplicate name in `vars`).
                continue

            if var_name in index_vars:
                dtype = np.uint16
            elif var_name in flag_vars:
                dtype = np.bool_
            else:
                dtype = np.float32

            f.create_dataset(
                var_name,
                shape=(t_size, p_size),
                chunks=chunks,
                dtype=dtype
            )

In [None]:
def check_memory():
    """Print the ten largest global objects and their combined size.

    Every name in this module's global namespace is measured with
    ``sys.getsizeof``, except names starting with an underscore, names that
    are keys of ``sys.modules``, and a short list of IPython bookkeeping
    names. Sizes are reported in MB (two decimals); the final line reports the
    sum of those ten values divided by 1000, labelled as GB.
    """
    import sys

    skipped_names = ("In", "Out", "exit", "quit", "get_ipython", "ipython_vars")
    print("Top 10 objects with highest memory usage")

    # Collect (name, size in bytes) for every eligible global.
    namespace = globals()
    sized = []
    for name in namespace:
        if name.startswith("_"):
            continue
        if name in sys.modules or name in skipped_names:
            continue
        sized.append((name, sys.getsizeof(namespace.get(name))))

    # Largest first; keep only the top ten, converted to MB.
    sized.sort(key=lambda pair: pair[1], reverse=True)
    mem = {}
    for name, n_bytes in sized[:10]:
        mem[name] = round(n_bytes/1e6,2)

    print({name: f"{megabytes} MB" for name, megabytes in mem.items()})
    print(f"\n{round(sum(mem.values()),2)/1000} GB in use overall")

In [None]:
#JOB ARRAY SETUP
# Toggle: the last assignment wins. Leave `job_array=True` active to run this
# notebook as one task of a SLURM job array.
job_array=False
job_array=True

if job_array==True:

    num_jobs=360 #how many total jobs are being run? i.e. array=1-100 ==> num_jobs=100 #***
    total_elements=len(data['time']) #number of timesteps to divide among the jobs

    if num_jobs >= total_elements:
        raise ValueError("Number of jobs cannot be greater than or equal to total elements.")

    job_range = total_elements // num_jobs  # Base size for each chunk
    remaining = total_elements % num_jobs   # Number of chunks with 1 extra

    # Function to compute the start and end for each job_id
    def get_job_range(job_id, num_jobs):
        """Return the half-open timestep range ``(start, end)`` for a job.

        ``job_id`` is 1-based. The first ``remaining`` jobs receive one extra
        timestep so that all ``total_elements`` timesteps are covered.
        ``job_range``, ``remaining`` and ``total_elements`` are read from the
        module scope, so ``num_jobs`` must match the value they were computed
        with.
        """
        job_id-=1
        # Add one extra element to the first 'remaining' chunks
        start_job = job_id * job_range + min(job_id, remaining)
        end_job = start_job + job_range + (1 if job_id < remaining else 0)

        # The last job always runs to the end of the time axis.
        if job_id == num_jobs - 1:
            end_job = total_elements #- 1
        return start_job, end_job
    # def job_testing():
    #     #TESTING
    #     start=[];end=[]
    #     for job_id in range(1,num_jobs+1):
    #         start_job, end_job = get_job_range(job_id, num_jobs)
    #         print(start_job,end_job)
    #         start.append(start_job)
    #         end.append(end_job)
    #     print(np.all(start!=end))
    #     print(len(np.unique(start))==len(start))
    #     print(len(np.unique(end))==len(end))
    # job_testing()

    job_id = int(os.environ.get('SLURM_ARRAY_TASK_ID', 0)) #this is the current SBATCH job id
    # No array task id (e.g. interactive run) falls back to the first job.
    if job_id==0: job_id=1
    # An id past num_jobs would map to an empty timestep slice and only fail
    # much later with an unrelated-looking error, so stop here instead.
    if not 1 <= job_id <= num_jobs:
        raise ValueError(f"SLURM_ARRAY_TASK_ID={job_id} is outside the expected range 1..{num_jobs}.")
    start_job, end_job = get_job_range(job_id, num_jobs)
    index_adjust=start_job
    print(f'start_job = {start_job}, end_job = {end_job}')

In [None]:
# Either cut both datasets down to this job's timesteps, or define the
# full-range bookkeeping values when no job array is used.
if job_array:
    #Indexing Array with JobArray
    #(for 150_000_000 parcels use 500-1000 jobs)
    job_slice=slice(start_job,end_job)
    data=data.isel(time=job_slice)
    parcel=parcel.isel(time=job_slice)
else:
    start_job=0
    end_job=len(data['time'])
    index_adjust=0

In [None]:
###########################################################################################################################################################################

In [None]:
#LOADING VARIABLES
###############################################################

In [None]:
# Loading Important Variables
##############
# Load the 'lfc' and 'lcl' fields once per kernel session. `empty_like` is the
# sentinel assigned at the end of this cell; the guard has to test that exact
# name, otherwise the fields are reloaded every time the cell is run.
if 'empty_like' not in globals():
    print('loading neccessary variables')
    variable='lfc'; LFC_data=data[variable].data #array behind the 'lfc' field
    variable='lcl'; LCL_data=data[variable].data #array behind the 'lcl' field
    print('done')
    empty_like=True

In [None]:
# Reading Back Data Later
##############
# Read this job's rows of 'z', 'Z', 'Y' and 'X' from the Lagrangian array
# file. Y and X are later used as grid indices into the LFC/LCL fields.
import h5py
dir2=dir+'Project_Algorithms/Lagrangian_Arrays/'
in_file=dir2+f'lagrangian_binary_array_{res}_{t_res}_{Np_str}.h5'
with h5py.File(in_file, 'r') as f:
    # The generator is consumed by the unpacking, i.e. while the file is open.
    parcel_z, Z, Y, X = (
        f[name][slice(start_job,end_job)] for name in ('z', 'Z', 'Y', 'X')
    )

# #Making Time Matrix
# rows, cols = A.shape[0], A.shape[1]
# T = np.arange(rows).reshape(-1, 1) * np.ones((1, cols), dtype=int)

In [None]:
#MAKING LAGRANGIAN BINARY ARRAY
###############################################################

In [None]:
# import tracemalloc
# tracemalloc.start()

# For every (time, parcel) sample, flag whether the parcel height is at or
# above the LFC / LCL value found at the (time, Y, X) position of that sample.
LFC=np.zeros_like(Z,dtype='bool')
LCL=np.zeros_like(Z,dtype='bool')

Nt=len(data['time'])
Np=len(parcel['xh'])

# Time index as a column vector so it broadcasts against (Nt, n_parcels)
# blocks of Y and X; built once instead of once per parcel.
ts=np.arange(Nt)[:,None]

# Process parcels in blocks: one broadcast fancy-indexing call per block gives
# the same values as indexing parcel by parcel, without a Python-level
# iteration for each of the Np parcels. The block size bounds the temporaries
# and keeps the progress output at one line per million parcels.
p_block=1_000_000
for p in range(0,Np,p_block):
    print(f"{p}/{Np}")
    sel=slice(p,min(p+p_block,Np))

    #Get Indicies
    zs=parcel_z[:,sel]
    ys=Y[:,sel]
    xs=X[:,sel]

    #Get Values
    lfcs = LFC_data[ts, ys, xs]
    lcls = LCL_data[ts, ys, xs]

    LFC[:,sel]=(zs>=lfcs)
    LCL[:,sel]=(zs>=lcls)

# current, peak = tracemalloc.get_traced_memory()
# print(f"Current memory usage: {current / 1024} KB; Peak memory usage: {peak / 1024} KB")
# tracemalloc.stop()

In [None]:
# Print the ten largest objects in the global namespace (by sys.getsizeof)
# now that the flag arrays have been built.
check_memory()

In [None]:
# Saving Data
##############
# Write the LFC/LCL flag arrays to this job's output file (or to a single
# file when no job array is used).
import h5py
dir2=dir+'Project_Algorithms/Lagrangian_Arrays/job_out/'
os.makedirs(dir2,exist_ok=True) #h5py cannot create a file inside a missing directory
out_file=dir2+f'LFC_LCL_binary_array_{res}_{t_res}_{Np_str}'
if job_array==True:
    out_file+=f'_{job_id}.h5'
elif job_array==False:
    out_file+=f'.h5'

vars=['LFC','LCL']
# Size the datasets from the arrays that are about to be written, so the file
# layout cannot disagree with them.
initiate_array(out_file,vars,t_chunk_size=1,p_chunk_size=500_000,
               t_size=LFC.shape[0],p_size=LFC.shape[1])
with h5py.File(out_file, 'a') as f:
    # Save the array as a variable in the file
    f['LFC'][:]=LFC #True where parcel z >= the 'lfc' value at the parcel's (time, Y, X)
    f['LCL'][:]=LCL #True where parcel z >= the 'lcl' value at the parcel's (time, Y, X)

In [None]:
#########################################
#RECOMBINE SEPARATE JOB_ARRAYS AFTER
# The recombine cells below only run when this flag is True; they read the
# per-job output files, so switch it on only once those files exist.
recombine=False #KEEP FALSE WHEN JOB ARRAY IS RUNNING
# recombine=True

In [None]:
%%time
if recombine==True:
    dir2=dir+'Project_Algorithms/Lagrangian_Arrays/job_out/'
    dir3=dir+'Project_Algorithms/Lagrangian_Arrays/'
    out_file=dir3+f'LFC_LCL_binary_array_{res}_{t_res}_{Np_str}.h5'
    
    vars=['LFC','LCL']
    initiate_array(out_file,vars,t_chunk_size=100,p_chunk_size=500_000)
    
    with h5py.File(out_file, 'r+') as f_out:
        num_jobs=360
        for job_id in np.arange(1,num_jobs+1):
            if np.mod(job_id,5)==0: print(f"job_id = {job_id}")
            [a,b] = get_job_range(job_id,num_jobs)
     
            in_file=dir2+f'LFC_LCL_binary_array_{res}_{t_res}_{Np_str}_{job_id}.h5' 
            with h5py.File(in_file, 'r') as f_in: 
                for var in vars:
                    f_out[var][a:b]=f_in[var][:]

In [None]:
import glob
import re
#DASK-ENABLED
def recombine_func(in_files,out_file):
    """Concatenate per-job HDF5 files into one file, ordered by job number.

    Parameters
    ----------
    in_files : str
        Glob pattern for the per-job files. Only files whose name ends in
        ``_<digits>.h5`` are used; they are ordered by that number.
    out_file : str
        Path of the combined file, written with the h5netcdf engine.

    Raises
    ------
    FileNotFoundError
        If no file with a numeric job suffix matches ``in_files``.
    """
    from dask.diagnostics import ProgressBar

    # Pair each file with its job number. Files without a numeric suffix are
    # left out instead of crashing the sort key on a failed regex match.
    suffix=re.compile(r'_(\d+)\.h5$')
    numbered=[]
    for path in glob.glob(in_files):
        found=suffix.search(path)
        if found:
            numbered.append((int(found.group(1)),path))
    if not numbered:
        raise FileNotFoundError(f"No job files matching {in_files}")

    # Numeric order, so that e.g. job 10 follows job 9 rather than job 1.
    matching_files=[path for _,path in sorted(numbered,key=lambda item: item[0])]

    print('recombining')
    # Context manager closes the underlying files once the output is written.
    with xr.open_mfdataset(matching_files,engine='h5netcdf',concat_dim='phony_dim_0',combine='nested',phony_dims='sort') as out:
        with ProgressBar():
            out.to_netcdf(out_file, engine='h5netcdf')
    
# Merge the per-job files with the dask-backed helper when recombination is
# switched on. The target path is the same one the h5py-based recombine cell
# writes to.
if recombine:
    dir3=dir+'Project_Algorithms/Lagrangian_Arrays/'
    dir2=dir+'Project_Algorithms/Lagrangian_Arrays/job_out/'
    out_file=dir3+f'LFC_LCL_binary_array_{res}_{t_res}_{Np_str}.h5'
    in_files = dir2 + f'LFC_LCL_binary_array_{res}_{t_res}_{Np_str}_*.h5'

    recombine_func(in_files=in_files,out_file=out_file)

In [None]:
#########################################

In [None]:
# # Reading Back Data Later
# ##############
# import h5py
# dir2=dir+'Project_Algorithms/Lagrangian_Arrays/job_out/'
# open_file=dir2+f'LFC_LCL_binary_array_{res}_{t_res}_{Np_str}_{job_id}.h5'
# with h5py.File(open_file, 'r') as f:
#     # Load the dataset by its name
#     LFC = f['LFC'][start_job:end_job]
#     LCL = f['LCL'][start_job:end_job]