In [None]:
#Loading in Packages and Data

#Importing Packages
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.ticker as ticker
import matplotlib.cm as cm
from matplotlib.colors import Normalize
from matplotlib.ticker import MaxNLocator
from matplotlib.ticker import ScalarFormatter
import matplotlib.gridspec as gridspec
import xarray as xr
import os; import time
import pickle
import h5py
###############################################################
def coefs(coefficients,degree):
    """Format polynomial coefficients as a readable string.

    Each term is written as ``(c)x^n`` with ``c`` in one-decimal scientific
    notation. Terms run from ``x^degree`` down to ``x^0`` and are joined by
    `` + ``. The coefficient paired with ``x^n`` is
    ``coefficients[len(coefficients)-(n+1)]``, so a sequence with exactly
    ``degree+1`` entries is read highest power first.

    Returns an empty string when ``degree`` is negative.
    """
    count=len(coefficients)
    terms=[
        f"({coefficients[count-(power+1)]:.1e})x^{power}"
        for power in range(degree, -1, -1)
    ]
    return " + ".join(terms)
###############################################################

#Importing Model Data
# Exactly one run configuration is active at a time. The active one comes
# first; the inactive alternatives are kept below, commented out. To switch,
# comment the active block and uncomment one alternative.
check=False
dir='/mnt/lustre/koa/koastore/torri_group/air_directory/DCI-Project/' # project directory (note: this name shadows the builtin dir())
dir2='/home/air673/koa_scratch/' # directory the active run's output files are read from

# ACTIVE: dx = 1km; Np = 50M
res='1km'
t_res='1min'
Np_str='50e6'
data=xr.open_dataset(dir2+'cm1out_1km_1min.nc') #***
parcel=xr.open_dataset(dir2+'cm1out_pdata_1km_1min_50M.nc') #***

# # dx = 1 km; Np = 1M; Nt = 5 min
# data=xr.open_dataset(dir+'../cm1r20.3/run/cm1out_1km_1e6.nc') #***
# parcel=xr.open_dataset(dir+'../cm1r20.3/run/cm1out_pdata_1km_1e6.nc') #***
# t_res='5min'; res='1km'
# res='1km'
# Np_str='1e6'

# NOTE(review): the heading below says "Nt = 1 min", but the file names and
# t_res in this block say 5min -- confirm which is intended.
# # dx = 1 km; Np = 1M; Nt = 1 min
# data=xr.open_dataset(dir+'../cm1r20.3/run/cm1out_1km_1e6_5min.nc') #***
# parcel=xr.open_dataset(dir+'../cm1r20.3/run/cm1out_pdata_1km_1e6_5min.nc') #***
# t_res='5min'; res='1km'
# Np_str='1e6'

# # dx = 1km; Np = 100M
# #Importing Model Data
# check=False
# dir2='/home/air673/koa_scratch/'
# data=xr.open_dataset(dir2+'cm1out_1km_1min.nc') #***
# parcel=xr.open_dataset(dir2+'cm1out_pdata_1km_1min_100M.nc') #***
# res='1km'; t_res='1min'; Np_str='100e6'


# # dx = 250 m
# #Importing Model Data
# check=False
# dir2='/home/air673/koa_scratch/'
# data=xr.open_dataset(dir2+'cm1out_250m.nc') #***
# parcel=xr.open_dataset(dir2+'cm1out_pdata_250m.nc') #***
# res='250m'
# Np_str='150e6'

In [None]:
#INITIALIZE DATA FUNCTION
###############################################################
def initiate_array(out_file, vars, t_chunk_size, p_chunk_size, t_size=None, p_size=None):
    """Create an HDF5 file holding one empty, chunked 2-D dataset per variable.

    Parameters
    ----------
    out_file : str
        Path of the HDF5 file. It is opened in 'w' mode, so an existing file
        at that path is truncated.
    vars : iterable of str
        Names of the datasets to create.
    t_chunk_size, p_chunk_size : int
        Requested HDF5 chunk extents along the first and second axis. Each is
        clamped to the dataset extent, because h5py raises ValueError when a
        chunk shape exceeds the shape of a fixed-size dataset.
    t_size : int, optional
        Length of the first axis; defaults to ``len(data['time'])``.
    p_size : int, optional
        Length of the second axis; defaults to ``len(parcel['xh'])``.

    Dataset dtypes: 'Z', 'Y', 'X' are uint16; 'A_g', 'A_c' are bool;
    everything else is float32.
    """
    if t_size is None:
        t_size = len(data['time'])  # Number of timesteps
    if p_size is None:
        p_size = len(parcel['xh'])  # Length of the parcel dataset's 'xh' axis

    # Never request a chunk larger than the dataset itself.
    chunks = (min(t_chunk_size, t_size), min(p_chunk_size, p_size))

    with h5py.File(out_file, 'w') as f:
        for var_name in vars:
            # Only reachable for a name repeated in `vars` (the file starts empty).
            if var_name in f:
                continue

            # Set dtype conditionally
            if var_name in ['Z', 'Y', 'X']:
                dtype = np.uint16
            elif var_name in ['A_g', 'A_c']:
                dtype = np.bool_
            else:
                dtype = np.float32

            f.create_dataset(
                var_name,
                shape=(t_size, p_size),
                chunks=chunks,
                dtype=dtype
            )

In [None]:
def check_memory():
    """Print the ten largest module-level objects and their combined size.

    Sizes come from ``sys.getsizeof`` and are reported in MB (bytes / 1e6,
    rounded to two decimals). Names starting with an underscore, names that
    are keys of ``sys.modules`` and a few IPython helper names are skipped.
    The final line reports the sum of those ten sizes in GB.
    """
    import sys
    skip_names = ["In", "Out", "exit", "quit", "get_ipython", "ipython_vars"]
    print("Top 10 objects with highest memory usage")

    # Collect (name, size in bytes) for every eligible global.
    namespace = globals()
    sizes = []
    for name in namespace:
        if name.startswith("_") or name in sys.modules or name in skip_names:
            continue
        sizes.append((name, sys.getsizeof(namespace.get(name))))

    # Largest first; keep the top ten, converted to MB.
    sizes.sort(key=lambda item: item[1], reverse=True)
    mem = {name: round(n_bytes/1e6,2) for name, n_bytes in sizes[:10]}

    print({name:f"{megabytes} MB" for name,megabytes in mem.items()})
    print(f"\n{round(sum(mem.values()),2)/1000} GB in use overall")

In [None]:
#JOB ARRAY SETUP
# True splits the timesteps across a SLURM job array; set to False to process
# every timestep in a single run.
job_array=True

if job_array==True:

    num_jobs=360 #how many total jobs are being run? i.e. array=1-100 ==> num_jobs=100 #***
    # Must be read before `data` is sliced down to this job's timesteps.
    total_elements=len(data['time']) #total number of timesteps to distribute

    if num_jobs >= total_elements:
        raise ValueError("Number of jobs cannot be greater than or equal to total elements.")

    job_range = total_elements // num_jobs  # Base size for each chunk
    remaining = total_elements % num_jobs   # Number of chunks with 1 extra

    def get_job_range(job_id, num_jobs):
        """Return the half-open timestep range ``(start, end)`` for one job.

        ``total_elements`` timesteps are split into ``num_jobs`` contiguous
        chunks; the first ``total_elements % num_jobs`` chunks get one extra
        element. ``job_id`` is 1-based.

        The split is derived from the ``num_jobs`` argument rather than from
        the module-level ``job_range``/``remaining``, so the result is correct
        for any ``num_jobs``, not only the one used to build those globals.

        Raises
        ------
        ValueError
            If ``job_id`` is not in ``1..num_jobs``.
        """
        if not 1 <= job_id <= num_jobs:
            raise ValueError(f"job_id must be between 1 and {num_jobs}, got {job_id}")

        base, extra = divmod(total_elements, num_jobs)
        index = job_id - 1  # zero-based position of this job
        # Every earlier job that received an extra element shifts the start by one.
        start_job = index * base + min(index, extra)
        end_job = start_job + base + (1 if index < extra else 0)
        return start_job, end_job

    # Current SBATCH array task id; falls back to job 1 outside of SLURM.
    job_id = int(os.environ.get('SLURM_ARRAY_TASK_ID', 0))
    if job_id==0: job_id=1
    start_job, end_job = get_job_range(job_id, num_jobs)
    index_adjust=start_job  # offset from job-local to global timestep indices
    print(f'start_job = {start_job}, end_job = {end_job}')

In [None]:
if job_array==True:
    #Indexing Array with JobArray
    # Keep only this job's timesteps in both datasets.
    # NOTE: `data` and `parcel` are rebound in place, so running this cell a
    # second time slices the already-sliced datasets again.
    data,parcel=(ds.isel(time=slice(start_job,end_job)) for ds in (data,parcel))
    #(for 150_000_000 parcels use 500-1000 jobs)
elif job_array==False:
    # Single run: the job covers every timestep and needs no index offset.
    start_job,index_adjust=0,0
    end_job=len(data['time'])

In [None]:
###########################################################################################################################################################################

In [None]:
#LOADING VARIABLES
###############################################################

In [None]:
# Reading Back Data Later
##############
# Load rows [start_job:end_job] of the 'Z', 'Y' and 'X' datasets. They are
# used further down as per-parcel integer indices into the profile arrays.
# (h5py is already imported in the first cell.)
dir2=dir+'Project_Algorithms/Lagrangian_Binary_Array/'
open_file=dir2+f'lagrangian_binary_array_{res}_{t_res}_{Np_str}.h5'
with h5py.File(open_file, 'r') as f:
    Z, Y, X = (f[name][start_job:end_job] for name in ('Z', 'Y', 'X'))

# #Making Time Matrix
# rows, cols = A.shape[0], A.shape[1]
# T = np.arange(rows).reshape(-1, 1) * np.ones((1, cols), dtype=int)
check_memory()

In [None]:
################################
#CALCULATION

In [None]:
#constants
Cp=1004 #Jkg-1K-1
Cv=717 #Jkg-1K-1
Rd=Cp-Cv #Jkg-1K-1
eps=0.608

# Grid coordinates are multiplied by 1000 (presumably km -> m; confirm the
# units of the 'xf'/'yf'/'zf' coordinates).
Lx=(data['xf'][-1].item()-data['xf'][0].item())*1000 #x length (m)
Ly=(data['yf'][-1].item()-data['yf'][0].item())*1000 #y length (m)
Np=len(parcel['xh']) #number of lagrangian parcels
# The output interval needs two timesteps. A job-array slice can hold a single
# timestep, in which case data['time'][1] would raise IndexError; dt is then
# left undefined (NaN) instead of aborting the job.
if len(data['time'])>1:
    dt=(data['time'][1]-data['time'][0]).item()/1e9 #sec
else:
    dt=np.nan
dx=(data['xf'][1].item()-data['xf'][0].item())*1e3 #meters
dy=(data['yf'][1].item()-data['yf'][0].item())*1e3 #meters
xs=data['xf'].values*1000
ys=data['yf'].values*1000
zs=data['zf'].values*1000

def zf(z):
    """Return ``data['zf']`` at level index ``z``, multiplied by 1000.

    ``z`` is an index into the 'zf' coordinate, not a height. The factor 1000
    is presumably a km -> m conversion (confirm the units of 'zf').
    """
    return data['zf'].values[z]*1000
# def rho(x,y,z,t):
#     p=data['prs'].isel(xh=x,yh=y,zh=z,time=t).item()
#     p0=101325 #Pa
#     theta=data['th'].isel(xh=x,yh=y,zh=z,time=t).item()
#     T=theta*(p/p0)**(Rd/Cp)
#     qv=data['qv'].isel(xh=x,yh=y,zh=z,time=t).item()
#     # Tv=T*(1+eps*qv)
#     Tv=T*(eps+qv)/(eps*(1+qv))
#     rho = p/(Rd*Tv)
#     out=rho
#     return out

# rho_data=data['rho'].data
def rho(x,y,z,t):
    """Return the 'rho' value at grid indices (x, y, z) and time index t.

    Uses the module-level ``rho_data`` array, indexed ``[t, z, y, x]``, when
    it exists. Its assignment is commented out in this notebook, so without a
    fallback every call raised NameError; in that case the value is read from
    ``data['rho']`` directly (slower per call, but correct).
    """
    cached=globals().get('rho_data')
    if cached is not None:
        return cached[t,z,y,x]
    return data['rho'].isel(xh=x,yh=y,zh=z,time=t).item()
def m(t):
    """Return the mass represented by one Lagrangian parcel at time index t.

    Computes the sum over every grid cell of ``rho * dz``, multiplied by the
    horizontal cell area ``dx*dy`` and divided by the number of parcels
    ``Np``.

    The sum is vectorised instead of looping over every cell in Python. It
    reads the density for the requested timestep straight from
    ``data['rho']``, so it does not depend on the module-level ``rho_data``
    array (whose assignment is commented out in this notebook).
    Assumes ``data['rho']`` has exactly the dimensions time, zh, yh, xh.
    """
    n_levels=len(data['zh'])
    # Layer thickness from consecutive interface heights: zf[k+1]-zf[k], scaled by 1000.
    interfaces=data['zf'].values[:n_levels+1].astype(np.float64)*1000
    dz=np.diff(interfaces)

    # Density at this timestep, explicitly ordered (zh, yh, xh).
    rho_t=data['rho'].isel(time=t).transpose('zh','yh','xh').values
    # Accumulate in float64 so the total does not depend on the stored dtype.
    layer_sum=rho_t.sum(axis=(1,2),dtype=np.float64)

    return float(np.dot(layer_sum,dz))*dx*dy/Np

In [None]:
#Calculate Mass Constant
# `calculate` selects how m_arr (one parcel-mass value per timestep) is obtained:
#   True          -> evaluate m(t) for every timestep and save the result
#   'single_time' -> evaluate m(t) once at the middle timestep and reuse it
#   anything else -> load a previously saved array from disk
# calculate='single_time'
calculate=False

dir3=dir+f'Project_Algorithms/Entrainment/'
# NOTE(review): the file name is fixed to the 5min variant; confirm it matches
# the active t_res before saving or loading.
mass_file=dir3+'Mass_Array_5min.npy'
# mass_file=dir3+'Mass_Array_1min.npy'

# NOTE(review): Nt below is len(data['time']). When `data` has been sliced to
# one job's timesteps the array covers only that slice, while m_arr is later
# indexed with global timestep indices (t+index_adjust) -- recompute with the
# full dataset.
if calculate==True:
    Nt=len(data['time'])
    m_arr=np.zeros((Nt))
    for t in np.arange(Nt):
        if np.mod(t,25)==0: print(t)
        # Previously commented out, which saved an all-zero array to disk.
        m_arr[t]=m(t)
    np.save(mass_file, m_arr)
elif calculate=='single_time':
    Nt=len(data['time'])
    m_arr=np.zeros((Nt))

    # Evaluate the mass once, at the middle timestep, and use it everywhere.
    t=len(data['time'])//2
    m_300=m(t)
    m_arr[:]=m_300
    np.save(mass_file, m_arr)
else:
    m_arr = np.load(mass_file)

# # TESTING
# lst=[]
# for t in np.arange(133):
#     lst.append(m_arr[t])

# plt.plot(lst)
# (np.max(lst)-np.min(lst))*100/np.mean(lst)

In [None]:
def call_variable(varname):
    """Load one VMF profile array for the current job and apply scaling constants.

    Parameters
    ----------
    varname : str
        'VMF_g' or 'VMF_c'; selects the dataset 'profile_array_VMF_g' or
        'profile_array_VMF_c' in the profiles HDF5 file.

    Returns
    -------
    numpy.ndarray
        Rows ``[start_job:end_job]`` of the dataset, scaled in this order:
        divided by ``dx*dy``, multiplied per first-axis index ``t`` by
        ``m_arr[t+index_adjust]``, and divided per second-axis index ``z`` by
        ``zf(z+1)-zf(z)``.

    Raises
    ------
    ValueError
        If ``varname`` is not one of the supported names (this previously
        surfaced as an UnboundLocalError after the file had been opened).
    """
    dataset_names={
        'VMF_g': 'profile_array_VMF_g',
        'VMF_c': 'profile_array_VMF_c',
    }
    if varname not in dataset_names:
        raise ValueError(
            f"Unknown varname {varname!r}; expected one of {sorted(dataset_names)}"
        )

    #READING BACK IN
    # True reads the PREPROCESSING file, False the final profiles file.
    PROCESSING=True

    if PROCESSING==True:
        profiles_path=dir+f'Project_Algorithms/Entrainment/3D_VMF_profiles_PREPROCESSING_{res}_{t_res}_{Np_str}.h5'
    else:
        profiles_path=dir+f'Project_Algorithms/Entrainment/3D_VMF_profiles_{res}_{t_res}_{Np_str}.h5'

    with h5py.File(profiles_path, "r") as h5f:
        profile_array = h5f[dataset_names[varname]][start_job:end_job]

    #APPLY CONSTANTS
    ##################################################
    profile_array/=(dx*dy)
    # index_adjust maps the job-local time index onto the global m_arr index.
    for t in range(profile_array.shape[0]):
        profile_array[t]*=m_arr[t+index_adjust]
    for z in range(profile_array.shape[1]):
        profile_array[:,z]/=(zf(z+1)-zf(z))
    ##################################################

    return profile_array

In [None]:
def make_lagrangian_array(varnames, p_chunk=1_000_000):
    """Sample each gridded variable at every parcel's grid position.

    For each name in ``varnames`` the array returned by ``call_variable`` is
    indexed at ``[t, Z[t, p], Y[t, p], X[t, p]]`` for every time index ``t``
    and parcel ``p``, giving one float32 array shaped like ``Z`` per variable.

    The gather runs in blocks of ``p_chunk`` parcels using NumPy integer-array
    indexing rather than one Python iteration per parcel. The values are
    identical; the block size only bounds the temporary index arrays.

    Parameters
    ----------
    varnames : list of str
        Variable names accepted by ``call_variable``.
    p_chunk : int, optional
        Number of parcels processed per block. A progress line is printed at
        the start of each block.

    Returns
    -------
    list of numpy.ndarray
        One array per entry of ``varnames``, in the same order.
    """
    # Initialize dictionaries
    var_data_dict = {varname: call_variable(varname) for varname in varnames}
    VAR = {varname: np.zeros_like(Z, dtype='float32') for varname in varnames}

    Nt = len(data['time'])
    Np = len(parcel['xh'])
    # Column vector so it broadcasts against the (Nt, block) index arrays.
    ts = np.arange(Nt)[:, None]

    for p_start in range(0, Np, p_chunk):
        print(f"{p_start}/{Np}")
        p_stop = min(p_start + p_chunk, Np)

        # Get Indices for this block of parcels
        zs = Z[:, p_start:p_stop]
        ys = Y[:, p_start:p_stop]
        xs = X[:, p_start:p_stop]

        # Fill the respective VAR array for every variable
        for varname, var_data in var_data_dict.items():
            VAR[varname][:, p_start:p_stop] = var_data[ts, zs, ys, xs]

    # Return all the arrays in a list
    return [VAR[varname] for varname in varnames]

In [None]:
print('Working on VMF Variables')
# One Lagrangian array per variable, returned in the order of `varnames`.
varnames=['VMF_g','VMF_c']
VMF_G,VMF_C=make_lagrangian_array(varnames)
check_memory()

In [None]:
# Saving Data
##############
print('Saving Data\n')
vars=['VMF_G','VMF_C']
dir2=dir+'Project_Algorithms/Lagrangian_Binary_Array/job_out/'

# Job-array runs write one file per job, tagged with the job id.
out_file=dir2+f'VMF_binary_array_{res}_{t_res}_{Np_str}'
if job_array==True:
    out_file+=f'_{job_id}.h5'
elif job_array==False:
    out_file+=f'.h5'

# Create the empty datasets first, then reopen the file and fill them.
initiate_array(out_file,vars,t_chunk_size=1,p_chunk_size=100_000)
with h5py.File(out_file, 'a') as f:
    for _name, _values in zip(vars, (VMF_G, VMF_C)):
        f[_name][:]=_values

In [None]:
#########################################
#RECOMBINE SEPARATE JOB_ARRAYS AFTER
# Switch for the recombination cells that follow: they act only when this is True.
# NOTE: a later cell defines a function that is also named `recombine`, which
# rebinds this name once that cell has executed.
recombine=False #KEEP FALSE WHEN JOB ARRAY IS RUNNING
# recombine=True

In [None]:
if recombine==True:
    # Merge the per-job files written by the save cell into one HDF5 file.
    # Relies on num_jobs, total_elements and get_job_range from the job-array
    # setup cell, so the split used here is the one the jobs were run with.
    dir2=dir+'Project_Algorithms/Lagrangian_Binary_Array/job_out/'
    dir3=dir+'Project_Algorithms/Lagrangian_Binary_Array/'
    out_file=dir3+f'VMF_binary_array_{res}_{t_res}_{Np_str}.h5'

    vars=['VMF_G','VMF_C']
    # Size the output for the full time axis: `data` may already be sliced to a
    # single job's timesteps, so its length cannot be used as the default here.
    initiate_array(
        out_file,vars,
        t_chunk_size=min(100,total_elements),
        p_chunk_size=100_000,
        t_size=total_elements,
    )

    with h5py.File(out_file, 'r+') as f_out:
        # A separate loop name keeps the SLURM `job_id` global intact.
        for part_id in np.arange(1,num_jobs+1):
            if np.mod(part_id,5)==0: print(f"job_id = {part_id}")
            [a,b] = get_job_range(part_id,num_jobs)

            # Same naming scheme the save cell uses for its per-job output.
            in_file=dir2+f'VMF_binary_array_{res}_{t_res}_{Np_str}_{part_id}.h5'
            with h5py.File(in_file, 'r') as f_in:
                for var in vars:
                    f_out[var][a:b]=f_in[var][:]

In [None]:
#DASK-ENABLED
# `recombine` is the True/False switch set in an earlier cell. The def below
# rebinds that name to the function, so the switch has to be read first;
# otherwise the `== True` test compares a function object and never fires.
# (If this cell is re-run, `recombine` is already the function and the
# recombination is skipped.)
run_recombine = recombine is True

def recombine(in_files,out_file):
    """Concatenate per-job HDF5 files into a single file.

    Parameters
    ----------
    in_files : str
        Glob pattern for the per-job files. Matches are ordered by the integer
        in their trailing ``_<number>.h5``; paths without such a suffix are
        ignored.
    out_file : str
        Path of the combined file, written with ``to_netcdf`` (h5netcdf engine).

    Raises
    ------
    FileNotFoundError
        If no file matching the pattern carries a numeric job suffix.
    """
    import glob
    import re

    job_suffix = re.compile(r'_(\d+)\.h5$')
    numbered = []
    for path in glob.glob(in_files):
        found = job_suffix.search(path)
        if found:
            numbered.append((int(found.group(1)), path))
    if not numbered:
        raise FileNotFoundError(f"No job files match {in_files}")
    # Numeric order, so job 10 comes after job 9 rather than after job 1.
    matching_files = [path for _, path in sorted(numbered)]

    print('recombining')
    out=xr.open_mfdataset(matching_files,engine='h5netcdf',concat_dim='phony_dim_0',combine='nested',phony_dims='sort')
    out.to_netcdf(out_file, engine='h5netcdf')

if run_recombine:
    # NOTE(review): this writes the same out_file as the h5py-based recombine
    # cell above; confirm only one of the two is meant to produce it.
    dir2=dir+'Project_Algorithms/Lagrangian_Binary_Array/job_out/'
    dir3=dir+'Project_Algorithms/Lagrangian_Binary_Array/'
    in_files = dir2 + f'VMF_binary_array_{res}_{t_res}_{Np_str}_*.h5'
    out_file=dir3+f'VMF_binary_array_{res}_{t_res}_{Np_str}.h5'

    recombine(in_files,out_file)

In [None]:
#########################################

In [None]:
# # Reading Back Data Later
# ##############
# import h5py
# dir2=dir+'Project_Algorithms/Lagrangian_Binary_Array/'
# with h5py.File(dir2+f'ED_binary_array_{res}_{t_res}_{Np_str}.h5', 'r') as f:
#     # Load the dataset by its name
#     E_G = f['E_G'][:]
#     E_C = f['E_C'][:]
#     D_G = f['D_G'][:]
#     D_C = f['D_C'][:]
# check_memory()