In [None]:
#Loading in Packages and Data

#Importing Packages
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.ticker as ticker
import matplotlib.cm as cm
from matplotlib.colors import Normalize
from matplotlib.ticker import MaxNLocator
from matplotlib.ticker import ScalarFormatter
import matplotlib.gridspec as gridspec
import xarray as xr
import os; import time
import pickle
import h5py
###############################################################
def coefs(coefficients,degree):
    """
    Format polynomial coefficients as a readable string.

    Terms are written from x^degree down to x^0 and joined with " + ". Each
    coefficient is shown in scientific notation with one decimal place.

    Parameters:
    - coefficients: indexable sequence of numbers. The coefficient of x^n is
      read from position len(coefficients)-(n+1), i.e. the constant term is
      the last entry.
    - degree: highest power to print.

    Returns:
    - str such as "(1.0e+00)x^1 + (2.0e+00)x^0" (empty string if degree < 0).
    """
    last = len(coefficients)-1
    terms = [
        f"({coefficients[last-power]:.1e})x^{power}"
        for power in range(degree, -1, -1)
    ]
    return " + ".join(terms)
###############################################################

#Importing Model Data
check=False
# Project root used to build every input/output path further down.
dir='/mnt/lustre/koa/koastore/torri_group/air_directory/DCI-Project/'

# # dx = 1 km; Np = 1M; Nt = 5 min
# data=xr.open_dataset(dir+'../cm1r20.3/run/cm1out_1km_5min.nc') #***
# parcel=xr.open_dataset(dir+'../cm1r20.3/run/cm1out_pdata_1km_5min_1e6.nc') #***
# res='1km'
# Np_str='1e6'

# dx = 1km; Np = 50M  (active configuration)
dir2='/home/air673/koa_scratch/'
data=xr.open_dataset(dir2+'cm1out_1km_1min.nc') #***
parcel=xr.open_dataset(dir2+'cm1out_pdata_1km_1min_50M.nc') #***
# Tags substituted into the input/output file names of the later cells.
res='1km'
t_res='1min'
Np_str='50e6'

# # dx = 1km; Np = 100M
# #Importing Model Data
# check=False
# dir2='/home/air673/koa_scratch/'
# data=xr.open_dataset(dir2+'cm1out_1km_1min.nc') #***
# parcel=xr.open_dataset(dir2+'cm1out_pdata_1km_1min_100M.nc') #***
# res='1km'; t_res='1min'; Np_str='100e6'


# dx = 250 m
# #Importing Model Data
# check=False
# dir2='/home/air673/koa_scratch/'
# data=xr.open_dataset(dir2+'cm1out_250m.nc') #***
# parcel=xr.open_dataset(dir2+'cm1out_pdata_250m.nc') #***

In [None]:
import sys
# Append the shared helper directory (../Functions/ relative to the project
# root) to the module search path so the two modules below can be imported.
dir2='/mnt/lustre/koa/koastore/torri_group/air_directory/DCI-Project/'
path=dir2+'../Functions/'
sys.path.append(path)

# NOTE(review): the star imports place every public name of these modules in the
# notebook namespace. Helpers that are called later without being defined in
# this notebook (e.g. check_memory) presumably come from here -- confirm.
import NumericalFunctions
from NumericalFunctions import * # import NumericalFunctions 
import PlottingFunctions
from PlottingFunctions import * # import PlottingFunctions


# # Get all functions in NumericalFunctions
# import inspect
# functions = [f[0] for f in inspect.getmembers(NumericalFunctions, inspect.isfunction)]
# functions

In [None]:
#INITIALIZE DATA FUNCTION
###############################################################
def initiate_array(out_file,vars,t_chunk_size,z_chunk_size,t_size=None,z_size=None):
    """
    Create an HDF5 file containing one empty 2D dataset per requested variable.

    Parameters:
    - out_file: path of the HDF5 file. It is opened in 'w' mode, so an existing
      file at that path is truncated.
    - vars: iterable of dataset names to create.
    - t_chunk_size, z_chunk_size: requested chunk lengths along the first and
      second axis. Each is clipped to the dataset length along that axis,
      because h5py rejects chunks larger than a fixed-size dataset.
    - t_size: length of the first axis; defaults to len(data['time']) of the
      notebook-level `data` at call time.
    - z_size: length of the second axis; defaults to len(data['zh']).

    Notes:
    - No dtype is passed to create_dataset, so h5py's default dtype
      (single-precision float) is used for the stored values.
    """
    if t_size is None:
        t_size = len(data['time'])  # Number of timesteps
    if z_size is None:
        z_size = len(data['zh'])    # Number of vertical levels

    # Clip the chunk shape so short datasets (e.g. a job holding only a few
    # time steps) do not make create_dataset raise ValueError.
    chunk_shape = (min(t_chunk_size, t_size), min(z_chunk_size, z_size))

    with h5py.File(out_file, 'w') as f:
        for var_name in vars:
            # The file starts empty, so this only guards against a name that
            # appears more than once in `vars`.
            if var_name not in f:
                f.create_dataset(var_name,
                                 (t_size, z_size),
                                 chunks=chunk_shape)

In [None]:
#JOB ARRAY SETUP
job_array=False
job_array=True

if job_array==True:

    num_jobs=180 #how many total jobs are being run? i.e. array=1-100 ==> num_jobs=100 #***
    total_elements=len(data['time']) #total number of time steps to split across the jobs

    if num_jobs >= total_elements:
        raise ValueError("Number of jobs cannot be greater than or equal to total elements.")
    
    job_range = total_elements // num_jobs  # Base size for each chunk
    remaining = total_elements % num_jobs   # Number of chunks with 1 extra 
    
    def get_job_range(job_id, num_jobs):
        """
        Return the half-open (start, end) time-index window handled by one job.

        The `total_elements` time steps are split into `num_jobs` contiguous
        windows; the first `total_elements % num_jobs` windows get one extra
        element, so the windows tile the whole range without gaps or overlap.

        Parameters:
        - job_id: 1-based job number (the SLURM array task id).
        - num_jobs: total number of jobs the range is divided into.

        Returns:
        - (start_job, end_job), suitable for slice(start_job, end_job).

        Raises:
        - ValueError if job_id is outside 1..num_jobs.
        """
        if not 1 <= job_id <= num_jobs:
            raise ValueError(f"job_id must be between 1 and {num_jobs}, got {job_id}.")
        # Derive the chunk sizes from the num_jobs argument itself so the
        # result stays consistent with the value the caller passes in.
        base, extra = divmod(total_elements, num_jobs)
        idx = job_id - 1
        # Add one extra element to the first 'extra' chunks
        start_job = idx * base + min(idx, extra)
        end_job = start_job + base + (1 if idx < extra else 0)
        return start_job, end_job
    # def job_testing():
    #     #TESTING
    #     start=[];end=[]
    #     for job_id in range(1,num_jobs+1):
    #         start_job, end_job = get_job_range(job_id, num_jobs)
    #         print(start_job,end_job)
    #         start.append(start_job)
    #         end.append(end_job)
    #     print(np.all(start!=end))
    #     print(len(np.unique(start))==len(start))
    #     print(len(np.unique(end))==len(end))
    # job_testing()
    
    job_id = int(os.environ.get('SLURM_ARRAY_TASK_ID', 0)) #this is the current SBATCH job id
    # Outside a SLURM array (variable unset, or task id 0) fall back to a fixed
    # job so the notebook can be run interactively.
    if job_id==0: job_id=135
    start_job, end_job = get_job_range(job_id, num_jobs)
    index_adjust=start_job
    print(f'start_job = {start_job}, end_job = {end_job}')

In [None]:
if job_array==True:
    #Restrict both datasets to this job's time window.
    #NOTE: this rebinds data/parcel, so re-running the cell slices them again.
    data,parcel=(ds.isel(time=slice(start_job,end_job)) for ds in (data,parcel))
    #(for 150_000_000 parcels use 500-1000 jobs)

In [None]:
# Reading Back Data Later
##############
def make_data_dict(in_file,var_names,read_type):
    """
    Load the named arrays from an HDF5 file into a dict of in-memory arrays.

    In job-array mode (notebook-level job_array==True) only rows
    start_job:end_job of each dataset are read; otherwise the full dataset is
    read.

    Parameters:
    - in_file: path of the HDF5 file to read.
    - var_names: iterable of dataset names to load.
    - read_type: 'h5py' to read through h5py directly, or 'xarray' to read
      through xarray's h5netcdf engine with dask chunking.

    Returns:
    - dict mapping each name in var_names to the loaded array.

    Raises:
    - ValueError if read_type is not one of the two supported backends.
    """
    if read_type not in ('h5py','xarray'):
        raise ValueError(f"Unknown read_type {read_type!r}; expected 'h5py' or 'xarray'.")

    # Rows to read: this job's window, or everything outside job-array mode.
    rows = slice(start_job,end_job) if job_array==True else slice(None)

    if read_type=='h5py':
        with h5py.File(in_file, 'r') as f:
            return {var_name: f[var_name][rows] for var_name in var_names}

    # Open as a context manager so the file handle is released once the
    # requested rows have been computed into memory.
    with xr.open_dataset(
        in_file,
        engine='h5netcdf',
        phony_dims='sort',
        chunks={'phony_dim_0': 100, 'phony_dim_1': 1_000_000}
    ) as in_data:
        return {k: in_data[k][rows].compute().data for k in var_names}
# read_type='xarray'
read_type='h5py'

In [None]:
import h5py
# Input file holding the Lagrangian parcel arrays for the selected run
dir2=dir+'Project_Algorithms/Lagrangian_Arrays/'
in_file=dir2+f'lagrangian_binary_array_{res}_{t_res}_{Np_str}.h5'

# Load every named array, then bind each one to a notebook-level variable
var_names = ['A_g', 'A_c', 'W', 'Z', 'Y', 'X']
data_dict = make_data_dict(in_file,var_names,read_type)
A_g = data_dict['A_g']
A_c = data_dict['A_c']
W = data_dict['W']
Z = data_dict['Z']
Y = data_dict['Y']
X = data_dict['X']

# Time-index matrix: row i is filled with i and broadcast to the shape of A_c
Nt=len(data['time'])
T = np.broadcast_to(np.arange(Nt).reshape(Nt, 1), A_c.shape)  # shape: (Nt, p)

check_memory(globals())

In [None]:
#READING BACK IN
# Processed binary arrays are read from the Entrainment output directory
dir2=dir+'Project_Algorithms/Entrainment/'
in_file=dir2+f'processed_binary_arrays_{res}_{t_res}_{Np_str}.h5'

var_names = ['A_g_Processed', 'A_c_Processed']
data_dict = make_data_dict(in_file,var_names,read_type)
A_g_Processed = data_dict['A_g_Processed']
A_c_Processed = data_dict['A_c_Processed']
check_memory(globals())

In [None]:
#############################################################################
#############################################################################

In [None]:
def VMF2d(A, T, Z, W=None, Nt=None, Nz=None):
    """
    Compute a 2D (t,z) mass-flux profile from Lagrangian parcel arrays.

    Every parcel sample contributes A*W to the output cell addressed by its
    (T, Z) index pair. Samples that fall in the same cell are summed, which is
    why np.add.at is used instead of plain fancy-index assignment.

    Parameters:
    - A: The (t,p) Lagrangian binary array.
    - T: The (t,p) Lagrangian time index array.
    - Z: The (t,p) Lagrangian z index array.
    - W: Optional (t,p) array multiplied element-wise with A. Defaults to the
      notebook-level W.
    - Nt, Nz: Optional output shape. Default to len(data['time']) and
      len(data['zh']) of the notebook-level data.

    Returns:
    - (Nt, Nz) float array of the accumulated A*W values.
    """
    start_time = time.time()

    if W is None:
        W = globals()['W']  # fall back to the array loaded at notebook level

    # Weight of each parcel sample: W where the binary mask A is set, else 0
    D = A * W
    
    # # Update D for entrainment/detrainment
    # if type=='e':
    #     D[D <= 0] = 0
    # elif type=='d':
    #     D[D >= 0] = 0
    
    # Output dimensions
    if Nt is None:
        Nt = len(data['time'])
    if Nz is None:
        Nz = len(data['zh'])
    
    # Initialize result array
    result = np.zeros((Nt, Nz))
    
    # Unbuffered accumulation: repeated (T, Z) pairs each add their value
    np.add.at(result, (T, Z), D)
    
    end_time = time.time()
    print(f"Execution time: {(end_time - start_time)} seconds")
    return result

In [None]:
# #TESTING TESTING TESTING
# from collections import defaultdict
# def find_repeated_columns(arr):
#     """
#     Given a 2D array of shape (4, N), find indices of columns that repeat (are identical).
#     Returns a list of index arrays, where each array contains indices of repeated columns.
#     """
#     arr_T = arr.T  # shape becomes (N, 4)
    
#     # Dictionary to store unique rows and their corresponding indices
#     row_dict = defaultdict(list)
    
#     for idx, row in enumerate(arr_T):
#         row_tuple = tuple(row)
#         row_dict[row_tuple].append(idx)
    
#     # Return only groups with repeated columns (length >= 2)
#     repeated_indices = [np.array(indices) for indices in row_dict.values() if len(indices) > 1]
    
#     return repeated_indices

# # Example usage
# arr1 = np.array([1, 2,2, 3, 4, 4, 5, 6,6,6])
# arr2 = np.array([1, 3,3, 3, 4, 4, 5, 6,6,6])
# arr3 = np.array([1, 4,4, 3, 4, 4, 5, 6,6,6])
# arr4 = np.array([1, 5,5, 3, 4, 4, 5, 6,6,6])
# arr = np.array([arr1, arr2, arr3, arr4])

# result = find_repeated_columns(arr)
# print(result)


In [None]:
# #TESTING TESTING TESTING #*#*
# where=np.where(A==1)
# arr1=T[where]
# arr2=Z[where]
# arr3=Y[where]
# arr4=X[where] 
# arrs=np.array([arr2,arr3,arr4]) 
# arrs

# out=find_repeated_columns(arrs)
# print(out)

# # one=where[0][1],where[1][1]
# # two=where[0][2],where[1][2]
# # one,two
# # T[one],Z[one],Y[one],X[one]
# # T[two],Z[two],Y[two],X[two]

In [None]:
# #TESTING TESTING TESTING
# def apply_function(A):
#     # A1=A.copy() #TESTING

#     where=np.where(A==1)
#     arr1=T[where]
#     arr2=Z[where]
#     arr3=Y[where]
#     arr4=X[where] 
#     arr=np.array([arr1,arr2,arr3,arr4]) 

#     out = find_repeated_columns(arr)
#     # print(out)

#     for ind in np.arange(len(out)):
#         extras=out[ind][1:]
#         A[(where[0][extras],where[1][extras])]=0

#     # A2=A.copy() #TESTING
#     # print(np.all(A1!=A2)) #TESTING

#     #TESTING 
#     # for k in range(len(out)):
#     #     ind1=out[k][0]
#     #     ind2=out[k][1]
#     #     print((arr1[ind1],arr2[ind1],arr3[ind1],arr4[ind1])==(arr1[ind2],arr2[ind2],arr3[ind2],arr4[ind2]))
#     return A

# # apply_function(A) #TESTING

In [None]:
#TURN PROCESSING ON OR OFF
PROCESSING=False
PROCESSING=True

# General updrafts: pick the raw or the processed binary array
print('Calculating 2D VMF for General Updrafts')
if PROCESSING==False:
    A = A_g
else:
    A = A_g_Processed
# A=apply_function(A) #TESTING TESTING TESTING
profile_array_VMF_g = VMF2d(A, T, Z)


# Cloudy updrafts: same selection for the cloudy binary array
print('Calculating 2D VMF for Cloudy Updrafts')
if PROCESSING==False:
    A = A_c
else:
    A = A_c_Processed
# A=apply_function(A) #TESTING TESTING TESTING
profile_array_VMF_c = VMF2d(A, T, Z)

In [None]:
# Per-job results go to a scratch sub-directory; a single full run writes
# straight to the Entrainment directory.
if job_array==True:
    dir2=dir+'Project_Algorithms/Entrainment/job_out_4/'
elif job_array==False:
    dir2=dir+'Project_Algorithms/Entrainment/'

# Make sure the output directory exists so the (expensive) results are not lost
# to a missing-directory error. exist_ok keeps this safe when many array jobs
# reach this line at the same time.
os.makedirs(dir2, exist_ok=True)

#SAVING
if PROCESSING==False:
    out_file=dir2+f'2D_VMF_profiles_{res}_{t_res}_{Np_str}'
elif PROCESSING==True:
    out_file=dir2+f'2D_VMF_profiles_PREPROCESSING_{res}_{t_res}_{Np_str}'
    
# Job-array runs get the job id appended so each job writes its own file
if job_array==True:
    out_file+=f'_{job_id}.h5'
elif job_array==False:
    out_file+=f'.h5'

# Create the empty datasets (sized from the current, possibly sliced, `data`),
# then fill them with the computed profiles.
vars=["profile_array_VMF_g","profile_array_VMF_c"]
initiate_array(out_file,vars,t_chunk_size=1,z_chunk_size=17)

with h5py.File(out_file, 'a') as f: 
    f['profile_array_VMF_g'][:]=profile_array_VMF_g
    f['profile_array_VMF_c'][:]=profile_array_VMF_c
print('done')

In [None]:
# NOTE(review): check_memory is not defined in this notebook; it presumably comes
# from one of the star-imported helper modules and reports the memory held by
# the notebook's global variables -- confirm.
check_memory(globals())

In [None]:
#########################################
#RECOMBINE SEPARATE JOB_ARRAYS AFTER
# Switch for the recombination cell below: when True, the per-job output files
# are merged into a single file.
recombine=False #KEEP FALSE DURING JOB ARRAY RUN
# recombine=True

In [None]:
if recombine==True:
    # get_job_range, num_jobs and total_elements are only defined when the
    # job-array setup cell has run with job_array=True.
    if job_array!=True:
        raise RuntimeError("recombine=True requires the job-array setup cell to have run with job_array=True.")

    PROCESSING=False
    PROCESSING=True
    
    dir2=dir+'Project_Algorithms/Entrainment/job_out_4/'  # per-job input files
    dir3=dir+'Project_Algorithms/Entrainment/'            # combined output file
    
    if PROCESSING==False:
        out_file=dir3+f'2D_VMF_profiles_{res}_{t_res}_{Np_str}.h5'
    elif PROCESSING==True:
        out_file=dir3+f'2D_VMF_profiles_PREPROCESSING_{res}_{t_res}_{Np_str}.h5'
    
    vars=["profile_array_VMF_g","profile_array_VMF_c"]
    # Size the combined file with the full number of time steps. `data` has been
    # sliced to a single job's window by now, so the default len(data['time'])
    # would make the datasets too small for the a:b writes below.
    initiate_array(out_file,vars,t_chunk_size=50,z_chunk_size=17,t_size=total_elements)
    
    with h5py.File(out_file, 'r+') as f_out:
        # Use the num_jobs from the job-array setup so the windows match the
        # ones the individual jobs were run with.
        for job_id in range(1,num_jobs+1):
            if job_id%5==0: print(f"job_id = {job_id}")
            a,b = get_job_range(job_id,num_jobs)
    
            if PROCESSING==False:
                in_file=dir2+f'2D_VMF_profiles_{res}_{t_res}_{Np_str}_{job_id}.h5'
            elif PROCESSING==True:
                in_file=dir2+f'2D_VMF_profiles_PREPROCESSING_{res}_{t_res}_{Np_str}_{job_id}.h5'
            # Copy this job's rows into their slot of the combined datasets
            with h5py.File(in_file, 'r') as f_in: 
                for var in vars:
                    f_out[var][a:b]=f_in[var][:]