In [None]:
 #Loading in Packages and Data

#Importing Packages
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.ticker as ticker
import matplotlib.cm as cm
from matplotlib.colors import Normalize
from matplotlib.ticker import MaxNLocator
from matplotlib.ticker import ScalarFormatter
import matplotlib.gridspec as gridspec
import xarray as xr
import os; import time
import pickle
import h5py
###############################################################
def coefs(coefficients,degree):
    """Format polynomial coefficients as a human-readable string.

    Terms are emitted from power ``degree`` down to 0, each as
    ``(<coefficient in %.1e>)x^<power>`` and joined by ``" + "``.
    The coefficient for power ``n`` is read from
    ``coefficients[len(coefficients) - (n + 1)]``, i.e. the sequence is
    expected to be ordered highest power first with the constant term last.

    Parameters
    ----------
    coefficients : sequence of numbers
    degree : int
        Highest power to print.

    Returns
    -------
    str
    """
    total = len(coefficients)
    terms = [
        f"({coefficients[total-(power+1)]:.1e})" + f"x^{power}"
        for power in range(degree, -1, -1)
    ]
    return " + ".join(terms)
###############################################################

#Importing Model Data
# Opens the model output (`data`) and the parcel output (`parcel`) and records the
# resolution tags (`res`, `t_res`, `Np_str`) that later cells use to build file names.
check=False
# NOTE(review): `dir` shadows the built-in dir(); later cells read this name, so it is
# kept as-is here. It must end with '/' because paths are built by string concatenation.
dir='/mnt/lustre/koa/koastore/torri_group/air_directory/DCI-Project/'

# dx = 1 km; Np = 1M; Nt = 5 min
data=xr.open_dataset(dir+'../cm1r20.3/run/cm1out_1km_5min.nc') #***
parcel=xr.open_dataset(dir+'../cm1r20.3/run/cm1out_pdata_1km_5min_1e6.nc') #***
# These tags are typed by hand and must be kept in sync with the two file names above.
res='1km';t_res='5min'
Np_str='1e6'

# # dx = 1km; Np = 50M
# #Importing Model Data
# check=False
# dir2='/home/air673/koa_scratch/'
# data=xr.open_dataset(dir2+'cm1out_1km_1min.nc') #***
# parcel=xr.open_dataset(dir2+'cm1out_pdata_1km_1min_50M.nc') #***
# res='1km'; t_res='1min'; Np_str='50e6'

# # dx = 1km; Np = 100M
# #Importing Model Data
# check=False
# dir2='/home/air673/koa_scratch/'
# data=xr.open_dataset(dir2+'cm1out_1km_1min.nc') #***
# parcel=xr.open_dataset(dir2+'cm1out_pdata_1km_1min_100M.nc') #***
# res='1km'; t_res='1min'; Np_str='100e6'


# dx = 250 m
# #Importing Model Data
# check=False
# dir2='/home/air673/koa_scratch/'
# data=xr.open_dataset(dir2+'cm1out_250m.nc') #***
# parcel=xr.open_dataset(dir2+'cm1out_pdata_250m.nc') #***

In [None]:
#INITIALIZE DATA FUNCTION
###############################################################
def initiate_array(out_file, vars, t_chunk_size, p_chunk_size, t_size=None, p_size=None):
    """Create (or overwrite) an HDF5 file holding one empty 2-D dataset per variable.

    The file is opened in ``'w'`` mode, so an existing file at ``out_file`` is
    truncated. Every dataset has shape ``(t_size, p_size)``.

    Parameters
    ----------
    out_file : str
        Path of the HDF5 file to create.
    vars : iterable of str
        Dataset names. ``'Z'``, ``'Y'``, ``'X'`` are stored as ``uint16``;
        ``'A_g'``, ``'A_c'``, ``'A_g_Processed'``, ``'A_c_Processed'`` as bool;
        everything else as ``float32``.
    t_chunk_size, p_chunk_size : int
        Requested chunk shape along the first / second axis. Each is clamped to
        the corresponding dataset size, because h5py refuses a chunk shape that
        is larger than a fixed-size dataset dimension.
    t_size : int, optional
        Length of the first axis. Defaults to ``len(data['time'])``.
    p_size : int, optional
        Length of the second axis. Defaults to ``len(parcel['xh'])`` *as it is
        when this function is called* (i.e. after any slicing of ``parcel``).
    """
    if t_size is None:
        t_size = len(data['time'])  # Number of timesteps
    if p_size is None:
        p_size = len(parcel['xh'])  # Number of parcels

    # Clamp so that small arrays (e.g. a short job slice) do not make h5py raise.
    chunk_shape = (min(t_chunk_size, t_size), min(p_chunk_size, p_size))

    index_vars = ('Z', 'Y', 'X')
    mask_vars = ('A_g', 'A_c', 'A_g_Processed', 'A_c_Processed')

    with h5py.File(out_file, 'w') as f:
        for var_name in vars:
            # Only reachable when `vars` lists the same name twice.
            if var_name in f:
                continue

            if var_name in index_vars:
                dtype = np.uint16
            elif var_name in mask_vars:
                dtype = np.bool_
            else:
                dtype = np.float32

            f.create_dataset(
                var_name,
                shape=(t_size, p_size),
                chunks=chunk_shape,
                dtype=dtype
            )

In [None]:
# Make the project's helper modules importable by adding their folder to sys.path.
import sys
dir2='/mnt/lustre/koa/koastore/torri_group/air_directory/DCI-Project/'
path=dir2+'../Functions/'
sys.path.append(path)

# NOTE(review): the star imports put every public name of both modules into the notebook
# namespace; `check_memory`, used in later cells, is presumably provided by one of them —
# confirm and import the needed names explicitly.
import NumericalFunctions
from NumericalFunctions import * # import NumericalFunctions
import PlottingFunctions
from PlottingFunctions import * # import PlottingFunctions


# # Get all functions in NumericalFunctions
# import inspect
# functions = [f[0] for f in inspect.getmembers(NumericalFunctions, inspect.isfunction)]
# functions

# # Get all functions in NumericalFunctions
# import inspect
# functions = [f[0] for f in inspect.getmembers(PlottingFunctions, inspect.isfunction)]
# functions

In [None]:
#JOB ARRAY SETUP
job_array=True
if job_array:

    num_jobs=60 #how many total jobs are being run? i.e. array=1-100 ==> num_jobs=100 #***
    # NOTE: must be evaluated BEFORE `parcel` is sliced to this job's range; re-running
    # this cell after the slicing cell would shrink total_elements.
    total_elements=len(parcel['xh']) #total number of parcels

    if num_jobs >= total_elements:
        raise ValueError("Number of jobs cannot be greater than or equal to total elements.")

    job_range = total_elements // num_jobs  # Base size for each chunk
    remaining = total_elements % num_jobs   # Number of chunks with 1 extra

    def get_job_range(job_id, num_jobs):
        """Return the half-open parcel index range ``(start, end)`` of one job.

        ``total_elements`` parcels are split into ``num_jobs`` contiguous chunks;
        the first ``total_elements % num_jobs`` chunks get one extra element, so
        the chunks tile ``[0, total_elements)`` exactly.

        Parameters
        ----------
        job_id : int
            1-based job index (the SLURM array task id).
        num_jobs : int
            Total number of jobs the parcels are split across.

        Raises
        ------
        ValueError
            If ``job_id`` is outside ``1..num_jobs``.
        """
        if not 1 <= job_id <= num_jobs:
            raise ValueError(f"job_id must be in 1..{num_jobs}, got {job_id}")

        # Derive the partition from the num_jobs argument itself so the result is
        # always consistent with the value the caller passed in.
        base, extra = divmod(total_elements, num_jobs)
        idx = job_id - 1
        start_job = idx * base + min(idx, extra)
        end_job = start_job + base + (1 if idx < extra else 0)
        return start_job, end_job
    # def job_testing():
    #     #TESTING
    #     start=[];end=[]
    #     for job_id in range(1,num_jobs+1):
    #         start_job, end_job = get_job_range(job_id, num_jobs)
    #         print(start_job,end_job)
    #         start.append(start_job)
    #         end.append(end_job)
    #     print(np.all(np.array(start)!=np.array(end)))
    #     print(len(np.unique(start))==len(start))
    #     print(len(np.unique(end))==len(end))
    # job_testing()

    job_id = int(os.environ.get('SLURM_ARRAY_TASK_ID', 0)) #this is the current SBATCH job id
    # Outside of a SLURM array (interactive use) fall back to job 10 for testing.
    if job_id==0: job_id=10
    start_job, end_job = get_job_range(job_id, num_jobs)
    index_adjust=start_job
    print(f'start_job = {start_job}, end_job = {end_job}')

In [None]:
##################################################
#LOADING IN DATA

In [None]:
#Indexing Array with JobArray
# Restricts `parcel` to this job's parcels. WARNING: this rebinds `parcel` to the slice, so the
# cell is not idempotent — running it twice slices the already-sliced dataset again, and any later
# len(parcel['xh']) returns the job-slice length, not the total number of parcels.
parcel=parcel.isel(xh=slice(start_job,end_job))
#(for 150_000_000 parcels use 500-1000 jobs)

In [None]:
# Reading Back Data Later
##############
def make_data_dict(var_names,read_type):
    """Read the columns ``start_job:end_job`` of several 2-D datasets into memory.

    The file path and the column range are taken from the notebook globals
    ``in_file``, ``start_job`` and ``end_job`` at call time.

    NOTE(review): a function with the same name but the signature
    ``(in_file, var_names, read_type)`` is defined further down the notebook;
    once that cell has run, calls using this two-argument form will fail.

    Parameters
    ----------
    var_names : iterable of str
        Dataset names to read.
    read_type : {'h5py', 'xarray'}
        Backend used to read the file.

    Returns
    -------
    dict
        Maps each name in ``var_names`` to the array read for it.

    Raises
    ------
    ValueError
        If ``read_type`` is not one of the supported backends.
    """
    if read_type=='h5py':
        with h5py.File(in_file, 'r') as f:
            data_dict = {var_name: f[var_name][:,start_job:end_job] for var_name in var_names}

    elif read_type=='xarray':
        # Context manager so the file handle is released once the data is loaded.
        with xr.open_dataset(
            in_file,
            engine='h5netcdf',
            phony_dims='sort',
            chunks={'phony_dim_0': 100, 'phony_dim_1': 1_000_000}
        ) as in_data:
            data_dict = {k: in_data[k][:,start_job:end_job].compute().data for k in var_names}

    else:
        raise ValueError(f"read_type must be 'h5py' or 'xarray', got {read_type!r}")
    return data_dict

# read_type='xarray'
read_type='h5py'

In [None]:
import h5py
# Location of the Lagrangian binary arrays for the current resolution tags
dir2=dir+'Project_Algorithms/Lagrangian_Arrays/'
in_file=dir2+f'lagrangian_binary_array_{res}_{t_res}_{Np_str}.h5'

# Read this job's columns of both arrays
var_names = ['A_g', 'A_c']
data_dict = make_data_dict(var_names,read_type)
A_g = data_dict[var_names[0]]
A_c = data_dict[var_names[1]]

# Time-index matrix: row i is filled with i, broadcast (read-only) to the shape of A_c
Nt=len(data['time'])
T = np.broadcast_to(np.arange(Nt).reshape(-1, 1), A_c.shape)  # shape: (Nt, p)

check_memory(globals())

In [None]:
###########################################################################################################################################################################

In [None]:
def extend_idxs(f,case):
    """Expand every start index into a run of ``case`` consecutive indices.

    For each value ``i`` in ``f`` the output contains ``i, i+1, ..., i+case-1``.
    The result is flattened and sorted in ascending order; an empty ``f``
    yields an empty array.
    """
    run_offsets = np.arange(case)
    # Broadcasting over a trailing axis gives the same table as an outer sum.
    expanded = np.asarray(f)[..., np.newaxis] + run_offsets
    return np.sort(expanded.ravel())

def find_sandwiched_patterns(changes, case):
    """Locate excursions of exactly ``case`` timesteps in a series of changes.

    ``changes`` is a 1-D series of step changes (values -1, 0, +1). Two patterns
    of length ``case + 1`` are searched for:

    * ``[-1, 0, ..., 0, +1]`` — reported in ``turb_d``
    * ``[+1, 0, ..., 0, -1]`` — reported in ``turb_e``

    Matches are taken greedily from left to right; once a match starting at
    index ``i`` is accepted, no later match may start at or before ``i + case``
    (i.e. matches never overlap an accepted one).

    Parameters
    ----------
    changes : array-like, 1-D
        Series of changes, e.g. the output of ``get_changes``.
    case : int
        Number of timesteps between the two opposite changes.

    Returns
    -------
    turb_d, turb_e : ndarray of int
        Sorted indices ``i, ..., i + case - 1`` for every accepted match start
        ``i`` of the respective pattern. Both are empty when nothing matches,
        when ``case < 1``, or when ``changes`` is shorter than ``case + 1``.
    """
    arr = np.asarray(changes)
    n_windows = arr.size - case  # number of length-(case+1) windows

    # Nothing can match: degenerate case or series shorter than one window.
    if case < 1 or n_windows < 1:
        return np.array([], dtype=int), np.array([], dtype=int)

    # The interior zeros count is case - 1
    pattern1 = np.array([-1] + [0]*(case - 1) + [1])
    pattern2 = np.array([1] + [0]*(case - 1) + [-1])

    def _window_matches(pattern):
        # True at i when arr[i:i+case+1] equals `pattern`; built from shifted
        # slices so no (n_windows x window) table has to be materialised.
        hit = np.ones(n_windows, dtype=bool)
        for offset, value in enumerate(pattern):
            hit &= (arr[offset:offset + n_windows] == value)
        return hit

    is_d = _window_matches(pattern1)
    is_e = _window_matches(pattern2)
    candidates = np.flatnonzero(is_d | is_e)
    if candidates.size == 0:
        return np.array([], dtype=int), np.array([], dtype=int)

    # Greedy left-to-right selection. The scan is bounded by the number of
    # candidates, so no iteration cap is needed.
    turb_d=[]
    turb_e=[]
    blocked_until = -1  # last window index overlapped by the previous match
    for start in candidates:
        if start <= blocked_until:
            continue
        if is_d[start]:
            turb_d.append(start)
        else:
            turb_e.append(start)
        blocked_until = start + case

    turb_d=np.array(turb_d,dtype=int); turb_e=np.array(turb_e,dtype=int)

    #EXTEND REST OF INDEXES TO PROCESS
    turb_d=extend_idxs(turb_d,case=case)
    turb_e=extend_idxs(turb_e,case=case)
    return turb_d,turb_e

In [None]:
# TESTING
# changes = np.array([0,0,0,-1,1,0,0,-1,0,0,0,1,-1,0,0])
# [a,b] = find_sandwiched_patterns(changes, case=1) #<=1 in a row timesteps are removed
# print("Case matches at indices:", a,b)

# changes = np.array([0,0,0,-1,0,1,0,0,-1,0,0,1,0,-1,0,0])
# [a,b] = find_sandwiched_patterns(changes, case=2) #<=2 in a row timesteps are removed
# print("Case matches at indices:", a,b)

# changes = np.array([0,0,0,-1,0,0,1,0,0,0,0,1,0,0,-1,0,0])
# [a,b] = find_sandwiched_patterns(changes, case=3) #<=3 in a row timesteps are removed
# print("Case matches at indices:", a,b)

# changes = np.array([0,0,0,-1,0,0,0,1,0,0,0,0,1,0,0,-1,0,0])
# [a,b] = find_sandwiched_patterns(changes, case=4) #<=4 in a row timesteps are removed
# print("Case matches at indices:", a,b)

# changes = np.array([0,0,0,-1,0,0,0,0,1,0,0,0,0,1,0,0,-1,0,0])
# [a,b] = find_sandwiched_patterns(changes, case=5) #<=5 in a row timesteps are removed
# print("Case matches at indices:", a,b)

In [None]:
###### Threshold in minutes (amount of time inside/outside of cloud to count as entrainment/detrainment).
###### PreProcessing multiplies it by the number of timesteps per minute to get `case`,
###### the longest excursion (in timesteps) that it removes.
mins_thresh=5 #5 mins
######

def get_changes(B):
    """Return the step-to-step change of a 1-D series, same length as ``B``.

    ``B[0]`` is prepended before differencing, so element 0 of the result is
    always 0 and element ``i`` is ``B[i] - B[i-1]``.

    Boolean and unsigned inputs are converted to a signed integer type first:
    ``np.diff`` uses XOR for booleans and wraps around for unsigned integers,
    so without the conversion a 1 -> 0 transition would never show up as -1.
    Signed-integer and float inputs are differenced unchanged.
    """
    padded = np.concatenate(([B[0]], B))
    if padded.dtype.kind in 'bu':
        padded = padded.astype(np.int64)
    return np.diff(padded)
def PreProcessing(A,p): #,updraft_type
    """Remove short in/out excursions from one column of a binary array.

    Column ``p`` of ``A`` is copied, then for ``case_ind = case, case-1, ..., 1``
    every excursion of exactly ``case_ind`` timesteps found by
    ``find_sandwiched_patterns`` is flipped: gaps between two 1-stretches are
    filled with 1, and short 1-stretches between zeros are set to 0. The changes
    are recomputed after every pass so shorter passes see the corrected series.

    ``case`` is ``int(timesteps_per_minute * mins_thresh)``, with the timestep
    taken from ``data['time'][1] - data['time'][0]``.
    NOTE(review): the ``/1e9/60`` conversion assumes that difference is expressed
    in nanoseconds — confirm against the dataset's time encoding.

    Parameters
    ----------
    A : ndarray, shape (Nt, Np)
        Binary array, one column per parcel. It is NOT modified.
    p : int
        Column (parcel) index.

    Returns
    -------
    ndarray, shape (Nt,)
        The corrected copy of ``A[:, p]``.
    """
    # Copy: A[:, p] is a view, so writing into it would silently alter the caller's array.
    B = np.array(A[:,p])

    # Find the changes in the array
    changes=get_changes(B)

    #Determining the Case Number
    t_per_mins=1/((data['time'][1]-data['time'][0])/1e9/60).item() #timesteps per minute (<=1)
    case=int(t_per_mins*mins_thresh)

    # Longest excursions first, down to single-timestep ones. For case < 1 the
    # loop body never runs and the column is returned unchanged.
    for case_ind in range(case,0,-1):
        #Calling Algorithm and Correcting Parcel Data
        turb_d,turb_e=find_sandwiched_patterns(changes, case=case_ind)
        B[turb_d]=1
        B[turb_e]=0
        changes=get_changes(B)
    return B

In [None]:
# #EXAMPLE TESTING #CASE COUNTDOWN
# B=np.array([1,1,1,1,0,0,0,1,1,1,1,0,0,0,1,0,0,1,1,1,0,0,0,1])
#                     #,#,#,        #,#,#       #,#,#
# print(B)

# #APPLYING
# changes=get_changes(B)
# [turb_d,turb_e]=find_sandwiched_patterns(changes,case=3)
# print(turb_d,turb_e)

# B[turb_d]=1
# B[turb_e]=0
# print(B)

# #APPLYING
# changes=get_changes(B)
# [turb_d,turb_e]=find_sandwiched_patterns(changes,case=2)
# print(turb_d,turb_e)

# B[turb_d]=1
# B[turb_e]=0
# print(B)

# #APPLYING
# changes=get_changes(B)
# [turb_d,turb_e]=find_sandwiched_patterns(changes,case=1)
# print(turb_d,turb_e)

# B[turb_d]=1
# B[turb_e]=0
# print(B)

In [None]:
# # TESTING
# # Case 1
# case=1
# B=np.array([1,0,1,1,0,0,1,0])
# B=np.array([1,0,1,1,0,1,0]) 
# B=np.array([1,0,1,1,0,1,0,1])

# B=np.array([1,0,1,0,1,1,1])
# B=np.array([1,0,1,0,1,0,0])
# B=np.array([0,1,0,1,0,0,0]) 
# B=np.array([0,1,0,1,0,1,1])

# # Case 2
# case=2
# B=np.array([1,1,1,0,0,1,1,1,0,0,0,1,1,0,0,0])
# B=np.array([1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0]) #101 should still get removed

# # Case 3
# case=3
# B=np.array([1,1,1,1,0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,1])
# print(f'output =  {B}\n')
# p=1234; out=PreProcessing(p,updraft_type='cloudy') #TESTING
# print(f'output =  {out}\n')

# # Case 5
# case=5
# B=np.array([1,1,1,1,0,0,0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,1])
# print(f'output =  {B}\n')
# p=1234; out=PreProcessing(p,updraft_type='cloudy') #TESTING
# print(f'output =  {out}\n')

# # REAL CASE
# count_per_row = (A_c >= 1).sum(axis=0)
# where=np.where(count_per_row > 10)[0] # Find row indices where count is greater than 10
# # print(where)
# ind=12345; p=where[ind]; print(p) 

# print(A_c[:,p])
# out=PreProcessing(p,updraft_type='cloudy') #TESTING
# print(f'output =  {out}\n')

In [None]:
#OLD NONSIMULTANEOUS VERSION
# #RUNNING
# A_g_Processed=A_g.copy()
# A_c_Processed=A_c.copy()
# print('processing parcels for general updrafts')
# Np=len(parcel['xh'])
# for p in np.arange(Np):
#     # if p==1000:break #TESTING
#     if np.mod(p,1e3)==0: print(f"{p}/{len(parcel['xh'])}")
#     out=PreProcessing(p,updraft_type='general'); A_g_Processed[:,p]=out
    
# print('processing parcels for cloudy updrafts')
# for p in np.arange(Np):
#     # if p==1000:break #TESTING
#     if np.mod(p,1e3)==0: print(f"{p}/{len(parcel['xh'])}")
#     out=PreProcessing(p,updraft_type='cloudy'); A_c_Processed[:,p]=out

In [None]:
#SIMULTANEOUS VERSION
#RUNNING
# Process every parcel column of both binary arrays in a single pass.
if A_g.shape != A_c.shape:
    raise ValueError(f"A_g {A_g.shape} and A_c {A_c.shape} must have the same shape")
A_g_Processed=A_g.copy()
A_c_Processed=A_c.copy()
print('processing parcels for both general and cloudy updrafts')
# Take the parcel count from the arrays being processed, so the loop always covers
# exactly their columns regardless of the current state of `parcel`.
Np=A_g.shape[1]
for p in np.arange(Np):
    # if p==1000:break #TESTING
    if p % 1000 == 0: print(f"{p}/{Np}")
    out1=PreProcessing(A_g,p); A_g_Processed[:,p]=out1
    out2=PreProcessing(A_c,p); A_c_Processed[:,p]=out2

In [None]:
#SAVING
mins_thresh=5 #5 minutes

# One output file per job, tagged with the job id
dir2=dir+'Project_Algorithms/Entrainment/job_out_3/'
out_file=dir2+f'processed_binary_arrays_{res}_{t_res}_{Np_str}_{job_id}.h5'

vars=['A_g_Processed','A_c_Processed']
# Size the datasets from the arrays actually being written instead of relying on the
# current state of the `data` / `parcel` globals.
initiate_array(out_file,vars,t_chunk_size=50,p_chunk_size=1000,
               t_size=A_g_Processed.shape[0],p_size=A_g_Processed.shape[1])

with h5py.File(out_file, 'a') as f:
    f['A_g_Processed'][:]=A_g_Processed
    f['A_c_Processed'][:]=A_c_Processed

In [None]:
#COMBINING JOB_ARRAYS AFTER RUNNING
########################################################################
# NOTE(review): the second assignment overrides the first, so recombination is currently ENABLED.
# If this notebook is run as a job-array task in this state, every task will execute the recombine
# cell below; comment out the `recombine=True` line while the job array is running.
recombine=False #KEEP FALSE WHEN JOB ARRAY IS RUNNING
recombine=True 

In [None]:
if recombine==True:
    # Stitch the per-job files back into one file covering all parcels.
    dir2=dir+'Project_Algorithms/Entrainment/job_out_3/'
    dir3=dir+'Project_Algorithms/Entrainment/'
    out_file=dir3+f'processed_binary_arrays_{res}_{t_res}_{Np_str}.h5'

    vars=['A_g_Processed','A_c_Processed']
    # The combined file must span ALL parcels. `parcel` has been sliced to a single job's
    # range by now, so the size is passed explicitly from total_elements (job setup cell).
    initiate_array(out_file,vars,t_chunk_size=50,
                   p_chunk_size=min(100_000,total_elements),p_size=total_elements)

    with h5py.File(out_file, 'r+') as f_out:

        # Reuses `num_jobs` from the job setup cell so the partition matches the one the
        # jobs were run with. A separate loop variable keeps the global `job_id` intact.
        for combine_id in range(1,num_jobs+1):
            if combine_id % 5 == 0: print(f"job_id = {combine_id}")
            [a,b] = get_job_range(combine_id,num_jobs)

            in_file=dir2+f'processed_binary_arrays_{res}_{t_res}_{Np_str}_{combine_id}.h5'

            with h5py.File(in_file, 'r') as f_in:
                for var in vars:
                    f_out[var][:,a:b]=f_in[var][:]

In [None]:
#READING BACK IN AND TESTING
############################

In [None]:
# Overrides the job's column range: the read-back cells below load only columns 0..9999
# (make_data_dict reads `start_job:end_job` from these globals).
start_job=0;end_job=10000

In [None]:
# Reading Back Data Later
##############
def make_data_dict(in_file,var_names,read_type):
    """Read the columns ``start_job:end_job`` of several 2-D datasets into memory.

    The column range is taken from the notebook globals ``start_job`` and
    ``end_job`` at call time.

    NOTE(review): this redefines the earlier two-argument ``make_data_dict``
    (which read ``in_file`` from a global); after this cell runs, the earlier
    call form no longer works.

    Parameters
    ----------
    in_file : str
        Path of the HDF5 file to read.
    var_names : iterable of str
        Dataset names to read.
    read_type : {'h5py', 'xarray'}
        Backend used to read the file.

    Returns
    -------
    dict
        Maps each name in ``var_names`` to the array read for it.

    Raises
    ------
    ValueError
        If ``read_type`` is not one of the supported backends.
    """
    if read_type=='h5py':
        with h5py.File(in_file, 'r') as f:
            data_dict = {var_name: f[var_name][:,start_job:end_job] for var_name in var_names}

    elif read_type=='xarray':
        # Context manager so the file handle is released once the data is loaded.
        with xr.open_dataset(
            in_file,
            engine='h5netcdf',
            phony_dims='sort',
            chunks={'phony_dim_0': 100, 'phony_dim_1': 1_000_000}
        ) as in_data:
            data_dict = {k: in_data[k][:,start_job:end_job].compute().data for k in var_names}

    else:
        raise ValueError(f"read_type must be 'h5py' or 'xarray', got {read_type!r}")
    return data_dict

# read_type='xarray'
read_type='h5py'

In [None]:
import h5py
# Original (unprocessed) binary arrays
dir2=dir+'Project_Algorithms/Lagrangian_Arrays/'
in_file=dir2+f'lagrangian_binary_array_{res}_{t_res}_{Np_str}.h5'

var_names = ['A_g', 'A_c']
data_dict = make_data_dict(in_file,var_names,read_type)
A_g = data_dict[var_names[0]]
A_c = data_dict[var_names[1]]

check_memory(globals())

In [None]:
#READING BACK IN
# Combined, processed binary arrays written by the recombine step
dir2=dir+'Project_Algorithms/Entrainment/'
in_file=dir2+f'processed_binary_arrays_{res}_{t_res}_{Np_str}.h5'

var_names = ['A_g_Processed', 'A_c_Processed']
data_dict = make_data_dict(in_file,var_names,read_type)
A_g_Processed = data_dict[var_names[0]]
A_c_Processed = data_dict[var_names[1]]
check_memory(globals())

In [None]:
ind=290 #time index
# Parcels whose cloudy flag at this timestep was changed by the processing
changed_at_ind = A_c[ind]!=A_c_Processed[ind]
where=np.where(changed_at_ind)
where #which parcels have differences

In [None]:
# Each run of this cell steps to the next differing parcel; on a fresh kernel
# (ind2 not defined yet) it starts at the first one instead of raising NameError.
try:
    ind2+=1
except NameError:
    ind2=0 #choose one of those parcels
where2=where[0][ind2]
def format_array(arr):
    """Render a binary array as text: ``_`` for values equal to 0, ``1`` otherwise.

    The input is promoted to at least 2-D; each row becomes one line of the
    returned string, lines being separated by a newline.
    """
    rows = np.atleast_2d(arr)  # Ensures it's 2D
    rendered = []
    for row in rows:
        symbols = []
        for val in row:
            if val == 0:
                symbols.append('_')
            else:
                symbols.append('1')
        rendered.append(''.join(symbols))
    return '\n'.join(rendered)

# Show the selected parcel's time series before and after processing, one under the other
original_series = format_array(A_c[:, where2])
processed_series = format_array(A_c_Processed[:, where2])

print("A_c:")
print(original_series)

print("A_c_Processed:")
print(processed_series)


In [None]:
# #PLUGGING INTO ALGORITHM TO SEE WHERE THINGS WENT WRONG
# B=A_c[:,where2]
# print(B)
# PreProcessing(p=where2,updraft_type='cloudy')

In [None]:
# def extend_idxs(f,case):
#     out=np.sort(np.add.outer(f, np.arange(case)).ravel())

#     # #OLD METHOD (SLOW)
#     # if np.any(f)==True:
#     #     out=np.sort(np.concatenate([np.arange(idx, idx + case-1+1) for idx in f]))
#     # else: 
#     #     out=f
#     return out

# def find_sandwiched_patterns(changes, case):
#     arr=changes
    
#     window_size = case + 1  # e.g., for case=2, window_size = 3
#     # The interior zeros count is (window_size - 2) which is case - 1
#     pattern1 = np.array([-1] + [0]*(case - 1) + [1])
#     pattern2 = np.array([1] + [0]*(case - 1) + [-1])
#     # print(pattern1,pattern2)
    
#     # Manually construct sliding windows
#     windows = np.array([arr[i:i + window_size] for i in range(len(arr) - window_size + 1)])
#     # print("Sliding windows:\n", windows) #TESTING
    
#     #THE ALGORITHM
#     turb_d=[]
#     turb_e=[]
#     count=0;max_iter=len(data['time']);
#     while np.any(((windows == pattern1) | (windows == pattern2)).all(axis=1)):
#         count+=1; 
#         if count>=max_iter: 
#             print(count)
#             break
        
#         next_ind = np.where(((windows == pattern1) | (windows == pattern2)).all(axis=1))[0][0]
        
#         if (windows[next_ind] == pattern1).all():
#             turb_d.append(next_ind)
#         elif (windows[next_ind] == pattern2).all(): 
#             turb_e.append(next_ind) #append to list
    
#         windows[0:next_ind+(case)+1,:] = 0 #removes from windows
    
#     turb_d=np.array(turb_d,dtype=int); turb_e=np.array(turb_e,dtype=int)

#     #EXTEND REST OF INDEXES TO PROCESS
#     turb_d=extend_idxs(turb_d,case=case)
#     turb_e=extend_idxs(turb_e,case=case)
#     return turb_d,turb_e

In [None]:
# ###### (amount of time inside/outside of cloud to count as entrainment/detrainment)
# mins_thresh=5 #5 mins
# ######

# def get_changes(B):
#     changes = np.diff(np.concatenate(([B[0]], B)))  # Add 0s to detect edges
#     return changes
# def PreProcessing(p,updraft_type):

#     if updraft_type=='general':
#         A=A_g.copy()
#     elif updraft_type=='cloudy':
#         A=A_c.copy()
#     # B = A[:,p] #UNCOMMENT WHEN NOT TESTING***

#     # Find the changes in the array
#     changes=get_changes(B)
#     # print(f'B = {B}'); print(f'changes = {changes}') 

#     #Determining the Case Number
#     t_per_mins=1/((data['time'][1]-data['time'][0])/1e9/60).item() #timesteps per minute (<=1)
#     case=int(t_per_mins*mins_thresh) #UNCOMMENT WHEN NOT TESTING***
    
#     if case>1:
#         for case_ind in np.arange(case,0,-1): 
#         # for case_ind in [case]:
#             #Calling Algorithm and Correcting Parcel Data
#             [turb_d,turb_e]=find_sandwiched_patterns(changes, case=case_ind)
#             B[turb_d]=1
#             B[turb_e]=0     
#             changes=get_changes(B)
#             # print(B)
#     elif case==1:
#         #Calling Algorithm and Correcting Parcel Data
#         [turb_d,turb_e]=find_sandwiched_patterns(changes, case=case)
#         B[turb_d]=1
#         B[turb_e]=0
#     return B