In [1]:
########################################
#getting system arguments
import sys
def GetArg_dataName(default="Variables"):
    """
    Resolve which dataset name this run should process.

    Command-line usage (the first positional argument is the dataName):
        Run One:   python Tracked_Profiles.py Variables
        Run Two:   python Tracked_Profiles.py Entrainment
        Run Three: python Tracked_Profiles.py PROCESSED_Entrainment
        Run Four:  python Tracked_Profiles.py W_Budgets
        Run Five:  python Tracked_Profiles.py QV_Budgets
        Run Six:   python Tracked_Profiles.py TH_Budgets

    Parameters
    ----------
    default : str
        Name returned when no usable command-line argument is present.

    Returns
    -------
    str
        `default` when any element of sys.argv mentions "ipykernel_launcher"
        or when no argument was supplied; otherwise sys.argv[1].
    """
    argv = sys.argv

    # Inside Jupyter the kernel is started through ipykernel_launcher, so argv
    # carries kernel options rather than user arguments.
    launched_by_kernel = False
    for arg in argv:
        if "ipykernel_launcher" in arg:
            launched_by_kernel = True
            break
    if launched_by_kernel:
        print(f"Using default dataName: {default}")
        return default

    # Nothing after the script name: silently fall back to the default.
    if len(argv) <= 1:
        return default

    out = argv[1]
    print(f"Using argument dataName: {out}")
    return out

# Dataset name for this run; later passed to GetVarNames to pick the variable list.
dataName = GetArg_dataName()

Using default dataName: Variables


In [2]:
####################################
#ENVIRONMENT SETUP

In [3]:
#Importing Libraries
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.ticker as ticker
import matplotlib.cm as cm
from matplotlib.colors import Normalize
from matplotlib.ticker import MaxNLocator
from matplotlib.ticker import ScalarFormatter
import matplotlib.gridspec as gridspec
import xarray as xr

import sys; import os; import time; from datetime import timedelta
import pickle
import h5py

In [4]:
#MAIN DIRECTORIES
def GetDirectories():
    """
    Return the directory paths used throughout the notebook.

    Returns
    -------
    tuple of str
        (mainDirectory, mainCodeDirectory, scratchDirectory, codeDirectory),
        where mainCodeDirectory is "Code/CodeFiles/" joined onto mainDirectory
        and codeDirectory is the current working directory.
    """
    project_root = '/mnt/lustre/koa/koastore/torri_group/air_directory/Projects/DCI-Project/'
    return (
        project_root,
        os.path.join(project_root, "Code/CodeFiles/"),
        '/mnt/lustre/koa/scratch/air673/',
        os.getcwd(),
    )

# Unpack the four paths into module-level names used by the cells below.
[mainDirectory,mainCodeDirectory,scratchDirectory,codeDirectory] = GetDirectories()

In [5]:
#IMPORT CLASSES
sys.path.append(os.path.join(mainCodeDirectory,"2_Variable_Calculation"))
from CLASSES_Variable_Calculation import ModelData_Class, SlurmJobArray_Class, DataManager_Class

In [6]:
#IMPORT FUNCTIONS
sys.path.append(os.path.join(mainCodeDirectory,"2_Variable_Calculation"))
import FUNCTIONS_Variable_Calculation
from FUNCTIONS_Variable_Calculation import *

In [7]:
#data loading class
# Model metadata for simulation 1; its attributes (res, t_res, Nz_str, Np_str,
# Ntime, timeStrings, zh) are read throughout the rest of the notebook.
ModelData = ModelData_Class(mainDirectory, scratchDirectory, simulationNumber=1)
#data manager class
# This manager is configured for the "Tracking_Algorithms" data type and is the
# one passed to CallLagrangianArray and to the tracked-parcel loader below.
DataManager = DataManager_Class(mainDirectory, scratchDirectory, ModelData.res, ModelData.t_res, ModelData.Nz_str,
                                ModelData.Np_str, dataType="Tracking_Algorithms", dataName="Lagrangian_UpdraftTracking",
                                dtype='float32',codeSection = "Project_Algorithms")

=== CM1 Data Summary ===
 Simulation #:   1
 Resolution:     1km
 Time step:      5min
 Vertical levels:34
 Parcels:        1e6
 Data file:      /mnt/lustre/koa/koastore/torri_group/air_directory/Projects/DCI-Project/Model/cm1r20.3/run/cm1out_1km_5min_34nz.nc
 Parcel file:    /mnt/lustre/koa/koastore/torri_group/air_directory/Projects/DCI-Project/Model/cm1r20.3/run/cm1out_pdata_1km_5min_1e6np.nc
 Time steps:     133

=== DataManager Summary ===
 inputDirectory #:   /mnt/lustre/koa/koastore/torri_group/air_directory/Projects/DCI-Project/Code/OUTPUT/Variable_Calculation/TimeSplitModelData
 outputDirectory #:   /mnt/lustre/koa/koastore/torri_group/air_directory/Projects/DCI-Project/Code/OUTPUT/Project_Algorithms/Tracking_Algorithms
 inputDataDirectory #:   /mnt/lustre/koa/koastore/torri_group/air_directory/Projects/DCI-Project/Code/OUTPUT/Variable_Calculation/TimeSplitModelData/1km_5min_34nz/ModelData
 inputParcelDirectory #:   /mnt/lustre/koa/koastore/torri_group/air_directory/Projects/D

In [8]:
#data manager class (for saving data)
# Second manager, configured for "Tracked_Profiles"; it is only used in the
# timestep loop, where it is handed to SaveProfile.
DataManager_TrackedProfiles = DataManager_Class(mainDirectory, scratchDirectory, ModelData.res, ModelData.t_res, ModelData.Nz_str,
                                ModelData.Np_str, dataType="Tracked_Profiles", dataName="Tracked_Profiles",
                                dtype='float32',codeSection = "Project_Algorithms")

=== DataManager Summary ===
 inputDirectory #:   /mnt/lustre/koa/koastore/torri_group/air_directory/Projects/DCI-Project/Code/OUTPUT/Variable_Calculation/TimeSplitModelData
 outputDirectory #:   /mnt/lustre/koa/koastore/torri_group/air_directory/Projects/DCI-Project/Code/OUTPUT/Project_Algorithms/Tracked_Profiles
 inputDataDirectory #:   /mnt/lustre/koa/koastore/torri_group/air_directory/Projects/DCI-Project/Code/OUTPUT/Variable_Calculation/TimeSplitModelData/1km_5min_34nz/ModelData
 inputParcelDirectory #:   /mnt/lustre/koa/koastore/torri_group/air_directory/Projects/DCI-Project/Code/OUTPUT/Variable_Calculation/TimeSplitModelData/1km_5min_34nz/ParcelData
 outputDataDirectory #:   /mnt/lustre/koa/koastore/torri_group/air_directory/Projects/DCI-Project/Code/OUTPUT/Project_Algorithms/Tracked_Profiles/1km_5min_34nz/Tracked_Profiles



In [9]:
#IMPORT CLASSES
sys.path.append(os.path.join(mainCodeDirectory,"3_Project_Algorithms","2_Tracking_Algorithms"))
from CLASSES_TrackingAlgorithms import TrackingAlgorithms_DataLoading_Class, Results_InputOutput_Class, TrackedParcel_Loading_Class

In [12]:
# IMPORT CLASSES
sys.path.append(os.path.join(mainCodeDirectory,"3_Project_Algorithms","3_Tracked_Profiles"))
from CLASSES_TrackedProfiles import TrackedProfiles_DataLoading_CLASS

In [13]:
##############################################
#JOB ARRAY

In [25]:
#JOB ARRAY SETUP
# Forwarded to SlurmJobArray_Class below.
# NOTE(review): presumably False makes a single process handle every timestep — confirm in SlurmJobArray_Class.
UsingJobArray=True

def GetNumJobs(res):
    """
    Return the number of Slurm array jobs to use for a model resolution.

    Parameters
    ----------
    res : str
        Resolution label; '1km' and '250m' are the supported values.

    Returns
    -------
    int
        20 for '1km', 100 for '250m'.

    Raises
    ------
    ValueError
        If `res` is not a supported resolution. Previously this case fell
        through both branches and failed with an UnboundLocalError.
    """
    if res == '1km':
        return 20
    if res == '250m':
        return 100
    raise ValueError(f"Unsupported resolution {res!r}; expected '1km' or '250m'")
# Number of array jobs depends on the model resolution.
num_jobs = GetNumJobs(ModelData.res)
# The ModelData.Ntime timesteps are the elements divided among the jobs.
# NOTE(review): presumably start_job/end_job bound this task's contiguous share — confirm in SlurmJobArray_Class.
SlurmJobArray = SlurmJobArray_Class(total_elements=ModelData.Ntime, num_jobs=num_jobs, UsingJobArray=UsingJobArray)
start_job = SlurmJobArray.start_job; end_job = SlurmJobArray.end_job

def GetNumElements():
    """
    Return the timestep indices handled by this job.

    Reads the module-level ModelData.Ntime, start_job and end_job and returns
    the integers 0..Ntime-1 sliced as [start_job:end_job].
    NOTE(review): end_job is exclusive in this slice — confirm that matches
    the convention used by SlurmJobArray_Class.
    """
    all_timesteps = np.arange(ModelData.Ntime)
    job_slice = slice(start_job, end_job)
    return all_timesteps[job_slice]
# Timestep indices iterated by the main processing loop further down.
num_elements = GetNumElements()

Running timesteps from 0:132 



In [15]:
##############################################
#DATA LOADING FUNCTIONS

In [16]:
def MakeDataDictionary(variableNames,t,printstatement=False):
    """
    Load several variables for one timestep into a dictionary.

    Parameters
    ----------
    variableNames : iterable of str
        Names forwarded one at a time to CallLagrangianArray.
    t : int
        Index into ModelData.timeStrings.
    printstatement : bool
        Forwarded unchanged to CallLagrangianArray.

    Returns
    -------
    dict
        Maps each variable name to whatever CallLagrangianArray returned for
        it at the selected time string.
    """
    timeString = ModelData.timeStrings[t]

    dataDictionary = {}
    for name in variableNames:
        dataDictionary[name] = CallLagrangianArray(
            ModelData, DataManager, timeString,
            variableName=name, printstatement=printstatement,
        )
    return dataDictionary
    
def GetSpatialData(t):
    """
    Load the 'Z' variable for timestep index t via MakeDataDictionary.

    The caller uses the result as the per-parcel index into the vertical axis
    of the profile arrays (see MakeTrackedProfiles).
    """
    spatial_name = 'Z'
    return MakeDataDictionary([spatial_name], t)[spatial_name]

In [17]:
####################################
#RUN SETUP

In [18]:
#data variable list
def GetVarNames(dataName):
    """
    Return the list of variable names that belong to a dataset name.

    Parameters
    ----------
    dataName : str
        One of "Variables", "Entrainment", "PROCESSED_Entrainment",
        "W_Budgets", "QV_Budgets" or "TH_Budgets".

    Returns
    -------
    list of str
        A new list on every call, so callers may slice or mutate it freely.

    Raises
    ------
    ValueError
        If `dataName` is not recognised (for example a typo on the command
        line). Previously this fell through every branch and failed with an
        UnboundLocalError.
    """
    # The PROCESSED_ dataset uses the same names as the raw entrainment
    # dataset, each prefixed with "PROCESSED_".
    entrainment_names = ['Entrainment_g','Entrainment_c',
                         'TransferEntrainment_g',
                         'TransferEntrainment_c',
                         'Detrainment_g','Detrainment_c',
                         'TransferDetrainment_g',
                         'TransferDetrainment_c']

    names_by_dataset = {
        "Variables": ['W', 'QCQI', 'RH_vapor', 'theta_v', 'theta_e', 'MSE', 'HMC','VMF_g','VMF_c'],
        "Entrainment": entrainment_names,
        "PROCESSED_Entrainment": ['PROCESSED_' + name for name in entrainment_names],
        "W_Budgets": ["wb_hadv", "wb_vadv", "wb_hidiff", "wb_vidiff",
                      "wb_hturb", "wb_vturb", "wb_pgrad", "wb_buoy"],
        "QV_Budgets": ["qvb_hadv", "qvb_vadv", "qvb_hidiff", "qvb_vidiff",
                       "qvb_hturb", "qvb_vturb", "qvb_mp"],
        "TH_Budgets": ["ptb_hadv", "ptb_vadv", "ptb_hidiff", "ptb_vidiff",
                       "ptb_hturb", "ptb_vturb", "ptb_mp", "ptb_rad", "ptb_div", "ptb_diss"],
    }

    if dataName not in names_by_dataset:
        valid = ", ".join(names_by_dataset)
        raise ValueError(f"Unknown dataName {dataName!r}; expected one of: {valid}")

    varNames = list(names_by_dataset[dataName])
    return varNames

In [19]:
########################################
#RUNNING FUNCTIONS

In [26]:
#Functions for Initializing Profile Arrays
def CopyStructure(dictionary, placeholder=None):
    """
    Mirror the nesting of a dictionary, swapping every non-dict leaf for `placeholder`.

    Parameters
    ----------
    dictionary : dict or any
        Nested dictionary to mirror; a non-dict input is treated as a leaf.
    placeholder : any
        Value stored at every leaf position (the same object is reused).

    Returns
    -------
    dict or any
        New dictionaries with the same keys at every level, or `placeholder`
        itself when the input is not a dict.
    """
    # Anything that is not a dict is a leaf.
    if not isinstance(dictionary, dict):
        return placeholder

    mirrored = {}
    for key, value in dictionary.items():
        mirrored[key] = CopyStructure(value, placeholder)
    return mirrored

def InitializeProfileArrays(trackedArrays, varNames, zhs=None):
    """
    Build empty profile accumulators with the same two-level layout as trackedArrays.

    For every category -> depth_type pair in `trackedArrays`, and every name
    in `varNames`, two independent arrays are created:
        - 'profile_array'
        - 'profile_array_squares'
    Each has shape (len(zhs), 3): columns 0 and 1 start at zero (they are the
    accumulation targets in MakeTrackedProfiles) and column 2 holds `zhs`.

    Parameters
    ----------
    trackedArrays : dict
        Two-level dictionary (e.g. 'CL' -> 'ALL'/'SHALLOW'/'DEEP'); only its
        keys are used, the leaf values are ignored.
    varNames : iterable of str
        Variable names to create accumulators for.
    zhs : sequence of float, optional
        Values written to column 2. Defaults to ModelData.zh, looked up when
        the function is called rather than when it is defined, so a rebuilt
        ModelData is always honoured.

    Returns
    -------
    dict
        profileArraysDictionary[category][depth_type][varName][arrayName].
    """
    if zhs is None:
        zhs = ModelData.zh

    # One template, copied for every entry so no two entries share memory.
    template = np.zeros((len(zhs), 3))
    template[:, 2] = zhs

    profileArraysDictionary = {}

    for category, depth_dict in trackedArrays.items():  # e.g. 'CL', 'SBF'
        profileArraysDictionary[category] = {}

        for depth_type in depth_dict.keys():  # e.g. 'ALL', 'SHALLOW', 'DEEP'
            profileArraysDictionary[category][depth_type] = {
                varName: {
                    "profile_array": template.copy(),
                    "profile_array_squares": template.copy(),
                }
                for varName in varNames
            }
    return profileArraysDictionary

In [27]:
def GetParcelNumbers(trackedArray, t):
    """
    Select the rows of trackedArray whose time window contains t.

    Column 0 supplies the returned parcel indices and column 1 the window
    start. The window end is column 2 + column 3, capped at ModelData.Ntime.
    Both ends of the window are inclusive. The selection is done with array
    operations only, with no per-row Python loop.

    Returns
    -------
    (selectedPs, selectedRows)
        Column-0 values of the matching rows, and the indices of those rows.
    """
    window_start = trackedArray[:, 1]
    window_end = np.minimum(trackedArray[:, 2] + trackedArray[:, 3], ModelData.Ntime)

    # True where window_start <= t <= window_end.
    is_active = (window_start <= t) & (window_end >= t)

    selectedRows = np.flatnonzero(is_active)
    selectedPs = trackedArray[selectedRows, 0]

    return selectedPs, selectedRows

In [28]:
def MakeTrackedProfiles(trackedArrays,profileArraysDictionary,varNames,VARs,Z,t):
    """
    Accumulate variable data for the parcels active at time t into the profile arrays.

    For every category/depth entry of `trackedArrays`, the parcels selected by
    GetParcelNumbers are binned by their Z value and, per variable:
        - 'profile_array'[:, 0]         += value
        - 'profile_array_squares'[:, 0] += value**2
        - column 1 of both arrays       += 1 (sample count)

    The squares array previously received the un-squared values, making it an
    exact duplicate of 'profile_array'; it now accumulates the sum of squares.

    Parameters
    ----------
    trackedArrays : dict
        category -> depth_type -> tracked-parcel array (see GetParcelNumbers).
    profileArraysDictionary : dict
        Accumulators from InitializeProfileArrays; modified in place.
    varNames : iterable of str
        Variables to accumulate; each must be a key of `VARs`.
    VARs : dict
        Variable name -> per-parcel data array for timestep t.
    Z : array
        Per-parcel values used as row indices into the profile arrays.
    t : int
        Timestep index.

    Returns
    -------
    dict
        The same `profileArraysDictionary` object, after accumulation.
    """
    #CALCULATING
    for key1, subdict in trackedArrays.items():         # e.g. 'CL', 'SBF'
        print("\t",f'working on {key1}')
        for key2, trackedArray in subdict.items():           # e.g. 'ALL', 'DEEP'
            print("\t\t",f'working on {key2}')
            #loading the profile array to fill
            profileArray = profileArraysDictionary[key1][key2]

            #getting parcels in trackedArray to run through
            selectedPs, _ = GetParcelNumbers(trackedArray, t) #get parcels that are counted at time t

            #getting Z data
            zLevels = Z[selectedPs]
            for varName in varNames:
                #getting data
                results = VARs[varName][selectedPs]
                # Square in float64 so float32 inputs do not lose precision.
                squaredResults = np.square(np.asarray(results, dtype=np.float64))

                sums = profileArray[varName]["profile_array"]
                sumsOfSquares = profileArray[varName]["profile_array_squares"]

                # np.add.at accumulates correctly when several parcels share a level;
                # plain fancy-index "+=" would count each level only once.
                np.add.at(sums[:,0], zLevels, results)
                np.add.at(sums[:,1], zLevels, 1)
                np.add.at(sumsOfSquares[:,0], zLevels, squaredResults)
                np.add.at(sumsOfSquares[:,1], zLevels, 1)
    return profileArraysDictionary

In [29]:
########################################
#RUNNING

In [30]:
#Loading in Tracked Parcels Info
# trackedArrays is consumed below as a two-level dict (category -> depth type -> array).
# LevelsDictionary is not used again in the visible cells.
trackedArrays,LevelsDictionary = TrackedParcel_Loading_Class.LoadingSubsetParcelData(ModelData,DataManager,
                                                         Results_InputOutput_Class)
# Get Variable Names
varNames = GetVarNames(dataName)
# Temporary restriction: only the first variable of the selected dataset is processed.
varNames = varNames[0:1] #*#* need to get all variables working

CL: ALL=12239, SHALLOW=8532, DEEP=1349
nonCL: ALL=10043, SHALLOW=7684, DEEP=1113
SBF: ALL=1905, SHALLOW=961, DEEP=489
ColdPool: ALL=10334, SHALLOW=7571, DEEP=860
Mean Cloudbase is: 1.45 km

Min Cloudbase is: 1.25 km

Mean LFC is: 1.93 km

Mean LCL is: 1.79 km

Min LFC is: 1.45 km

Min LCL is: 1.37 km



In [None]:
# Main loop: build and save one set of tracked profiles per timestep of this job.
for t in num_elements:
    # The entrainment datasets skip the final model timestep.
    # NOTE(review): presumably no entrainment data exists for the last step — confirm.
    if dataName in ['Entrainment','PROCESSED_Entrainment'] and t == ModelData.Ntime-1:
        continue
    print("#" * 40,"\n",f"Processing timestep {t}/{num_elements[-1]}")
    # timeString is not used again in this loop; MakeDataDictionary looks it up itself.
    timeString = ModelData.timeStrings[t]

    #Forming Dictionary for Profile Arrays for current timestep
    # Fresh zeroed accumulators each iteration, so every saved file holds one timestep only.
    trackedProfileArrays = CopyStructure(trackedArrays)
    profileArraysDictionary = InitializeProfileArrays(trackedProfileArrays,varNames)
    
    #getting variable data
    VARs = MakeDataDictionary(varNames, t)
    Z = GetSpatialData(t)

    #making tracked profiles
    profileArraysDictionary = MakeTrackedProfiles(trackedArrays,profileArraysDictionary,varNames,VARs,Z,t)

    #saving tracked profiles for current timestep
    TrackedProfiles_DataLoading_CLASS.SaveProfile(ModelData,DataManager_TrackedProfiles, profileArraysDictionary, t)

In [None]:
#########################################
#RECOMBINE SEPARATE JOB_ARRAYS AFTER
# Gates the Recombine(...) calls at the end of the notebook.
recombine=False #KEEP FALSE WHEN JOBARRAY IS RUNNING
# recombine=True

In [None]:
#*#* need to instead combine timestep by timestep

In [None]:
def GetInputFile(data_type,job_id,dir2,PROCESSING):
    """
    Build the path of one per-job tracked-profiles HDF5 file.

    Parameters
    ----------
    data_type : str
        Prefix of the file name.
    job_id : int
        Job-array index appended to the file name.
    dir2 : str
        Directory prefix; concatenated as-is, so it must end with a separator.
    PROCESSING : bool
        True selects the plain file name, False the "_ORIGINAL" variant.

    Returns
    -------
    str
        dir2 + "<data_type>_tracked_profiles_<res>_<t_res>_<Np_str>[_ORIGINAL]_<job_id>.h5"

    Raises
    ------
    ValueError
        If PROCESSING is neither True nor False (previously an
        UnboundLocalError on the return statement).

    NOTE(review): `res`, `t_res` and `Np_str` are read as module-level names
    that no visible cell defines — presumably ModelData.res, ModelData.t_res
    and ModelData.Np_str; confirm before re-enabling recombination.
    """
    # Equality (not truthiness) is kept so the accepted inputs are unchanged.
    if PROCESSING == True:
        suffix = ""
    elif PROCESSING == False:
        suffix = "_ORIGINAL"
    else:
        raise ValueError(f"PROCESSING must be True or False, got {PROCESSING!r}")

    input_file = dir2 + f"{data_type}_tracked_profiles_{res}_{t_res}_{Np_str}{suffix}_{job_id}.h5"
    return input_file
def GetOutputFile(data_type,type1,type2,dir3,PROCESSING):
    """
    Build the path of the recombined tracked-profiles HDF5 file.

    Parameters
    ----------
    data_type : str
        Prefix of the file name.
    type1 : str
        First category label (always part of the name).
    type2 : str or None
        Optional second category label; omitted from the name when None.
    dir3 : str
        Directory prefix; concatenated as-is, so it must end with a separator.
    PROCESSING : bool
        True selects the plain file name, False the "_ORIGINAL" variant.

    Returns
    -------
    str
        dir3 + "<data_type>_<type1>[_<type2>]_tracked_profiles_<res>_<t_res>_<Np_str>[_ORIGINAL].h5"

    Raises
    ------
    ValueError
        If PROCESSING is neither True nor False (previously an
        UnboundLocalError on the return statement).

    NOTE(review): `res`, `t_res` and `Np_str` are read as module-level names
    that no visible cell defines — presumably ModelData.res, ModelData.t_res
    and ModelData.Np_str; confirm before re-enabling recombination.
    """
    # Equality (not truthiness) is kept so the accepted inputs are unchanged.
    if PROCESSING == True:
        suffix = ""
    elif PROCESSING == False:
        suffix = "_ORIGINAL"
    else:
        raise ValueError(f"PROCESSING must be True or False, got {PROCESSING!r}")

    label = f"{type1}" if type2 is None else f"{type1}_{type2}"
    output_file = dir3 + f"{data_type}_{label}_tracked_profiles_{res}_{t_res}_{Np_str}{suffix}.h5"
    return output_file
def Recombine(type1,type2,num_jobs):
    """
    Sum the per-job profile files for one category pair into a single HDF5 file.

    Datasets are selected from job 1's file by substring match on `type1`
    (and `type2` when given). Columns 0 and 1 of each selected dataset are
    then summed over jobs 1..num_jobs, and one gzip-compressed dataset per
    name is written to the output file.

    Parameters
    ----------
    type1 : str
        First category label.
    type2 : str or None
        Optional second category label.
    num_jobs : int
        Number of per-job input files; job ids run from 1 to num_jobs inclusive.

    NOTE(review): this function reads `dir`, `data_type`, `PROCESSING` and
    `data1` as module-level names, none of which is defined in the visible
    cells (`dir` would resolve to the Python builtin). It cannot run as-is.
    """
    # NOTE(review): `variables` is declared global but never used in this function.
    global variables
    print(f'recombining {type1}_{type2}')
    if type2==None:
        type2s = [f"{type1}"]
    else:
        type2s = [f"{type1}",f"{type2}"]
    # NOTE(review): `types` is not used anywhere below.
    types = ["ALL", "SHALLOW", "DEEP"]

    dir2=dir+'Project_Algorithms/Tracked_Profiles/SBATCH/job_out2/'
    dir3=dir+'Project_Algorithms/Tracked_Profiles/OUTPUT_FILES/'
    
    #MAKING OUTPUT FILE PATH
    output_file=GetOutputFile(data_type,type1,type2,dir3,PROCESSING)

    # Job 1's file is used only to discover which dataset names to combine.
    job_id=1; input_file=GetInputFile(data_type,job_id,dir2,PROCESSING)
    with h5py.File(input_file, 'r') as f:
        all_keys = list(f.keys())
        # Substring match: a label such as 'CL' also matches keys containing 'nonCL'.
        vars_list = [k for k in all_keys if any(t2 in k for t2 in type2s)]

    
    #MAKING PROFILES DICTIONARY
    zhs = data1['zh'].values
    profiles = {}  # Store profiles for all variables
    for var in vars_list:
        profiles[var] = np.zeros((len(zhs), 3))  # column 1: var, column 2: counter, column 3: list of zhs
        profiles[var][:, 2] = zhs 

    for job_id in np.arange(1,num_jobs+1):
        # Progress message every 20th job.
        if np.mod(job_id,20)==0: print(f"job_id = {job_id}")
    
        #CALLING IN DATA
        input_file=GetInputFile(data_type,job_id,dir2,PROCESSING)
    
        #COMPILING PROFILES
        with h5py.File(input_file, 'r') as f:
            for var in vars_list:
                # Only columns 0 and 1 are summed; column 2 keeps the zhs set above.
                profiles[var][:,0:1+1]+=f[f'{var}'][:,0:1+1]
    
    #SAVING INTO FINAL FORM
    with h5py.File(output_file, 'w') as f:
        for var in profiles:
            profile_var = profiles[var]
            f.create_dataset(f'{var}', data=profile_var, compression="gzip")

In [None]:
# Runs only when the recombine flag above is switched to True.
# NOTE(review): the loader output above lists CL, nonCL, SBF and ColdPool; 'SBZ'/'nonSBZ' do not appear there — confirm these labels.
if recombine==True:
    Recombine(type1='CL',type2='nonCL',num_jobs=num_jobs)
    Recombine(type1='SBZ',type2='nonSBZ',num_jobs=num_jobs)
    Recombine(type1='ColdPool',type2=None,num_jobs=num_jobs)

In [330]:
##############################################
#TESTING

In [331]:
# #testing: comparing above vectorized version to multi-loop version below

# #pure calculation method (slow)
# t=80

# a = trackedArrays['CL']['DEEP']
# ps = a[:,0]
# t1s = a[:,1]
# t2s = a[:,2]+a[:,3]

# profile=np.zeros((ModelData.Nzh,3))
# profile[:,2] = ModelData.zh

# plen = len(ps)
# for count, (p,t1,t2) in enumerate(zip(ps,t1s,t2s)):
#     if count % 100 == 0: print(f"{count}/{plen}")
#     ts = np.arange(t1, t2+1)
    
#     for t_ in ts:
#         if t_ != t:
#             continue
#         Z = GetSpatialData(t)[p]
#         var = MakeDataDictionary(["W"], t=t)["W"][p]
#         profile[Z,0]+=var
#         profile[Z,1]+=1


# #plotting

# def Plot(a,color):
#     b= a[:,0]/a[:,1]
#     # b*=1000
#     plt.plot(b,a[:,2],color=color)
# one = profileArraysDictionary['CL']['DEEP']['W']['profile_array'] #created using MakeTrackedProfiles function
# two = profile.copy()
# Plot(one,color='black')
# Plot(two,color='blue') #plots are the same
# np.all(one[:,0]==two[:,0]) #=True

In [394]:
# #testing file size of output dictionary
# def count_arrays(d):
#     """Recursively count total number of NumPy arrays in a nested dictionary."""
#     count = 0
#     if isinstance(d, dict):
#         for v in d.values():
#             count += count_arrays(v)
#     elif isinstance(d, np.ndarray):
#         count += 1
#     return count

# # usage
# total_arrays = count_arrays(profileArraysDictionary)
# print(f"Total arrays stored: {total_arrays}")

# res_per_array = (ModelData.Nzh*3)
# kb_per_array = res_per_array*4/1e3

# out = total_arrays*kb_per_array

# print(f"total of {out} KB") #~25KB per dictionary with 60 profiles within
# #true number is more like 512KB or 0.5MB