# Steps to EQ Rotta

*Authors: Antony Sikorski, Sara Shamekh*

My modifications to the original EQ-Rotta notebook. 

In [2]:
# from pdfs import *
import os

import xarray as xr
import netCDF4 as nc
import h5netcdf

import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

Should return all files and folders that start with 'Ug', although in our case right now it should just be files: 

In [None]:
def list_directories_files(path):
    """
    Split the entries of `path` whose names start with 'Ug' into folders and files.

    Parameters:
    - path (str): Directory to scan (non-recursive).

    Returns:
    - (directories, files): two lists of entry names (not full paths), in the
      order reported by os.listdir.
    """
    directories = []
    files = []

    for entry in os.listdir(path):
        # only entries following the 'Ug...' naming scheme are of interest
        if not entry.startswith('Ug'):
            continue

        full_path = os.path.join(path, entry)
        if os.path.isdir(full_path):
            directories.append(entry)
        if os.path.isfile(full_path):
            files.append(entry)

    return directories, files


This lists all of our files. **All that needs to be changed is the path.** 

In [None]:
# Folder containing the simulation files; this is machine-specific and is the
# only value that has to be edited to run the notebook elsewhere.
path = 'C:/Users/anton/Desktop/Career/LEAP_nyc_Summer2024/les_sim_2/'


# Collect every folder / file in `path` whose name starts with 'Ug'.
directories, files = list_directories_files(path)
print("Directories starting with 'Ug':", directories)
print("Files starting with 'Ug':", files)

In [None]:
# nc_test1 = nc.Dataset(os.path.join('C:/Users/anton/Desktop/Career/LEAP_nyc_Summer2024/les_sim_2/', 'Ug2Q010_I.nc'), mode='r')

# z_256 = nc_test1.variables['z'][:]
# zh_257 = nc_test1.variables['zh'][:]
# t_120 = nc_test1.variables['time'][:]

# #rewrite this code but using 'Ug2Q010_IV.nc' instead
# nc_test2 = nc.Dataset(os.path.join('C:/Users/anton/Desktop/Career/LEAP_nyc_Summer2024/les_sim_2/', 'Ug2Q010_IV.nc'), mode='r')

# z_384 = nc_test2.variables['z'][:]
# zh_385 = nc_test2.variables['zh'][:]
# t_90 = nc_test2.variables['time'][:]

# # print(len(z_256), len(zh_257), len(t_120), len(z_384), len(zh_385), len(t_90))

In [None]:
def time_average(data, timeavg):
    """
    Average an array over consecutive, non-overlapping windows along axis 0.

    Trailing time steps that do not fill a complete window are discarded.

    Parameters:
    - data (array-like): Input with time as the first axis, e.g. (ntime,),
      (ntime, nz) or any higher-dimensional (ntime, ...) array. Plain lists are
      accepted; ndarray subclasses (e.g. masked arrays) are kept as they are.
    - timeavg (int): The number of time steps per averaging window (>= 1).

    Returns:
    - numpy.ndarray: Array of shape (ntime // timeavg, ...) with the mean of each window.

    Raises:
    - ValueError: If `timeavg` is smaller than 1 or `data` has no time axis.
    """
    if timeavg < 1:
        raise ValueError(f"timeavg must be a positive integer, got {timeavg}")

    # asanyarray (not asarray) so that masked arrays keep their mask
    data = np.asanyarray(data)
    if data.ndim == 0:
        raise ValueError("data must have at least one (time) dimension")

    nchunks = data.shape[0] // timeavg
    truncated_data = data[:nchunks * timeavg]

    # split the time axis into (window, step-within-window); every other axis is untouched
    reshaped_data = truncated_data.reshape(nchunks, timeavg, *data.shape[1:])

    # compute the mean over the steps inside each window
    return reshaped_data.mean(axis=1)


In [None]:
# Work on the files found above and fix the averaging window (in time steps)
# that the rest of the notebook uses.
items = files

time_avg = 15

# Report, file by file, whether a 'budget' group is present.
# NOTE: the handles are not closed here; `ds_stat` remains bound to the last
# file opened once the loop finishes.
for item in items:
    ds_stat = nc.Dataset(os.path.join(path, item), mode='r')
    print("budget is in" if 'budget' in ds_stat.groups else "budget is not in", item)
    

It appears that we can use all files except the very first (000) file. Going to print out all of the viable files so we can see dimension sizes and groups: 

In [None]:
# Print the header (dimensions, variables, groups) of every file except the first.
for item in items[1:]:
    print(item)
    # os.path.join matches the other cells; the context manager closes each
    # file once its summary has been printed.
    with nc.Dataset(os.path.join(path, item), mode='r') as df:
        print(df)

This function should create variables for a given list of `items`. In my case, my `selected_files` constitutes a group of similar (I think) simulations that I can append together for the sake of having more data.  

In [None]:
# Earlier, hand-picked selections kept for reference:
#selected_files = [items[1], items[2], items[4], items[5]]
# selected_files = [items[1]]
#selected_files = [items[1], items[2], items[4], items[5], items[6], items[8],items[9], items[11], items[12], items[13], items[14]]

# Keep every file (skipping the first one) whose 'z' dimension has this size,
# so that the selected runs can be stacked along the time axis.
target_z_dim = 384

selected_files = []

for item in items[1:]:
    df = nc.Dataset(os.path.join(path, item), mode='r')
    n_levels = df.dimensions['z'].size
    df.close()
    if n_levels == target_z_dim:
        selected_files.append(item)

print(len(selected_files))

In [None]:
# List every group of the first selected file together with its variables and
# their `long_name` attribute (when present). The context manager makes sure
# the file is closed again after the listing.
with nc.Dataset(os.path.join(path, selected_files[0]), mode='r') as df:
    for group_name, group in df.groups.items():
        print(f"Group: {group_name}")
        for var_name, var in group.variables.items():
            long_name = var.getncattr('long_name') if 'long_name' in var.ncattrs() else 'No long_name attribute'
            print(f"  Variable: {var_name}, Long Name: {long_name}")

In [None]:
def make_variables(path, items, time_avg):
    """
    Load and time-average the profile statistics used in the heat-flux budget.

    For every file in `items`, each variable listed below is read in full,
    averaged over windows of `time_avg` steps with `time_average`, and the
    per-file results are concatenated along the time axis (axis 0).

    Parameters:
    - path (str): Directory containing the NetCDF files.
    - items (list): File names to read, in the order they should be stacked.
    - time_avg (int): Number of time steps per averaging window.

    Returns:
    - tuple of seven arrays, in this order (group/variable they are read from):
      sigma_th  (thermo/th_2)     presumably the variance of theta
      sigma_2   (default/w_2)     presumably the variance of w
      Theta     (thermo/th)       domain mean theta
      wtheta    (thermo/th_flux)  heat flux
      wwtheta   (budget/wwtheta)  third moment
      rdstr     (budget/bw_rdstr)
      transport (budget/bw_pres)

    Raises:
    - ValueError: from np.concatenate when `items` is empty.
    """
    # (group, variable) pairs, in the order of the returned tuple
    fields = (
        ('thermo', 'th_2'),
        ('default', 'w_2'),
        ('thermo', 'th'),
        ('thermo', 'th_flux'),
        ('budget', 'wwtheta'),
        ('budget', 'bw_rdstr'),
        ('budget', 'bw_pres'),
    )

    # one accumulator list per output variable
    collected = [[] for _ in fields]

    for item in items:
        # the context manager closes each file as soon as its data has been read
        with nc.Dataset(os.path.join(path, item), mode='r') as ds_stat:
            for bucket, (group, name) in zip(collected, fields):
                bucket.append(time_average(ds_stat.groups[group][name][:], time_avg))

    # stack the per-file results along the time axis
    return tuple(np.concatenate(bucket, axis=0) for bucket in collected)

In [None]:
sigma_th, sigma_2, Theta, wtheta, wwtheta, rdstr, transport = make_variables(path, selected_files, time_avg)

# z = np.arange(0, 384)
# zh = np.arange(0, 385)
# t = np.arange(0, 120)

# Read the coordinates from one of the *selected* files. The module-level
# `ds_stat` left over from the earlier "budget" loop points at the last file
# listed, which is not guaranteed to share the vertical grid of the selection.
# NOTE(review): assumes all selected files use the same z / zh levels (they
# were only filtered on the size of 'z') - confirm.
with nc.Dataset(os.path.join(path, selected_files[0]), mode='r') as coord_ds:
    z = coord_ds.variables['z'][:]
    zh = coord_ds.variables['zh'][:]
    t = coord_ds.variables['time'][:]


# Vertical gradients: Theta is differentiated on z, wwtheta on zh.
dTheta_dz = np.gradient(Theta, z, axis = 1)
dwwtheta_dz = np.gradient(wwtheta, zh, axis = 1)

print(zh.shape, z.shape, t.shape)

Let's take a look at all of the variables (including gradients) that we create. We print their:

Name, shape, mean, and standard deviation

In [None]:
# Name, shape, mean and standard deviation of every loaded / derived variable.
# (The label for `rdstr` used to be misspelled as "rdtsr".)
for _label, _values in (
    ("dTheta_dz", dTheta_dz),
    ("sigma_th", sigma_th),
    ("sigma_2", sigma_2),
    ("Theta", Theta),
    ("wtheta", wtheta),
    ("wwtheta", wwtheta),
    ("dwwtheta_dz", dwwtheta_dz),
    ("rdstr", rdstr),
    ("transport", transport),
):
    print(_label, _values.shape, np.mean(_values), np.std(_values))

In [None]:
# Sanity check: first three wwtheta values of time index 1, followed by the
# averages of the two consecutive pairs formed from them.
print(wwtheta[1,0:3])
print((wwtheta[1,0]+wwtheta[1,1])/2, (wwtheta[1,1]+wwtheta[1,2])/2)

Variables that have a second dimension of 385 are calculated at zh values of 0, 6, 12, and so on. Variables with 384 are staggered and calculated in between there, at values of 3, 9, 15, and so on. We reshape the 385's to be 384's by taking the average of each consecutive pair. 

We test if this works above and below with old and new values of `wwtheta` (just a sanity check)

In [None]:
def reshape_variables(variable):
    """
    Average each pair of neighbouring columns of a 2D array.

    An input of shape (ntime, n) yields an output of shape (ntime, n - 1),
    where column j is the mean of input columns j and j + 1.
    """
    left_columns = variable[:, :-1]
    right_columns = variable[:, 1:]
    return (left_columns + right_columns) / 2.0

In [None]:
# Replace each of these variables by its pairwise-averaged version (one column
# fewer). Running this cell again would shorten the arrays a second time.
(sigma_2, wtheta, wwtheta, dwwtheta_dz, rdstr, transport) = [
    reshape_variables(_field)
    for _field in (sigma_2, wtheta, wwtheta, dwwtheta_dz, rdstr, transport)
]

In [None]:
# Same summary as before (name, shape, mean, std), now after the regridding.
for _label, _values in (
    ("dTheta_dz", dTheta_dz),
    ("sigma_th", sigma_th),
    ("sigma_2", sigma_2),
    ("Theta", Theta),
    ("wtheta", wtheta),
    ("wwtheta", wwtheta),
    ("dwwtheta_dz", dwwtheta_dz),
):
    print(_label, _values.shape, np.mean(_values), np.std(_values))

In [None]:
# First two wwtheta values of time index 1, to compare with the pair averages printed earlier.
print(wwtheta[1,0:2])

Plotting all variables at a given timestep just for a sanity check: 

In [None]:
# Time index (after averaging) at which the profiles are drawn.
timeStep = 6

# Each figure is a row of panels: (figure size, [(variable, panel title), ...]).
_figures = (
    ((10, 6), [
        (Theta, 'Theta(th)'),
        (dTheta_dz, 'dTheta_dz'),
        (wtheta, 'wtheta (th_flux)'),
    ]),
    ((12, 6), [
        (sigma_th, 'theta2bar (sigma_th)'),
        (sigma_2, 'sigma_w^2 (sigma_2)'),
        (wwtheta, '(wwtheta)'),
        (dwwtheta_dz, '(dwwtheta_dz)'),
    ]),
)

for _fig_size, _panels in _figures:
    plt.figure(figsize = _fig_size)
    for _col, (_field, _panel_title) in enumerate(_panels, start=1):
        plt.subplot(1, len(_panels), _col)
        plt.plot(_field[timeStep], z)
        plt.title(_panel_title)
        # only the first panel carries the y label and only the second the x label
        if _col == 1:
            plt.ylabel('Depth')
        elif _col == 2:
            plt.xlabel('Value')

## Diagnosing constants: 
We often use conventional constants to normalize profiles. Here are some of the typical ones used in the planetary boundary layer (PBL).

AS added note: This modified function calculates the constants once again for a list of different simulations. 

In [None]:
# Constants behind the buoyancy factor `beta` that multiplies sigma_th in the B term.
# NOTE(review): grr is presumably the gravitational acceleration (m/s^2) and
# T_0 a reference temperature (K) - confirm units.
grr = 9.8
T_0 = 300
beta = grr/T_0

def make_constants(path, items, time_avg):
    """
    Compute time-averaged scaling quantities for a list of simulation files.

    The raw (per time step) series are combined first and averaged afterwards
    with `time_average`; the per-file results are concatenated along time.
    Uses the module-level constant `beta`.

    Parameters:
    - path (str): Directory containing the NetCDF files.
    - items (list): File names to read, in the order they should be stacked.
    - time_avg (int): Number of time steps per averaging window.

    Returns:
    - tuple of six arrays, each of shape (ntime_total, 1), in this order:
      wtheta_surface: first vertical entry of thermo/th_flux
      pbl_height:     thermo/zi
      wstar:          (beta * wtheta_surface * pbl_height) ** (1/3)
      theta_star:     wtheta_surface / wstar
      scaling:        wstar**2 * theta_star / pbl_height
      ustar:          default/ustar

    Raises:
    - ValueError: from np.concatenate when `items` is empty.
    """
    names = ('wtheta_surface', 'pbl_height', 'wstar', 'theta_star', 'scaling', 'ustar')
    collected = {name: [] for name in names}

    for item in items:
        # read everything needed, then let the context manager close the file
        with nc.Dataset(os.path.join(path, item), mode='r') as ds_stat:
            wtheta_surface_raw = ds_stat.groups['thermo']['th_flux'][:,0]
            pbl_height_raw = ds_stat.groups['thermo'].variables['zi'][:]
            ustar_raw = ds_stat.groups['default'].variables['ustar'][:]

        # derived series are formed from the raw values, before any averaging
        wstar_raw = np.power( beta * (wtheta_surface_raw) * pbl_height_raw , 1/3)
        theta_star_raw = wtheta_surface_raw / wstar_raw
        scaling_raw = wstar_raw**2 * theta_star_raw / pbl_height_raw

        raw_series = (wtheta_surface_raw, pbl_height_raw, wstar_raw,
                      theta_star_raw, scaling_raw, ustar_raw)
        for name, series in zip(names, raw_series):
            collected[name].append(time_average(series, time_avg))

    # concatenate along time and add a trailing axis so that each constant
    # broadcasts against (ntime, nz) profiles
    return tuple(
        np.concatenate(collected[name], axis=0)[:, np.newaxis] for name in names
    )

In [None]:
# Scaling constants for the same files and averaging window as the profile variables.
wtheta_surface, pbl_height, wstar, theta_star, scaling, ustar = make_constants(path, selected_files, time_avg)

Once again printing 

Name, shape, mean, and standard deviation

of the constants (if they are time dependent). 

In [None]:
# Name, shape, mean and standard deviation of the time-dependent constants ...
for _label, _values in (
    ("wtheta_surface", wtheta_surface),
    ("pbl_height", pbl_height),
    ("wstar", wstar),
    ("theta_star", theta_star),
    ("scaling", scaling),
    ("ustar", ustar),
):
    print(_label, _values.shape, np.mean(_values), np.std(_values))

# ... and the plain scalar constants.
for _label, _value in (("grr", grr), ("T_0", T_0), ("beta", beta)):
    print(_label, _value)

## Compute the P term as residual
Equation 11 in the paper suggests that, assuming the left-hand side is small compared to the terms on the right-hand side, $P$ can be computed as the residual of the remaining terms.

AS added note: I compute the equation up to 384 as not all variables go up to 385. 



**Long description:** 

The equation in question is the heat flux budget in the convective boundary layer, which is written as: 

$$
\frac{\partial \overline{w \theta}}{\partial t}=0=\underbrace{-\sigma_w^2 \frac{\mathrm{d} \Theta}{\mathrm{d} z}}_M \underbrace{-\frac{\mathrm{d} \overline{w w \theta}}{\mathrm{d} z}}_T \underbrace{-\frac{1}{\rho_0} \overline{\theta \frac{\mathrm{d} p}{\mathrm{~d} z}}}_P \underbrace{+\beta \overline{\theta^2}}_B
$$

Production terms: mean gradient production ($M$) and buoyancy production ($B$). Destruction terms: turbulent flux transport ($T$) and the pressure gradient–potential temperature covariance ($P$). 

$P$ acts as a destruction/sink term for the heat flux $\overline{w \theta}$. The general form for the parametrization of $P$ is

$$
P=-\frac{1}{\rho_0} \overline{\theta \frac{\mathrm{d} p}{\mathrm{~d} z}}=-C_1 \frac{\overline{w \theta}}{\tau_1} - C_2 \beta \overline{\theta^2} + C_3 \sigma_w^2 \frac{\mathrm{d} \Theta}{\mathrm{d} z}
$$

The first term, $-C_1 \frac{\overline{w \theta}}{\tau_1}$, is the "slow" term. It is Rotta's return-to-isotropy part and is inversely proportional to the time scale $\tau_1$. I think it may also be called the "turbulence-turbulence" interaction. The last two terms are the "rapid" part, with $C_2 \beta \overline{\theta^2}$ being the buoyancy term and $C_3 \sigma_w^2 \frac{\mathrm{d} \Theta}{\mathrm{d} z}$ being the shear-turbulence term. 

In [None]:
# Budget terms, with the signs used in the heat-flux budget equation above:
# M = mean-gradient production, T = turbulent transport, B = buoyancy production.
M = (- sigma_2 * dTheta_dz)
T = - dwwtheta_dz
B = (beta * sigma_th)

# P is whatever is needed to close the budget 0 = M + T + P + B.
P = - M - T - B
P.shape

In [None]:
# Mean and standard deviation of each budget term, printed in the order B, M, T, P.
for _term in (B, M, T, P):
    print(f"{np.mean(_term):.7f}, {np.std(_term):.7f}")

In [None]:
# Extremes of the P profile at time index 6. As bare expressions, only the
# last one (the min/max ratio) is displayed by the notebook.
max(P[6])
min(P[6])

min(P[6])/max(P[6])

## Comparing P term and original rotta model plot
Let's reproduce plot 6 of the turbulence asymmetry paper to make sure all of our terms look "somewhat" similar. "Somewhat", because the Rotta model may not be very accurate. 

In [None]:
# Scaled P profile at `timeStep`, plotted against z.
# NOTE(review): the factor used here is pbl_height / (wstar * theta_star),
# whereas make_constants returns `scaling` = wstar**2 * theta_star / pbl_height,
# which is not used - confirm which normalisation is intended.
plt.figure(figsize = (10, 6))
plt.plot(P[timeStep] * pbl_height[timeStep] / (wstar[timeStep] * theta_star[timeStep]), z)
plt.ylim(0,960)
plt.title('P')
plt.xlabel('Value')
plt.ylabel('Depth')

# Cutting out near surface part
Some of the assumptions we make when computing P as a residual may not hold near the surface. Therefore, before preparing data for EQ, we remove the near-surface part of the profile, typically the lowest 7-10 layers (the first 7-10 indices along z).

In [None]:
# Keep vertical indices 8..154 only: the first 8 levels are dropped and
# everything from index 155 upward is discarded as well.
P_trim = P[:, 8:155]
P_trim.shape

# Trimmed profile at `timeStep`, against the matching slice of z.
plt.figure(figsize = (10, 6))
plt.plot(P_trim[timeStep], z[8:155])
plt.title('P (surface removed)')
plt.xlabel('Value')
plt.ylabel('Depth')

## Comparing P to alternative method for calculating it

Here we calculate P directly rather than as a residual, and compare the plots. 

In [None]:
# Alternative 1: difference of the two budget variables read from the files.
P_subtract = rdstr - transport

tau1 = 600 #unsure about this one at the moment, huge number makes it look correct though

# Alternative 2: the three-term closure with fixed coefficients 3, 5/12 and 2/5.
P_direct = - (3 * wtheta/tau1) - (5/12 * beta * sigma_th) + (2/5 * sigma_2 * dTheta_dz)

plt.figure(figsize = (14, 6))

# The middle title is built from `tau1` so that it cannot drift away from the
# value actually used in P_direct.
_panels = (
    (P_subtract, 'P (rdstr - transport)'),
    (P_direct, f'P (using eq 12, tau = {tau1})'),
    (P, 'P (residual)'),
)

for _col, (_profile, _panel_title) in enumerate(_panels, start=1):
    plt.subplot(1,3,_col)
    plt.plot(_profile[timeStep] * pbl_height[timeStep] / (wstar[timeStep] * theta_star[timeStep]), z)
    plt.ylim(45,1000)
    plt.title(_panel_title)
    plt.xlabel('Value')
    plt.ylabel('Depth')

Looking at `rdstr` and `transport` alone is a good sanity check before we abandon their use entirely.

In [None]:
plt.figure(figsize = (14, 6))

# rdstr, transport and the residual P side by side, all with the same scaling
# factor and without a y-limit.
_panels = (
    (rdstr, 'rdstr alone'),
    (transport, 'transport alone'),
    (P, 'P (residual)'),
)

for _col, (_profile, _panel_title) in enumerate(_panels, start=1):
    plt.subplot(1,3,_col)
    plt.plot(_profile[timeStep] * pbl_height[timeStep] / (wstar[timeStep] * theta_star[timeStep]), z)
    plt.title(_panel_title)
    plt.xlabel('Value')
    plt.ylabel('Depth')

Let's put all 3 P's on the same plot: 

In [None]:
plt.figure(figsize=(10, 6))

# Plot P_subtract
plt.plot(P_subtract[timeStep] * pbl_height[timeStep] / (wstar[timeStep] * theta_star[timeStep]), z, label='P (rdstr - transport)')

# Plot P_direct (the label reports the tau1 actually used; it previously said 700)
plt.plot(P_direct[timeStep] * pbl_height[timeStep] / (wstar[timeStep] * theta_star[timeStep]), z, label=f'P (direct from eq 12, tau = {tau1})')

# Plot P residual
plt.plot(P[timeStep] * pbl_height[timeStep] / (wstar[timeStep] * theta_star[timeStep]), z, label='P (residual)')

plt.ylim(45, 1000)
plt.title('Comparison of P values')
plt.xlabel('Value')
plt.ylabel('Depth')
plt.legend()
plt.grid(True)

plt.show()

## Equation Discovery

We do this with the `PySR` package for symbolic regression. As stated by their github, this is "a machine learning task where the goal is to find an interpretable symbolic expression that optimizes some objective."

We import the library down here because sometimes it takes an annoyingly long number of seconds. 

In [None]:
import pysr
from pysr import PySRRegressor

import pandas as pd

### Step 1: All moments in time, one layer

Here we take a single layer, at perhaps the 60th index of z, like `z[60] = 363` across all 120 minutes of time. We will do this for all different versions of $P$. 

In [None]:
# Height of the chosen layer (this bare expression is not the cell's last one, so it is not displayed).
z[60]

# Selector: every time step, vertical index 60.
indices = np.s_[:, 60]

In [None]:
# Targets: residual P and the direct (closure) P on the selected layer.
y0 = P[indices]
y1 = P_direct[indices]
# y2 = P_subtract[:,60]

# Features, in column order: wtheta, sigma_th, sigma_2, dTheta_dz.
x0 = wtheta[indices]
x1 = sigma_th[indices]
x2 = sigma_2[indices]
x3 = dTheta_dz[indices]

X = np.column_stack([x0, x1, x2, x3])
print(y0.shape, y1.shape, X.shape)#, y2.shape)

Setting up our models with just the basic binary operators (`+`, `*`, `-`, `/`): 

In [None]:
def _build_basic_regressor():
    """Return a fresh PySRRegressor limited to +, *, -, / with a squared-error loss."""
    return PySRRegressor(
        niterations=100,  # < Increase me for better results
        binary_operators=["+", "*", "-", "/"],
        # unary operators (cos, exp, sin, inv(x) = 1/x, square, cube) are left out here
        extra_sympy_mappings={"inv": lambda x: 1 / x},
        # ^ SymPy definition of the custom `inv` operator
        elementwise_loss="loss(prediction, target) = (prediction - target)^2",
        # ^ Custom loss function (julia syntax)
        # batching (batching=True, batch_size=1000) is left off
    )


# Two identically configured models, one per target.
# A third one (model2, meant for P_subtract) is currently disabled.
model0 = _build_basic_regressor()
model1 = _build_basic_regressor()

In [None]:
# model0 learns the residual P, model1 the direct P, from the same features.
model0.fit(X,y0)
model1.fit(X,y1)
# model2.fit(X,y2)

Looking at last equation:

In [None]:
# Show the table of candidate equations of each model, then one cell of it.
# NOTE(review): `iloc[-1, 4]` takes the last row and fifth column by position;
# presumably that is the SymPy form of the most complex candidate - confirm
# against the column order of the installed PySR version.
for model in (model0, model1): #, model2):
    equations = model.equations_
    print(equations.shape)
    display(equations)
    print(model, "Final Equation: ")
    display(equations.iloc[-1, 4])

### Step 2: One moment in time, a few layers

This is probably too simple, but we'll see. Just going to take z from 60 to 80, at one point in time, and see what we get. 

In [None]:
# Peek at the slice (not displayed, since it is not the cell's last expression).
wtheta[14,60:80]

# Selector: time index 14, vertical indices 60..79.
indices = np.s_[14, 60:80]

In [None]:
# Targets: residual P and the direct (closure) P on the selected slice.
y0 = P[indices]
y1 = P_direct[indices]
# y2 = P_subtract[:,60]

# Features, in column order: wtheta, sigma_th, sigma_2, dTheta_dz.
x0 = wtheta[indices]
x1 = sigma_th[indices]
x2 = sigma_2[indices]
x3 = dTheta_dz[indices]

X = np.column_stack([x0, x1, x2, x3])
print(y0.shape, y1.shape, X.shape)#, y2.shape)

In [None]:
def _build_basic_regressor():
    """Return a fresh PySRRegressor limited to +, *, -, / with a squared-error loss."""
    return PySRRegressor(
        niterations=100,  # < Increase me for better results
        binary_operators=["+", "*", "-", "/"],
        # unary operators (cos, exp, sin, inv(x) = 1/x, square, cube) are left out here
        extra_sympy_mappings={"inv": lambda x: 1 / x},
        # ^ SymPy definition of the custom `inv` operator
        elementwise_loss="loss(prediction, target) = (prediction - target)^2",
        # ^ Custom loss function (julia syntax)
        # batching (batching=True, batch_size=1000) is left off
    )


# Two identically configured models, one per target.
model3 = _build_basic_regressor()
model4 = _build_basic_regressor()

In [None]:
# model3 learns the residual P, model4 the direct P, from the same features.
model3.fit(X,y0)
model4.fit(X,y1)

In [None]:
# Show the table of candidate equations of each model, then the cell in its
# last row and fifth column (selected by position).
for model in (model3, model4):
    equations = model.equations_
    print(equations.shape)
    display(equations)
    print(model, "Final Equation: ")
    display(equations.iloc[-1, 4])

### Step 3: All moments in time, a few layers

This is my current best bet. Might have to start using batching here. I think in the others we have too little data. Will take heights 60 to 80 and all instances in time

In [None]:
# Shape of the 2D selection and of its flattened form.
print(dTheta_dz[:, 60:80].shape, dTheta_dz[:, 60:80].ravel().shape)

# Selector: every time step, vertical indices 60..79.
indices = np.s_[:, 60:80]

In [None]:
# Targets, flattened so that every (time, level) pair becomes one sample.
y0 = P[indices].ravel()
y1 = P_direct[indices].ravel()
# y2 = P_subtract[:,60]

# Features flattened the same way; column order: wtheta, sigma_th, sigma_2, dTheta_dz.
x0 = wtheta[indices].ravel()
x1 = sigma_th[indices].ravel()
x2 = sigma_2[indices].ravel()
x3 = dTheta_dz[indices].ravel()



X = np.column_stack([x0, x1, x2, x3])
print(y0.shape, y1.shape, X.shape)#, y2.shape)

In [None]:
def _build_basic_regressor():
    """Return a fresh PySRRegressor limited to +, *, -, / with a squared-error loss."""
    return PySRRegressor(
        niterations=100,  # < Increase me for better results
        binary_operators=["+", "*", "-", "/"],
        # unary operators (cos, exp, sin, inv(x) = 1/x, square, cube) are left out here
        extra_sympy_mappings={"inv": lambda x: 1 / x},
        # ^ SymPy definition of the custom `inv` operator
        elementwise_loss="loss(prediction, target) = (prediction - target)^2",
        # ^ Custom loss function (julia syntax)
        # batching (batching=True, batch_size=1000) is left off
    )


# Two identically configured models, one per target.
model5 = _build_basic_regressor()
model6 = _build_basic_regressor()

In [None]:
# model5 learns the residual P, model6 the direct P, from the same features.
model5.fit(X,y0)
model6.fit(X,y1)

In [None]:
# Show the table of candidate equations of each model, then the cell in its
# last row and fifth column (selected by position).
for model in (model5, model6):
    equations = model.equations_
    print(equations.shape)
    display(equations)
    print(model, "Final Equation: ")
    display(equations.iloc[-1, 4])

The timescale is in seconds, so 600 or more is reasonable (on the order of 10 minutes or so). Also, $\tau = \frac{h}{w_*}$, where $\tau$ is the timescale of circulation, $h$ is `pbl_height` (the "length scale"), and $w_*$ is `wstar` (the "velocity scale").

play with different symbolic regression parameters/settings (complexity, etc)

### Step 4: All moments in time, most layers

In [None]:
# Selector: every time step, vertical indices 0..199 (this includes the
# lowest levels that were trimmed for the P_trim plot).
indices = np.s_[:, 0:200]

# Only the residual P is used as target in this step.
y = P[indices].ravel()

# Features flattened to one sample per (time, level) pair;
# column order: wtheta, sigma_th, sigma_2, dTheta_dz.
x0 = wtheta[indices].ravel()
x1 = sigma_th[indices].ravel()
x2 = sigma_2[indices].ravel()
x3 = dTheta_dz[indices].ravel()



X = np.column_stack([x0, x1, x2, x3])
print(y.shape, X.shape)

In [None]:
# Wrap features and target in DataFrames so that the columns carry the variable names.
df_X = pd.DataFrame(X, columns=['wtheta', 'sigma_th', 'sigma_2', 'dTheta_dz'])
df_y = pd.DataFrame(y, columns=['P'])

In [None]:
# Larger search than in the previous steps: more iterations, unary operators
# enabled, batching switched on, per-operator complexity settings and a
# limit on the nesting depth.
model7 = PySRRegressor(
    niterations=2000,  # < Increase me for better results
    binary_operators=["+", "*", "-", "/"],
    unary_operators=[
        "cos",
        "exp",
        "sin",
        "inv(x) = 1/x",
        "square",
        "cube",
        # ^ Custom operator (julia syntax)
    ],
    extra_sympy_mappings={"inv": lambda x: 1 / x},
    # ^ Define operator for SymPy as well
    elementwise_loss="loss(prediction, target) = (prediction - target)^2",
    # ^ Custom loss function (julia syntax)
    batching = True,
    batch_size = 200,
    complexity_of_operators={"/": 1, "exp": 3, "*":1, "sin": 3, "cos": 3, "inv": 3, "square": 3, "cube": 3},
    # ^ Custom complexity of particular operators
    maxdepth=5,
    # ^ But, avoid deep nesting.
)

In [None]:
# Fit on the named DataFrames built above.
model7.fit(df_X,df_y)

In [None]:
# Show the table of candidate equations, then the cell in its last row and
# fifth column (selected by position).
for model in [model7]:
    equations = model.equations_
    print(equations.shape)
    display(equations)
    print("Final Equation: ")
    display(equations.iloc[-1, 4])

In [None]:
# Hard-coded expressions evaluated on the full (time, z) arrays.
# NOTE(review): the numeric coefficients are presumably copied by hand from a
# PySR run - they will not update if the regression is re-run.
discovered_eq1 = (-0.004997506 * wtheta) - (sigma_th * 0.019068547)
discovered_eq2 = (((sigma_th * wtheta) + (sigma_2 * 0.4073766)) * dTheta_dz) - (0.02337299 * (sigma_th + (0.13268535 * wtheta)))

# Three-term linear form using coefficients that also appear in discovered_eq2.
theoretical_eq = (0.4073766 * sigma_2 * dTheta_dz) - (0.02337299 *sigma_th) + (0.0031012533586965 * wtheta)

In [None]:
# discovered value in front of sigma_th is -0.019068547
# Compare -C2 * beta with the sigma_th coefficient of discovered_eq1 (-0.019068547).
C2 = 1/2

- C2*beta
#close enough

In [None]:
#discovered value in front of wtheta is -0.004997506

# Compare -C1 / tau1 with the wtheta coefficient of discovered_eq1 (-0.004997506).
C1 = 3
print(- C1/tau1)



# tau implied by the discovered coefficient when C1 = 3
print(C1/0.004997506)

#reasonable tau is about 600, which is exactly what we have

# pbl_height / wstar for every averaged time step, for comparison
pbl_height/wstar

In [None]:
# Mean size of the wtheta term, using the wtheta coefficient of discovered_eq1
# (-0.004997506; the sigma_th coefficient had been used here by mistake).
print(f"{np.mean(wtheta) * -0.004997506:.7f}")

In [None]:
# Mean size of the sigma_th term, using the sigma_th coefficient of discovered_eq1
# (-0.019068547; the wtheta coefficient had been used here by mistake).
print(f"{np.mean(sigma_th) * -0.019068547:.7f}")

In [None]:
# Mean size of the sigma_2 * dTheta_dz term with the 2/5 coefficient used in P_direct.
print(f"{np.mean(sigma_2 * dTheta_dz) * 2/5:.7f}")

In [None]:
timeStep = 10

plt.figure(figsize=(10, 6))

plt.plot(P[timeStep], z, label = 'P (residual, eq 11)')
# Individual terms (coefficients paired as in discovered_eq1):
# plt.plot(wtheta[timeStep] * -0.004997506, z, label = 'wtheta')
# plt.plot(sigma_th[timeStep] * -0.019068547, z, label = 'sigma_th')
# plt.plot(sigma_2[timeStep] * dTheta_dz[timeStep] * 2/5, z, label = 'sigma_2 * dTheta_dz')

#if u make the third term negative the game changes
# The first two coefficients now match discovered_eq1 (-0.004997506 for wtheta,
# -0.019068547 for sigma_th); they had been swapped. The sign of the third term
# is left negative on purpose, as per the note above.
plt.plot((- wtheta[timeStep] * 0.004997506) + (- sigma_th[timeStep] * 0.019068547) - (sigma_2[timeStep] * dTheta_dz[timeStep] * 2/5), z, label = '3 term sum', color = 'darkred')
plt.plot(discovered_eq2[timeStep], z, label = 'discovered eq2', color = 'red')
plt.plot(discovered_eq1[timeStep], z, label = 'discovered eq1', color = 'green')

plt.plot(theoretical_eq[timeStep], z, label = 'theoretical eq', color = 'goldenrod')

plt.plot(P_direct[timeStep], z, label = 'P (direct, eq12)')

plt.ylim(0, 1000)
plt.title('Comparison of P values')
plt.xlabel('Value')
plt.ylabel('Depth')
plt.legend()
plt.grid(True)

plt.show()

In [None]:
# Predicted against true P. model7 was fit on the named DataFrame `df_X`, so
# the same object is used for prediction (same values as X, but with the
# feature names the model was fitted with).
plt.scatter(y, model7.predict(df_X))
plt.xlabel('Truth')
plt.ylabel('Prediction')
plt.show()

In [16]:
def make_wtheta_hmin(path, items, time_avg):
    """
    Find, for every time step, the minimum of the heat-flux profile and the
    height at which it occurs, then average both over windows of time steps.

    Parameters:
    - path (str): The path to the LES simulation data.
    - items (list): A list of file names containing the LES simulation data.
    - time_avg (int): The number of time steps over which to average.

    Returns:
    - hmin: time-averaged heights (taken from 'zh') of the heat-flux minimum
    - wtheta_hmin: time-averaged minimum heat-flux values

    Raises:
    - ValueError: from np.concatenate when `items` is empty.
    """
    # per-file results, concatenated along time at the end
    hmin = []
    wtheta_hmin = []

    for item in items:
        # read what is needed, then let the context manager close the file
        with nc.Dataset(os.path.join(path, item), mode='r') as ds_stat:
            # assumes th_flux is stored as (time, zh) - TODO confirm
            wtheta = ds_stat.groups['thermo']['th_flux'][:]
            # 'zh' is read from the root of the file, as the other cells do
            zh = ds_stat.variables['zh'][:]

        # position and value of the minimum of each profile, all time steps at once
        min_index = np.argmin(wtheta, axis=1)
        hmin_raw = np.asarray(zh[min_index])
        wtheta_hmin_raw = np.asarray(np.min(wtheta, axis=1))

        # time_average needs arrays (it uses .shape), hence the conversions above
        hmin.append(time_average(hmin_raw, time_avg))
        wtheta_hmin.append(time_average(wtheta_hmin_raw, time_avg))

    # Concatenate the results along the time axis
    hmin = np.concatenate(hmin, axis=0)
    wtheta_hmin = np.concatenate(wtheta_hmin, axis=0)

    return hmin, wtheta_hmin

In [15]:
# Same machine-specific data folder as the one set near the top of the notebook.
path = 'C:/Users/anton/Desktop/Career/LEAP_nyc_Summer2024/les_sim_2/'

# example_df = nc.Dataset(os.path.join(path, 'Ug16Q003_IV.nc'), mode='r')

# wtheta_temp = example_df.groups['thermo']['th_flux'][:]
# zh = example_df.variables['zh'][:]
# wtheta_temp.shape

# hmin_temp = []
# wtheta_min_temp = []

# for t in range(wtheta_temp.shape[0]):
#     wtheta_at_t = wtheta_temp[t]
#     min_wtheta = np.min(wtheta_at_t)
#     min_height = zh[np.argmin(wtheta_at_t)]
#     hmin_temp.append(min_height)
#     wtheta_min_temp.append(min_wtheta)





[876.0, 882.0, 882.0, 870.0, 858.0, 870.0, 846.0, 900.0, 876.0, 900.0, 882.0, 870.0, 858.0, 912.0, 918.0, 894.0, 876.0, 864.0, 876.0, 894.0, 930.0, 972.0, 960.0, 936.0, 930.0, 906.0, 918.0, 924.0, 900.0, 918.0, 918.0, 936.0, 924.0, 906.0, 906.0, 918.0, 942.0, 948.0, 960.0, 954.0, 936.0, 984.0, 984.0, 996.0, 990.0, 894.0, 936.0, 918.0, 906.0, 888.0, 876.0, 846.0, 984.0, 936.0, 936.0, 954.0, 930.0, 900.0, 912.0, 918.0, 912.0, 912.0, 966.0, 966.0, 978.0, 954.0, 966.0, 936.0, 954.0, 942.0, 966.0, 936.0, 924.0, 906.0, 972.0, 966.0, 948.0, 924.0, 924.0, 948.0, 972.0, 1014.0, 1008.0, 1002.0, 984.0, 960.0, 978.0, 972.0, 972.0, 954.0, 978.0, 1002.0, 1044.0, 966.0, 954.0, 990.0, 966.0, 984.0, 984.0, 972.0, 954.0, 948.0, 966.0, 990.0, 1050.0, 1026.0, 954.0, 1002.0, 990.0, 948.0, 954.0, 924.0, 930.0, 936.0, 1026.0, 1014.0, 996.0, 978.0, 1038.0, 1020.0]
[-0.01394034040636906, -0.01339567088252956, -0.01220588938803464, -0.011673264932567241, -0.011677776304173326, -0.012340427794862817, -0.01211350

In [None]:
def make_wtheta_hmin(path, items, time_avg):
    """
    Find, for every time step, the minimum of the heat-flux profile and the
    height at which it occurs, then average both over windows of time steps.

    Parameters:
    - path (str): The path to the LES simulation data.
    - items (list): A list of file names containing the LES simulation data. Ex: ['Ug16Q001_IV.nc', 'Ug16Q003_IV.nc', 'Ug16Q006_I.nc']
    - time_avg (int): The number of time steps over which to average.

    Returns:
    - hmin: time-averaged heights (taken from 'zh') of the heat-flux minimum
    - wtheta_hmin: time-averaged minimum heat-flux values

    Raises:
    - ValueError: from np.concatenate when `items` is empty.
    """
    # per-file results, concatenated along time at the end
    hmin = []
    wtheta_hmin = []

    for item in items:
        # read what is needed, then let the context manager close the file
        with nc.Dataset(os.path.join(path, item), mode='r') as ds_stat:
            # assumes th_flux is stored as (time, zh) - TODO confirm
            wtheta = ds_stat.groups['thermo']['th_flux'][:]
            zh = ds_stat.variables['zh'][:]

        # position and value of the minimum of each profile, all time steps at once
        min_index = np.argmin(wtheta, axis=1)
        hmin_raw = np.asarray(zh[min_index])
        wtheta_hmin_raw = np.asarray(np.min(wtheta, axis=1))

        hmin.append(time_average(hmin_raw, time_avg))
        wtheta_hmin.append(time_average(wtheta_hmin_raw, time_avg))

    # Concatenate the results along the time axis
    hmin = np.concatenate(hmin, axis=0)
    wtheta_hmin = np.concatenate(wtheta_hmin, axis=0)

    return hmin, wtheta_hmin





# Apply the function to the selected files with the notebook-wide averaging window.
hmin, wtheta_min = make_wtheta_hmin(path, selected_files, time_avg)