# Equation Discovery with Symbolic Regression to Parameterize Heat Flux in the Atmospheric Boundary Layer

*Authors: Antony Sikorski*

This notebook should make it easy to perform equation discovery with the use of the function `discover_eqs`. 

This function uses a number of supporting functions from the accompanying `functions.py` file, and should output a dataframe of possible equations.

We use the `PySR` package for symbolic regression, an ML method for finding interpretable symbolic expressions.

In [None]:
#importing libraries
# from pdfs import *
import os
import re

import xarray as xr
import netCDF4 as nc
import h5netcdf

import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import pandas as pd

import pysr
from pysr import PySRRegressor

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from functions import *

Some quick data processing: 

In [None]:
# Directory holding the LES output files. The default is the original local
# folder; set the LES_DATA_DIR environment variable to point the notebook at
# another copy of the data without editing this cell.
path = os.environ.get('LES_DATA_DIR', 'C:/Users/anton/Desktop/Career/LEAP_nyc_Summer2024/les_sim_2/')

# list_directories_files is imported from functions.py; it returns the
# directory names and file names found under `path`.
directories, items = list_directories_files(path)
print("Directories starting with 'Ug':", directories)
print("Files starting with 'Ug':", items)

In [None]:
# Report, for every simulation file, whether it contains a 'budget' group.
for item in items:
    # Open through a context manager so the file handle is released after each
    # check instead of staying open for the lifetime of the kernel.
    with nc.Dataset(os.path.join(path, item), mode='r') as ds_stat:
        has_budget = 'budget' in ds_stat.groups

    if has_budget:
        print ("budget is in", item)
    else:
        print ("budget is not in", item)

In [None]:
# Print the netCDF summary of every file except the first entry of `items`.
# NOTE(review): items[0] is skipped here -- confirm that this is intentional.
for item in items[1:]:
    print(item)
    # os.path.join matches the other cells and avoids the doubled separator
    # produced by `path + '/' + item` when `path` already ends in '/'.
    # The context manager closes the file once its summary has been printed.
    with nc.Dataset(os.path.join(path, item), mode='r') as df:
        print(df)

In [None]:
# Keep only the files whose 'z' dimension has the expected number of levels,
# so that all selected simulations share the same vertical grid size.
target_z_dim = 384

selected_files = []

# NOTE(review): items[0] is skipped here although the total printed below
# counts every entry of `items` -- confirm that this is intentional.
for item in items[1:]:
    # The context manager guarantees the file is closed even when the 'z'
    # dimension lookup raises (e.g. a file without that dimension).
    with nc.Dataset(os.path.join(path, item), mode='r') as df:
        if df.dimensions['z'].size == target_z_dim:
            selected_files.append(item)


# NOTE(review): the message mentions "zh", but only the 'z' dimension is
# compared above.
print("Total number of files: ", len(items))
print("Number of valid files (same z and zh dims): ", len(selected_files))
print("Valid files: ", selected_files)

In [None]:
#eqs_Ug16Q001_IV = discover_eqs(path, ['Ug16Q001_IV.nc'], time_avg = 15, indices = np.s_[:, 0:200], difficulty = "hard")

In [None]:
# eqs_Ug16Q001_IV

In [None]:
#eqs_Ug2Q010_IV = discover_eqs(path, ['Ug2Q010_IV.nc'], time_avg = 15, indices = np.s_[:, 0:200], difficulty = "hard")

In [None]:
# eqs_Ug2Q010_IV

In [None]:
# Equation discovery over all selected files at the "easy" difficulty.
# discover_eqs is imported from functions.py; the call uses a time-averaging
# setting of 15 and the index slice [:, 0:200], with normalization enabled.
eqs_easy = discover_eqs(
    path,
    selected_files,
    time_avg=15,
    indices=np.s_[:, 0:200],
    difficulty="easy",
    normalize=True,
)
# Other difficulty settings, kept for reference (disabled):
# eqs_medium = discover_eqs(path, selected_files, time_avg = 15, indices = np.s_[:, 0:200], difficulty = "medium")
# eqs_mediumhard = discover_eqs(path, selected_files, time_avg = 15, indices = np.s_[:, 0:200], difficulty = "mediumhard")
# eqs_hard = discover_eqs(path, selected_files, time_avg = 15, indices = np.s_[:, 0:200], difficulty = "hard")

In [None]:
# Display the result returned by discover_eqs for the "easy" setting.
eqs_easy

In [None]:
# Build one summary row per selected simulation file: mean ustar, mean
# timescale tau, the Ug/Q values parsed from the file name, and the fit
# quality and coefficients of the per-file linear regression.

# Constants used for the velocity scale below. They are defined at cell level
# rather than inside the loop because `beta` is reused afterwards when the
# C2 coefficient is rescaled, and must exist even if no file is selected.
# presumably gravitational acceleration (m/s^2) and a reference temperature (K)
grr = 9.8
T_0 = 300
beta = grr/T_0

coef_columns = ['File', 'Avg Ustar', 'Avg Tau', 'Ug', 'Q', 'RMSE', 'R2', 'C1', 'C2', 'C3']
coef_rows = []

for item in selected_files:
    # Read everything needed from the file, then close it right away; the
    # context manager also releases the handle if a group/variable is missing.
    with nc.Dataset(os.path.join(path, item), mode='r') as ds_stat:
        #ustar
        ustar = ds_stat.groups['default'].variables['ustar'][:]

        #inputs for tau
        pbl_height = ds_stat.groups['thermo'].variables['zi'][:]
        wtheta_surface = ds_stat.groups['thermo']['th_flux'][:,0]

    #tau: zi / wstar, with wstar = (beta * surface heat flux * zi)^(1/3)
    wstar = np.power( beta * (wtheta_surface) * pbl_height , 1/3)
    tau = pbl_height/wstar

    #ug and q (parsed from the file name by extract_ug_q in functions.py)
    ug, q = extract_ug_q(item)

    #rmse, r2 and coefficients of the linear fit for this single file
    fitted_model, X_train, X_test, y_train, y_test, rmse, r2, coefficients = LES_linear_regressor(path, [item],
                                                                                                  time_avg = 15,
                                                                                                  indices = np.s_[:, 0:200],
                                                                                                  verbose = False)

    # Collect the row; the DataFrame is built once after the loop instead of
    # being re-created with pd.concat on every iteration.
    coef_rows.append({
        'File': item,
        'Avg Ustar': np.mean(ustar),
        'Avg Tau': np.mean(tau),
        'Ug': ug,
        'Q': q,
        'RMSE': rmse,
        'R2': r2,
        'C1': coefficients[0],
        'C2': coefficients[1],
        'C3': coefficients[2]
    })

# Passing `columns` keeps the column order fixed and yields an empty frame
# with the right headers when no file was selected.
df_coefs = pd.DataFrame(coef_rows, columns=coef_columns)

In [None]:
# Attach the inversion strength of each run (one value per row of df_coefs,
# in row order) and rescale the regression coefficients.
# NOTE(review): this cell updates df_coefs in place, so running it twice
# applies the C1/C2 scaling twice -- re-run the cell that builds df_coefs first.
df_coefs['Inversion Strength'] = [4, 4, 4, 4, 2, 4, 3, 2, 3, 4, 4]

# C1 is multiplied by the mean timescale, C2 is divided by beta; only the
# magnitudes of all three coefficients are kept.
df_coefs['C1'] = np.abs(df_coefs['C1'] * df_coefs['Avg Tau'])
df_coefs['C2'] = np.abs(df_coefs['C2'] / beta)
df_coefs['C3'] = np.abs(df_coefs['C3'])

In [None]:
# Display the per-file coefficient table.
df_coefs

In [None]:
# Plot each coefficient (C1, C2, C3) against every forcing/summary variable.
# Define the variables and their respective labels and colors
variables = ['Q', 'Ug', 'Avg Tau', 'Avg Ustar', 'Inversion Strength']
xlabels = ['Q', 'Ug', 'Avg Tau', 'Avg Ustar', 'SI']
# One title template per coefficient; '{}' is filled with the x-axis label.
titles = [
    'C1 (wtheta) Values in Relation to {}', 
    'C2 (theta2) Values in Relation to {}', 
    'C3 (multiply) Values in Relation to {}'
]
colors = ['red', 'green', 'blue']
columns = ['C1', 'C2', 'C3']

# One figure per variable, with one scatter panel per coefficient
for var, xlabel in zip(variables, xlabels):
    fig, axes = plt.subplots(1, 3, figsize=(12, 4))

    for ax, col, color, title in zip(axes, columns, colors, titles):
        ax.scatter(df_coefs[var], df_coefs[col], label=col, marker='o', color=color)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(f'{col} Values')
        ax.set_title(title.format(xlabel))
        ax.legend()

    fig.tight_layout()
    plt.show()

# Special case for box plots for 'Inversion Strength'
fig, axes = plt.subplots(1, 3, figsize=(12, 4))

for ax, col, title in zip(axes, columns, titles):
    df_coefs.boxplot(column=col, by='Inversion Strength', grid=False, ax=ax, patch_artist=True)
    ax.set_xlabel('SI')
    ax.set_ylabel(f'{col} Values')
    # Reuse the per-coefficient template so C2 and C3 are no longer
    # mislabelled as "(wtheta)".
    ax.set_title(title.format('SI'))
    fig.suptitle('')  # Suppress the automatic title added by pandas

fig.tight_layout()
plt.show()


In [None]:
###### Is there any relationship between fit and the forcings??? ################
# One scatter panel of R2 per variable; each entry is (column, marker color).
r2_panels = [
    ('Q', 'red'),
    ('Ug', 'green'),
    ('Avg Tau', 'blue'),
    ('Avg Ustar', 'darkmagenta'),
]

plt.figure(figsize=(16, 4))

for position, (forcing, shade) in enumerate(r2_panels, start=1):
    plt.subplot(1, 4, position)
    plt.scatter(df_coefs[forcing], df_coefs['R2'], label='R2', marker='o', color=shade)
    plt.xlabel(forcing)
    plt.ylabel('R2 Values')
    plt.title(f'R2 Values in Relation to {forcing}')
    plt.legend()

# Keep the four panels from overlapping, then render the figure
plt.tight_layout()
plt.show()


In [None]:
###### Is there any relationship between fit and the forcings??? ################
# One scatter panel of RMSE per variable; each entry is (column, marker color).
rmse_panels = [
    ('Q', 'red'),
    ('Ug', 'green'),
    ('Avg Tau', 'blue'),
    ('Avg Ustar', 'purple'),
]

plt.figure(figsize=(16, 4))

for position, (forcing, shade) in enumerate(rmse_panels, start=1):
    plt.subplot(1, 4, position)
    plt.scatter(df_coefs[forcing], df_coefs['RMSE'], label='RMSE', marker='o', color=shade)
    plt.xlabel(forcing)
    plt.ylabel('RMSE Values')
    plt.title(f'RMSE Values in Relation to {forcing}')
    plt.legend()

# Keep the four panels from overlapping, then render the figure
plt.tight_layout()
plt.show()


In [None]:
# Pairs of df_coefs columns to compare: the first entry becomes the pivot
# index (heatmap rows / y axis), the second the pivot columns (x axis).
variable_pairs = [
    ('Ug', 'Q'),
    ('Ug', 'Avg Tau'),
    ('Ug', 'Avg Ustar'),
    ('Q', 'Avg Tau'),
    ('Q', 'Avg Ustar'),
    ('Avg Tau', 'Avg Ustar'),
]

# Axis labels as (xlabel, ylabel): each variable pair reversed, because the
# second variable of a pair is drawn along x and the first along y.
xlabel_pairs = [(col_var, row_var) for row_var, col_var in variable_pairs]

# Function to create heatmap
def create_heatmap(ax, x_grid, y_grid, C, xlabel, ylabel, title):
    """Draw `C` as a viridis pcolormesh on `ax` with labels and a colorbar.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    x_grid, y_grid : coordinate grids passed to ``pcolormesh``.
    C : 2D values to colour.
    xlabel, ylabel, title : text for the axes and the panel title.
    """
    mesh = ax.pcolormesh(x_grid, y_grid, C, shading='auto', cmap='viridis')
    ax.set(xlabel=xlabel, ylabel=ylabel, title=title)
    # The colorbar is attached to this panel only.
    plt.colorbar(mesh, ax=ax)

# For every variable pair, draw one figure with a heatmap per coefficient
coef_names = ('C1', 'C2', 'C3')

for (var1, var2), (xlabel, ylabel) in zip(variable_pairs, xlabel_pairs):
    # Pivot each coefficient into a 2D table: rows = var1, columns = var2
    grids = {
        name: df_coefs.pivot_table(index=var1, columns=var2, values=name)
        for name in coef_names
    }

    # The coordinate grid is taken from the C1 table and shared by all panels
    x_grid, y_grid = np.meshgrid(grids['C1'].columns, grids['C1'].index)

    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    for ax, name in zip(axes, coef_names):
        create_heatmap(ax, x_grid, y_grid, grids[name].values, xlabel, ylabel,
                       f'Heatmap of {name} ({xlabel} vs {ylabel})')

    plt.tight_layout()
    plt.show()



In [None]:
# Compare the C2 and C3 coefficients across files in a single scatter panel
fig, ax = plt.subplots(figsize=(7, 4))
ax.scatter(df_coefs['C2'], df_coefs['C3'], marker='o', color='darkcyan', label='C2 vs C3')

# Axis labels, title and legend
ax.set_xlabel('C2 Values')
ax.set_ylabel('C3 Values')
ax.set_title('Scatter Plot of C2 vs C3')
ax.legend()

# Render the figure
plt.show()


In [None]:
# Display the coefficient table again before building the regression inputs.
df_coefs

In [None]:
# Predictors for the coefficient regressions: per-file forcing/summary columns
predictors_raw = df_coefs[['Avg Ustar', 'Avg Tau', 'Ug', 'Q']]

# Min-max normalization, column by column: (x - min) / (max - min)
predictor_min = predictors_raw.min()
predictor_span = predictors_raw.max() - predictor_min
df_X_l = ((predictors_raw - predictor_min) / predictor_span).rename(
    columns={'Avg Ustar': 'Ustar', 'Avg Tau': 'Tau', 'Q': 'Q_ic'}
)

# Regression targets: one Series per coefficient
df_C1, df_C2, df_C3 = df_coefs['C1'], df_coefs['C2'], df_coefs['C3']

In [None]:
# Display the min-max normalized predictor table.
df_X_l

In [None]:
# Run discover_coef_eqs (from functions.py) with the normalized predictors as
# inputs and the C1 column as the target.
# presumably a symbolic-regression search -- confirm in functions.py
df_c1_eqs = discover_coef_eqs(df_X_l, df_C1)

In [None]:
# Display the discover_coef_eqs result for C1.
df_c1_eqs

In [None]:
# Run discover_coef_eqs (from functions.py) with the normalized predictors as
# inputs and the C2 column as the target.
# presumably a symbolic-regression search -- confirm in functions.py
df_c2_eqs = discover_coef_eqs(df_X_l, df_C2)

In [None]:
# Display the discover_coef_eqs result for C2.
df_c2_eqs

In [None]:
# Run discover_coef_eqs (from functions.py) with the normalized predictors as
# inputs and the C3 column as the target.
# presumably a symbolic-regression search -- confirm in functions.py
df_c3_eqs = discover_coef_eqs(df_X_l, df_C3)

In [None]:
# Display the discover_coef_eqs result for C3.
df_c3_eqs

- normalize the data, then do symbolic regression. see how well that works. 
- try both min max and z score normalization

- (maybe create jittered/noisy variables for better fit, look up how much paper needs)

Other things to think about: 
- ustar in front of the third term (punished coefficient complexity) (think about this)
- parametrization for dwwtheta/dz

The final presentation will cover: 
1. The problem (background on turbulence, atmospheric boundary layer, etc.)
2. The methodology (PySR, how does it work, genetic algorithms, etc.)
3. Re-discovery of the original equation.
4. Current work on improving the parametrization.

Sara will decide how she wants us to present: Hopefully either all three, or Laura and Greta together and then me separate. 