# Equation Discovery with Symbolic Regression to Parameterize Heat Flux in the Atmospheric Boundary Layer

*Authors: Antony Sikorski*

This notebook should make it easy to perform equation discovery with the use of the function `discover_eqs`. 

This function uses a number of supporting functions from the accompanying `functions.py` file, and should output a dataframe of possible equations.

We use the `PySR` package for symbolic regression, an ML method for finding interpretable symbolic expressions.

In [1]:
#importing libraries
# from pdfs import *
import os
import re

import xarray as xr
import netCDF4 as nc
import h5netcdf

import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import pandas as pd

import pysr
from pysr import PySRRegressor

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from functions import *

Detected IPython. Loading juliacall extension. See https://juliapy.github.io/PythonCall.jl/stable/compat/#IPython


Some quick data processing: 

In [2]:
# Directory holding the LES simulation output (.nc files).
# The location can be overridden with the LES_SIM_PATH environment variable so
# the notebook can run on another machine; when the variable is unset, the
# original local path is used unchanged.
# NOTE(review): the default ends with '/', and `path` is also handed to
# discover_eqs later on -- presumably a trailing separator is expected there,
# so keep one in any override (TODO confirm against functions.py).
path = os.environ.get(
    'LES_SIM_PATH',
    'C:/Users/anton/Desktop/Career/LEAP_nyc_Summer2024/les_sim_2/',
)

# list_directories_files comes from functions.py. Judging by the labels printed
# below, it returns the directories and files whose names start with 'Ug'
# (assumption -- verify against the helper's implementation).
directories, items = list_directories_files(path)
print("Directories starting with 'Ug':", directories)
print("Files starting with 'Ug':", items)

Directories starting with 'Ug': []
Files starting with 'Ug': ['Ug16Q000_IV.nc', 'Ug16Q001_IV.nc', 'Ug16Q003_IV.nc', 'Ug16Q006_I.nc', 'Ug16Q006_IV.nc', 'Ug16Q010_IV.nc', 'Ug1Q01_II.nc', 'Ug2Q010_I.nc', 'Ug2Q010_IV.nc', 'Ug2Q01_III.nc', 'Ug2Q024_I.nc', 'Ug8Q003_II.nc', 'Ug8Q003_III.nc', 'Ug8Q003_IV.nc', 'Ug8Q006_IV.nc']


In [3]:
# Report, for every simulation file, whether it contains a 'budget' group.
for item in items:
    # The context manager closes the file handle as soon as the check is done
    # (and also if it raises), instead of leaving one open handle per file.
    with nc.Dataset(os.path.join(path, item), mode='r') as ds_stat:
        has_budget = 'budget' in ds_stat.groups
    if has_budget:
        print ("budget is in", item)
    else:
        print ("budget is not in", item)

budget is not in Ug16Q000_IV.nc
budget is in Ug16Q001_IV.nc
budget is in Ug16Q003_IV.nc
budget is in Ug16Q006_I.nc
budget is in Ug16Q006_IV.nc
budget is in Ug16Q010_IV.nc
budget is in Ug1Q01_II.nc
budget is in Ug2Q010_I.nc
budget is in Ug2Q010_IV.nc
budget is in Ug2Q01_III.nc
budget is in Ug2Q024_I.nc
budget is in Ug8Q003_II.nc
budget is in Ug8Q003_III.nc
budget is in Ug8Q003_IV.nc
budget is in Ug8Q006_IV.nc


In [4]:
# Print the netCDF summary (dimensions, variables, groups) of each file.
# The first entry of `items` is skipped; in the recorded run that was the one
# file reported as having no 'budget' group (presumably the reason -- confirm).
for item in items[1:]:
    print(item)
    # os.path.join avoids the doubled separator that `path + '/' + item`
    # produced (path already ends with '/'), and matches the other cells.
    # The context manager closes each file once its summary has been printed.
    with nc.Dataset(os.path.join(path, item), mode='r') as df:
        print(df)

Ug16Q001_IV.nc
<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF4 data model, file format HDF5):
    dimensions(sizes): z(384), zh(385), time(120)
    variables(dimensions): float64 time(time), float64 z(z), float64 zh(zh)
    groups: default, thermo, budget
Ug16Q003_IV.nc
<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF4 data model, file format HDF5):
    dimensions(sizes): z(384), zh(385), time(120)
    variables(dimensions): float64 time(time), float64 z(z), float64 zh(zh)
    groups: default, thermo, budget
Ug16Q006_I.nc
<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF4 data model, file format HDF5):
    dimensions(sizes): z(256), zh(257), time(120)
    variables(dimensions): float64 time(time), float64 z(z), float64 zh(zh)
    groups: default, thermo, budget
Ug16Q006_IV.nc
<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF4 data model, file format HDF5):
    dimensions(sizes): z(384), zh(385), time(90)
    variables(dimensions): float64 time(time), float64 z(z), f

In [5]:
# Keep only the files whose 'z' dimension has the target size.
# Only 'z' is compared here; 'zh' is not checked explicitly (in the dataset
# summaries printed earlier, zh is always z + 1).
target_z_dim = 384

selected_files = []

# The first entry of `items` is skipped, as in the previous cell.
for item in items[1:]:
    # `with` guarantees the file is closed even if the dimension lookup raises.
    with nc.Dataset(os.path.join(path, item), mode='r') as df:
        if df.dimensions['z'].size == target_z_dim:
            selected_files.append(item)


print("Total number of files: ", len(items))
print("Number of valid files (same z and zh dims): ", len(selected_files))
print("Valid files: ", selected_files)

Total number of files:  15
Number of valid files (same z and zh dims):  11
Valid files:  ['Ug16Q001_IV.nc', 'Ug16Q003_IV.nc', 'Ug16Q006_IV.nc', 'Ug16Q010_IV.nc', 'Ug1Q01_II.nc', 'Ug2Q010_IV.nc', 'Ug2Q01_III.nc', 'Ug8Q003_II.nc', 'Ug8Q003_III.nc', 'Ug8Q003_IV.nc', 'Ug8Q006_IV.nc']


In [6]:
# Equation discovery on the single simulation file 'Ug16Q001_IV.nc'.
# discover_eqs (from functions.py) is expected to return a dataframe of
# candidate equations, per the notebook introduction.
# NOTE(review): the exact meaning of time_avg, indices and difficulty is
# defined in functions.py. indices selects everything along the first axis and
# entries 0..199 along the second -- confirm which physical axes these are.
eqs_Ug16Q001_IV = discover_eqs(
    path,
    ['Ug16Q001_IV.nc'],
    time_avg=15,
    indices=np.s_[:, 0:200],
    difficulty="hard",
)

Compiling Julia backend...


[ Info: Started!



Expressions evaluated per second: 1.590e+04
Head worker occupation: 7.0%
Progress: 117 / 120000 total iterations (0.098%)
Hall of Fame:
---------------------------------------------------------------------------------------------------
Complexity  Loss       Score     Equation
1           1.181e-05  1.594e+01  y = dTheta_dz
3           2.075e-08  3.172e+00  y = sigma_th * wtheta
5           3.439e-09  8.987e-01  y = (wtheta - dTheta_dz) * sigma_th
7           2.683e-09  1.242e-01  y = (sigma_th * sigma_th) * -0.29921
9           2.448e-09  4.586e-02  y = (sigma_th * 0.26262) * (wtheta - sigma_th)
13          2.421e-09  2.764e-03  y = (sigma_th * sigma_th) * ((sigma_2 + sigma_th) - (0.50103 -...
                                   sigma_th))
17          2.380e-09  4.279e-03  y = ((sigma_th * sigma_th) - (sigma_th - sigma_th)) * ((sigma_...
                                  2 + sigma_th) - (0.51301 - sigma_th))
-----------------------------------------------------------------------------

In [19]:
# Show the untruncated SymPy form of the candidate in row 8 (the table view
# below abbreviates long expressions). The column is selected by name rather
# than by position 4, so the cell does not depend on the column order.
display(eqs_Ug16Q001_IV["sympy_format"].iloc[8])
# The last expression renders the full table of candidate equations.
eqs_Ug16Q001_IV

(dTheta_dz*sigma_2 - 0.03334735*sigma_th)*(sigma_2 + 6.26801545166097*sigma_th + ustar)

Unnamed: 0,complexity,loss,score,equation,sympy_format,lambda_format
0,1,1.181084e-05,0.0,dTheta_dz,dTheta_dz,PySRFunction(X=>dTheta_dz)
1,3,2.07524e-08,3.172056,sigma_th * wtheta,sigma_th*wtheta,PySRFunction(X=>sigma_th*wtheta)
2,5,3.439345e-09,0.89869,(wtheta - dTheta_dz) * sigma_th,sigma_th*(-dTheta_dz + wtheta),PySRFunction(X=>sigma_th*(-dTheta_dz + wtheta))
3,7,2.683014e-09,0.12417,(sigma_th * sigma_th) * -0.29920682,-0.29920682*sigma_th**2,PySRFunction(X=>-0.29920682*sigma_th**2)
4,9,6.663191e-10,0.696464,(dTheta_dz * sigma_2) - (sigma_th * 0.03334735),dTheta_dz*sigma_2 - 0.03334735*sigma_th,PySRFunction(X=>dTheta_dz*sigma_2 - 0.03334735...
5,11,5.99871e-10,0.052527,(dTheta_dz * (sigma_2 - wtheta)) - (sigma_th *...,dTheta_dz*(sigma_2 - wtheta) - 0.03334735*sigm...,PySRFunction(X=>dTheta_dz*(sigma_2 - wtheta) -...
6,15,4.730977e-10,0.059353,(ustar + (sigma_th / sigma_2)) * ((sigma_2 * d...,(ustar + sigma_th/sigma_2)*(dTheta_dz*sigma_2 ...,PySRFunction(X=>(ustar + sigma_th/sigma_2)*(dT...
7,17,3.436851e-10,0.159788,((sigma_2 * dTheta_dz) - (0.03551055 * sigma_t...,(6.27058594173509*sigma_th + ustar)*(dTheta_dz...,PySRFunction(X=>(6.27058594173509*sigma_th + u...
8,19,2.285223e-10,0.204046,((ustar + sigma_2) + (sigma_th / 0.15954013)) ...,(dTheta_dz*sigma_2 - 0.03334735*sigma_th)*(sig...,PySRFunction(X=>(dTheta_dz*sigma_2 - 0.0333473...


**Open question (TODO):** What if `ustar` is supposed to be in front of the multiplied third term?

In [8]:
# Equation discovery on the single simulation file 'Ug2Q010_IV.nc', with the
# same settings as the run on 'Ug16Q001_IV.nc'.
# NOTE(review): the exact meaning of time_avg, indices and difficulty is
# defined in functions.py. indices selects everything along the first axis and
# entries 0..199 along the second -- confirm which physical axes these are.
eqs_Ug2Q010_IV = discover_eqs(
    path,
    ['Ug2Q010_IV.nc'],
    time_avg=15,
    indices=np.s_[:, 0:200],
    difficulty="hard",
)

[ Info: Started!



Expressions evaluated per second: 2.480e+04
Head worker occupation: 17.4%
Progress: 152 / 120000 total iterations (0.127%)
Hall of Fame:
---------------------------------------------------------------------------------------------------
Complexity  Loss       Score     Equation
1           1.000e+02  1.594e+01  y = q
3           3.266e-07  7.958e+00  y = dTheta_dz * wtheta
5           7.771e-08  7.179e-01  y = sigma_th * -0.024326
7           7.580e-08  1.247e-02  y = ustar * (-0.13919 * sigma_th)
9           3.018e-08  4.604e-01  y = (ustar * sigma_th) * ((sigma_th - ustar) - wtheta)
11          1.998e-08  2.063e-01  y = (sigma_th / q) * ((wtheta + ustar) * -1.1982)
17          1.975e-08  1.900e-03  y = ((sigma_th * ustar) / (1.468 + sigma_th)) * ((sigma_th - s...
                                  igma_th) - (ustar + wtheta))
19          1.611e-08  1.020e-01  y = ((sigma_th * -1.0634) / (q - wtheta)) * ((wtheta - -0.2449...
                                  5) - (sigma_th - wtheta))


In [28]:
# Show the untruncated SymPy form of the candidate in row 9 (the table view
# below abbreviates long expressions). The column is selected by name rather
# than by position 4, so the cell does not depend on the column order.
display(eqs_Ug2Q010_IV["sympy_format"].iloc[9])
# The last expression renders the full table of candidate equations.
eqs_Ug2Q010_IV

(-sigma_th - 0.060845807)*(1.4903882*sigma_th + 0.858827262784395*wtheta)/(q + sigma_2)

Unnamed: 0,complexity,loss,score,equation,sympy_format,lambda_format
0,1,3.128469e-05,0.0,dTheta_dz,dTheta_dz,PySRFunction(X=>dTheta_dz)
1,3,3.266173e-07,2.281047,dTheta_dz * wtheta,dTheta_dz*wtheta,PySRFunction(X=>dTheta_dz*wtheta)
2,5,7.575952e-08,0.730613,sigma_th * -0.02604297,-0.02604297*sigma_th,PySRFunction(X=>-0.02604297*sigma_th)
3,7,3.936634e-08,0.327326,0.025936155 * (dTheta_dz - sigma_th),0.025936155*dTheta_dz - 0.025936155*sigma_th,PySRFunction(X=>0.025936155*dTheta_dz - 0.0259...
4,9,1.111873e-08,0.63214,(ustar * sigma_th) * (-0.10220394 - wtheta),sigma_th*ustar*(-wtheta - 0.10220394),PySRFunction(X=>sigma_th*ustar*(-wtheta - 0.10...
5,13,6.178918e-09,0.146872,(ustar * (0.0033103146 + sigma_th)) * (-0.0933...,ustar*(sigma_th + 0.0033103146)*(-wtheta - 0.0...,PySRFunction(X=>ustar*(sigma_th + 0.0033103146...
6,15,3.091617e-09,0.346224,(wtheta - (sigma_th / -0.574465)) / (q / (-0.0...,(-sigma_th - 0.0429406)*(1.74075008921344*sigm...,PySRFunction(X=>(-sigma_th - 0.0429406)*(1.740...
7,17,2.983598e-09,0.017782,(wtheta - (sigma_th / -0.574465)) / ((q + sigm...,(-sigma_th - 0.0429406)*(1.74075008921344*sigm...,PySRFunction(X=>(-sigma_th - 0.0429406)*(1.740...
8,19,2.799467e-09,0.031851,((1.4903882 * sigma_th) - (wtheta * -0.7897615...,(-sigma_th - 0.060845807)*(1.4903882*sigma_th ...,PySRFunction(X=>(-sigma_th - 0.060845807)*(1.4...
9,21,2.464346e-09,0.063751,((1.4903882 * sigma_th) - (wtheta / -1.1643785...,(-sigma_th - 0.060845807)*(1.4903882*sigma_th ...,PySRFunction(X=>(-sigma_th - 0.060845807)*(1.4...
