# 2025 Symbolic regression Monod paper
# Post-processing of symbolic regression equations

## Locating data

In [None]:
# Root directory of this analysis; later cells build their output paths
# (e.g. "{cwd}/sreq/...", "{cwd}/symb/...") from it.
cwd = "/scratch/project_2000746/anthosun/2025SRMO"

In [None]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns
#import itertools
from sympy import *
from IPython.display import display # to display symbolic expressions

import os
#import string
#from collections import Counter

# Functions

In [None]:
from sympy.core.numbers import Integer, NegativeOne
from sympy.functions.elementary.complexes import sign
from sympy.physics.units.quantities import Quantity
from sympy.solvers.ode import constantsimp

In [None]:
# Symbols used when parsing and simplifying the regression equations.
# The assumption keyword is spelled `nonnegative` in sympy; the previous
# spelling `nonegative` is not a sympy assumption, so these four symbols
# did not carry the intended nonnegativity information.
t, C, n, S = symbols("t C n S", nonnegative=True)
U, F = symbols("U F", positive=True)
Nc, gmax, S0 = symbols("Nc gmax S0", positive=True)
nu = symbols("nu", real=True)

# Name -> symbol mapping handed to `parse_expr` so that these names in an
# equation string resolve to the symbols (and assumptions) declared above.
symbols_dict = {
    "C": C,
    "F": F,
    "n": n,
    "S": S,
    "t": t,
    "U": U,
}

In [None]:
def is_a_remarkable_number(a):
    """Return True when `a` is a sympy ``Number`` other than ``NegativeOne``."""
    if isinstance(a, NegativeOne):
        # -1 is deliberately not counted as a number of interest.
        return False
    return isinstance(a, Number)

def contains_remarkable_number(expr):
    """Return True if at least one atom of `expr` is a remarkable number.

    "Remarkable" is whatever `is_a_remarkable_number` accepts.
    """
    return any(is_a_remarkable_number(atom) for atom in expr.atoms())

def replace_numbers_by_Ks(expr):
    """Substitute every remarkable number in `expr` by a signed ``Quantity``.

    Each remarkable number met during a pre-order traversal of the input
    expression is replaced by ``sign(number) * Quantity("K<i>")``, where
    ``i`` starts at 1 and increases with every number met.

    Returns the expression after all substitutions.
    """
    substituted = expr
    counter = 1
    # The traversal walks the expression as it was passed in; substitutions
    # are applied to the running result.
    for node in preorder_traversal(expr):
        if not is_a_remarkable_number(node):
            continue
        placeholder = Quantity("K{}".format(counter))
        substituted = substituted.subs(node, sign(node) * placeholder)
        counter += 1

    return substituted

def count_Quantitys(expr):
    """Count the nodes of `expr` that are ``Quantity`` instances.

    Nodes are visited with a pre-order traversal, so a quantity that occurs
    at several places in the tree is counted once per occurrence.
    """
    return sum(1 for node in preorder_traversal(expr) if isinstance(node, Quantity))

def count_parameters(expr):
    """Return the number of ``Quantity`` placeholders left in `expr`.

    While the expression still contains a remarkable number, numbers are
    replaced by ``K<i>`` quantities and the result is passed through
    `constantsimp` to merge constants. The quantities in the final
    expression are then counted.

    The count is taken after the loop, so an expression that contains no
    remarkable number at all no longer fails with ``UnboundLocalError``;
    it returns the number of quantities it already holds.
    """
    while contains_remarkable_number(expr):
        expr = replace_numbers_by_Ks(expr)
        expr = constantsimp(expr, list(expr.atoms(Quantity)))  # simplify constants

    return count_Quantitys(expr)

In [None]:
def dataframe_with_node_counts(eqns, # equations
                               Sqns = None, # equations with S
                              ):
    """Build a table of parsed equations with node and constant counts.

    Parameters
    ----------
    eqns : iterable of str
        Equation strings, parsed with `parse_expr` and `symbols_dict`.
    Sqns : iterable of str, optional
        Equation strings of the same length as `eqns`; the i-th entry is
        paired with the i-th entry of `eqns` by position.

    Returns
    -------
    pandas.DataFrame
        One row per equation with columns "eq" (simplified expression),
        "nd" (value of `count_expr_nodes`), "Kq" (expression with numbers
        replaced by ``K<i>`` quantities) and "ct" (number of quantities in
        "Kq"). When `Sqns` is given, also "eq(S)" (simplified expression of
        the `Sqns` entry) and "C?F?" (True if `C` or `F` is an atom of it).
    """
    # Materialise Sqns so entries are matched to `eqns` by position; indexing
    # a pandas Series with the enumerate counter would be label-based.
    s_strings = None if Sqns is None else list(Sqns)

    exprs = []
    Kxprs = []
    node_counts = []
    csts_counts = []
    Sxprs = []
    CorFs = []

    for i, eqn in enumerate(eqns):
        expr = simplify(parse_expr(eqn, symbols_dict))
        exprs.append(expr)
        # NOTE(review): count_expr_nodes is not defined in the visible part
        # of this file; it must be provided elsewhere.
        node_counts.append(count_expr_nodes(expr))

        # Replace numeric constants by K-quantities until none is left.
        kexpr = expr
        while contains_remarkable_number(kexpr):
            kexpr = replace_numbers_by_Ks(kexpr)
            kexpr = constantsimp(kexpr, list(kexpr.atoms(Quantity)))  # simplify constants
        Kxprs.append(kexpr)
        csts_counts.append(count_Quantitys(kexpr))

        if s_strings is not None:
            sexpr = simplify(parse_expr(s_strings[i], symbols_dict))
            Sxprs.append(sexpr)
            atoms = sexpr.atoms()
            CorFs.append((C in atoms) or (F in atoms))

    rows = [exprs, node_counts, Kxprs, csts_counts]
    labels = ["eq", "nd", "Kq", "ct"]
    if s_strings is not None:
        rows += [Sxprs, CorFs]
        labels += ["eq(S)", "C?F?"]

    # Rows are assembled per quantity and transposed to one row per equation.
    df = pd.DataFrame(rows, index=labels)

    return df.T

In [None]:
def retrieve_loss_norm(name, data_dir="/scratch/project_2000746/anthosun/2024SRMO/data"):
    """Return the sample variance of the "rho" column of a data set.

    Parameters
    ----------
    name : str
        File name without extension; "<data_dir>/<name>.csv" is read.
    data_dir : str, optional
        Directory holding the CSV files. Defaults to the location that was
        previously hard-coded, so existing calls behave as before.

    Returns
    -------
    float
        Variance of the "rho" column computed with ``ddof=1``.
    """
    fn = "{}/{}.csv".format(data_dir, name)
    norm_factor = np.var(pd.read_csv(fn)["rho"], ddof=1)

    return norm_factor

# Extract the number of parameters from models learnt using template 1 (no constraints)

In [None]:
cwd2 = "/scratch/project_2000746/kiciadov/SR_monod/real_data_inference/results"
# NOTE(review): for this path the last character is "s"; `template` is not
# used anywhere in this cell.
template = cwd2[-1]

# Full paths of every CSV file found directly in cwd2.
files = [os.path.join(cwd2, entry) for entry in os.listdir(cwd2) if entry[-4:] == ".csv"]

for f, file in enumerate(files):
    data = pd.read_csv(file, index_col=False)
    # Keep only the file name (text after the last "/").
    file = file[-file[::-1].index("/"):]
    # The strain name is the text before the first "_" of the file name.
    name = file[:file.index("_")]

    # Use Python's power operator so sympy can parse the equations.
    eqns = data["Equation"].apply(lambda raw: raw.replace("^", "**"))

    exprs = [simplify(parse_expr(eqn, symbols_dict)) for eqn in eqns]
    csts_counts = [count_parameters(parsed) for parsed in exprs]  # count constants in each model

    data["csts"] = csts_counts
    data["strn"] = name
    # Feature label: text between the first "_" and "_rho" in the file name.
    data["feat"] = file[file.index("_") + 1:file.index("_rho")]
    data.to_csv("{}/SR_naive_results/{}".format(cwd, file), index=False)

# Post-processing of retrieved expressions

## Template 1

In [None]:
cwd2 = "/scratch/project_2000746/kiciadov/SR_monod/real_data_inference/v1.4.0/results/final_model_1"
template = cwd2[-1]  # last character of the directory name

# Every sub-directory of cwd2 is treated as one result folder.
files = [
    path
    for path in (os.path.join(cwd2, entry) for entry in os.listdir(cwd2))
    if os.path.isdir(path)
]

for file in files:
    # The data-set name is the text after the last "-" of the folder path.
    name = file[-file[::-1].index("-"):]
    eqns = pd.read_csv(file + "/hall_of_fame.csv", index_col=False)

    # Python power operator instead of "^" so sympy can parse the strings.
    eqns["eq"] = [raw.replace("^", "**") for raw in eqns["Equation"]]
    # Expanded form with the product C*F written as S.
    # NOTE(review): the replacement is textual, so "C*F**2" would also be
    # rewritten (to "S**2") - confirm this cannot occur in the data.
    eqns["eq(S)"] = [
        str(expand(simplify(parse_expr(eqn, symbols_dict)))).replace("C*F", "S").replace("F*C", "S")
        for eqn in eqns["eq"]
    ]

    df = dataframe_with_node_counts(eqns["eq"], Sqns=eqns["eq(S)"])
    # Rescale the loss by the variance of the observed "rho" values.
    scaled_loss = eqns["Loss"] * retrieve_loss_norm(name)
    df = pd.concat([scaled_loss, df], axis=1)
    df.to_csv("{}/sreq/template_{}_{}.csv".format(cwd, template, name))

## Template 2

In [None]:
cwd2 = "/scratch/project_2000746/kiciadov/SR_monod/real_data_inference/v1.4.0/results/final_model_2"
template = cwd2[-1]  # last character of the directory name

# Every sub-directory of cwd2 is treated as one result folder.
files = []
for file in os.listdir(cwd2):
    if os.path.isdir(os.path.join(cwd2, file)):
        files.append(os.path.join(cwd2, file))

for file in files:
    # The data-set name is the text after the last "-" of the folder path.
    name = file[-file[::-1].index("-"):]
    eqns = pd.read_csv(file + "/hall_of_fame.csv", index_col=False )

    eqns["Equation"] = eqns["Equation"].apply(lambda x: x.replace("^", "**") )
    # Each equation is a ";"-separated list of "<name> = <expr>" parts; keep
    # the text after "= " of every part as columns q, m and g.
    eqns.loc[:,["q", "m", "g",]] = eqns.loc[:,"Equation"].apply(lambda x: [y[y.index("= ")+2:] for y in x.split(";")] ).to_list()
    eqns["gr"] = eqns["g"].apply(lambda x: x.replace("#1", "F").replace("#2", "C") )
    # NOTE(review): the C*F -> S replacement is textual, so "C*F**2" would
    # also be rewritten (to "S**2") - confirm this cannot occur in the data.
    eqns["gr(S)"] = eqns["gr"].apply(lambda expr: str(expand(simplify(parse_expr(expr, symbols_dict)))).replace("C*F", "S").replace("F*C", "S"), )
    # Sub-expressions are wrapped in parentheses before being combined:
    # without them a top-level sum such as "a + b" would only have its first
    # term divided or multiplied once the strings are concatenated.
    eqns["ad"] = eqns.loc[:,["q", "m"]].apply(lambda x: "({0})/(({0})+U**({1}))".format(x.iloc[0], x.iloc[1]), axis=1 )
    eqns["eq"] = eqns.loc[:,["ad", "gr"]].apply(lambda x: "({})*({})".format(x.iloc[0], x.iloc[1]), axis=1 )
    eqns["eq(S)"] = eqns.loc[:,["ad", "gr(S)"]].apply(lambda x: "({})*({})".format(x.iloc[0], x.iloc[1]), axis=1 )

    df = dataframe_with_node_counts(eqns["eq"], Sqns = eqns["eq(S)"])
    # Rescale the loss by the variance of the observed "rho" values.
    df = pd.concat([eqns["Loss"] * retrieve_loss_norm(name), df], axis=1, )
    df.to_csv("{}/sreq/template_{}_{}.csv".format(cwd, template, name))

## Template 3

In [None]:
cwd2 = "/scratch/project_2000746/kiciadov/SR_monod/real_data_inference/v1.4.0/results/final_model_3"
template = cwd2[-1]  # last character of the directory name

# Every sub-directory of cwd2 is treated as one result folder.
files = []
for file in os.listdir(cwd2):
    if os.path.isdir(os.path.join(cwd2, file)):
        files.append(os.path.join(cwd2, file))

for file in files:
    # The data-set name is the text after the last "-" of the folder path.
    name = file[-file[::-1].index("-"):]
    eqns = pd.read_csv(file + "/hall_of_fame.csv", index_col=False )

    eqns["Equation"] = eqns["Equation"].apply(lambda x: x.replace("^", "**") )
    # Each equation is a ";"-separated list of "<name> = <expr>" parts; keep
    # the text after "= " of every part as columns q, m, g and h.
    eqns.loc[:,["q", "m", "g", "h"]] = eqns.loc[:,"Equation"].apply(lambda x: [y[y.index("= ")+2:] for y in x.split(";")] ).to_list()
    eqns["h"] = eqns["h"].apply(lambda x: x.replace("#1", "F") )
    # In g, the placeholder #1 stands for C * h.
    eqns["gr"] = eqns.loc[:,["g", "h"]].apply(lambda x: x.iloc[0].replace("#1", "(C*({}))".format(x.iloc[1])), axis=1 )
    # NOTE(review): the C*F -> S replacement is textual, so "C*F**2" would
    # also be rewritten (to "S**2") - confirm this cannot occur in the data.
    eqns["h(S)"] = eqns["h"].apply(lambda expr: str(expand(C * simplify(parse_expr(expr, symbols_dict)))).replace("C*F", "S").replace("F*C", "S"), )
    # h(S) may be a sum, so it is parenthesised before replacing #1 in g;
    # otherwise operators around #1 would bind to its first or last term only.
    eqns["gr(S)"] = eqns.loc[:,["g", "h(S)"]].apply(lambda x: x.iloc[0].replace("#1", "({})".format(x.iloc[1])), axis=1 )
    # Same precaution when combining q, m, ad and gr into one expression.
    eqns["ad"] = eqns.loc[:,["q", "m"]].apply(lambda x: "({0})/(({0})+U**({1}))".format(x.iloc[0], x.iloc[1]), axis=1 )
    eqns["eq"] = eqns.loc[:,["ad", "gr"]].apply(lambda x: "({})*({})".format(x.iloc[0], x.iloc[1]), axis=1 )
    eqns["eq(S)"] = eqns.loc[:,["ad", "gr(S)"]].apply(lambda x: "({})*({})".format(x.iloc[0], x.iloc[1]), axis=1 )

    df = dataframe_with_node_counts(eqns["eq"], Sqns = eqns["eq(S)"])
    # Rescale the loss by the variance of the observed "rho" values.
    df = pd.concat([eqns["Loss"] * retrieve_loss_norm(name), df], axis=1, )
    df.to_csv("{}/sreq/template_{}_{}.csv".format(cwd, template, name))

## Collect all templates into a single file

In [None]:
# The per-template tables are written to "{cwd}/sreq/template_<k>_<name>.csv"
# and read back from there in the next cell, so the names must be collected
# from that folder. Listing `cwd` itself picks up unrelated entries, and any
# entry without "_" raised ValueError.
sreq_dir = os.path.join(cwd, "sreq")
files = sorted({
    entry[entry.rindex("_") + 1:]  # text after the last "_", e.g. "<name>.csv"
    for entry in os.listdir(sreq_dir)
    if entry.startswith("template_") and entry.endswith(".csv")
})

In [None]:
for file in files:

    per_template = []
    for template in range(3):
        eqs = pd.read_csv("{}/sreq/template_{}_{}".format(cwd, template + 1, file), index_col=False )

        # For every distinct constant count keep the row with the lowest loss.
        best_rows = []
        for ct in sorted(eqs["ct"].unique()):
            subdf = eqs.loc[eqs["ct"] == ct]
            best_rows.append(subdf.loc[subdf["Loss"].idxmin(), :])

        # Always build a DataFrame (one column per kept row). Previously a
        # single kept row left `df` as a Series, so `.astype(float)` below was
        # called on a scalar, and an empty table reused the stale `df` of the
        # previous iteration.
        df = pd.concat(best_rows, axis=1)
        df.loc["template"] = template  # 0-based template index
        df.loc["logloss"] = np.log10( df.loc["Loss"].astype(float) )
        per_template.append(df)

    # One row per kept equation, all templates stacked.
    eqns = pd.concat(per_template, axis=1).T
    eqns.to_csv("{}/symb/{}".format(cwd, file))

# Pareto front Fig 4

In [None]:
# Marker colour for each template, indexed by the 0-based template number.
colours = [
    "b",
    "g",
    "magenta",
]

# Legend entries, in the same order as `colours`.
labels = [
    r"$\rho_{obs} (C, N_c, t)$",
    r"$\alpha (t) \times \rho (C,N_c)$",
    r"$\alpha (t) \times \rho [ C \times h (N_c) ]$",
]

# Data set shown in the figure: table file name and plot title.
name = "Rs0i2160"
file = "Rs0i2160.csv"

In [None]:
data = pd.read_csv("{}/symb/{}".format(cwd, file), index_col=False)

# Rewrite the stored expressions: F becomes (e**-Nc) and U becomes (e**-t).
data["Kq"] = [str(kq).replace("F", "(e**-Nc)").replace("U", "(e**-t)") for kq in data["Kq"]]
# Colour of each row, looked up from its template index.
data["colour"] = [colours[tpl] for tpl in data["template"]]

# Data extent: constant counts on x (lower bound clipped at 0), log-loss on y.
xmin = max(data["ct"].min(), 0)
xmax = data["ct"].max()
ymin = data["logloss"].min()
ymax = data["logloss"].max()
xrange = abs(xmax - xmin)
yrange = abs(ymax - ymin)

# Axis limits with a 5 % margin on every side.
xmin = xmin - 0.05 * xrange
xmax = xmax + 0.05 * xrange
ymin = ymin - 0.05 * yrange
ymax = ymax + 0.05 * yrange

In [None]:
fig = plt.figure(figsize=(10,5), constrained_layout=True)
ax = fig.add_subplot()

# scatter plot: one series per template. Colour, legend label and z-order are
# looked up from the template index itself, so they stay correct even when a
# template is absent from `data` (previously they followed the enumeration
# position of the unique colours).
for tpl in sorted(data["template"].unique()):
    tpl = int(tpl)
    df = data.loc[data["template"] == tpl]
    ax.scatter(df["ct"], df["logloss"], c=colours[tpl], edgecolor=df["colour"], label=labels[tpl], alpha=0.5, zorder=tpl, )

# equations: annotate every point with its expression rendered as LaTeX.
# Per-template text offsets, as fractions of the axis ranges (all zero now).
x_offsets = [0, 0, 0]
y_offsets = [0, 0, 0]
for kq, tpl, ct, logloss in zip(data["Kq"], data["template"], data["ct"], data["logloss"]):
    expr = simplify(parse_expr(kq, symbols_dict))
    ax.annotate(r"${}$".format(latex(expr)),
                xy = (ct, logloss),
                xytext = (ct + x_offsets[tpl % 3] * xrange,
                          logloss + y_offsets[tpl % 3] * yrange,
                         ),
               )

# layout
ax.legend(loc="best", fancybox=True, title="symbolic regression template")
ax.set(xlim=(xmin, xmax), xticks=np.arange(data["ct"].max())+1, ylim=(ymin, ymax), xlabel="number of model parameters", ylabel="logarithm of mean absolute error", title=name)
### SAVING
path = "{}/symb/Pareto_{}.svg".format(cwd, name)
plt.savefig(path, facecolor='w', edgecolor='w', transparent=False, bbox_inches="tight")
plt.show()