# 2025 Symbolic regression Monod paper
# Running random-forest machine-learning

## Locating data

In [None]:
# Project root directory; the data/, rfml/ and plot/ paths used below are all built from it.
cwd = "/scratch/project_2000746/anthosun/2025SRMO"

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from os import listdir
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Sorted names of the entries in <cwd>/data; each one is later read as a CSV dataset.
files = sorted(listdir(f"{cwd}/data"))

## Functions for the random-forest regression loop

In [None]:
def random_forest_regression(RFregressor, name, feature_set, train_x, train_y, test_x, test_y, ):
    """Fit ``RFregressor`` on one feature set and tabulate importances and scores.

    Parameters
    ----------
    RFregressor
        Regressor exposing ``fit``, ``score``, ``feature_importances_`` and
        ``estimators_`` (the notebook passes a ``RandomForestRegressor``).
        It is fitted by this call.
    name : str
        Dataset identifier. Three pieces are parsed from it: the first
        character, the text between the first "s" and the first "i", and
        the text after the first "i".
    feature_set : list of str
        Column names used as predictors; must contain at least two entries.
    train_x, test_x : pandas.DataFrame
        Frames from which the ``feature_set`` columns are selected.
    train_y, test_y
        Targets matching ``train_x`` and ``test_x`` row for row.

    Returns
    -------
    m
        The object returned by ``RFregressor.fit``.
    local_results : pandas.DataFrame
        Single-column frame. Rows are ``(feature, "imp")`` and
        ``(feature, "std")`` for every feature plus ``("score", "train")``
        and ``("score", "test")``. The column label is the tuple of the three
        parsed name pieces followed by ``str(feature_set)``.

    Raises
    ------
    AssertionError
        If ``feature_set`` has fewer than two entries.
    ValueError
        If ``name`` contains no "s" or no "i".
    """

    assert len(feature_set) > 1, "Function random_forest_regression was not written for single feature feature_set!"
    pos_s, pos_i = name.index("s"), name.index("i")
    local_column_name = (name[0], name[pos_s+1:pos_i], name[pos_i+1:], str(feature_set), )

    # regression: fit the regressor that was passed in, not a notebook-level global
    m = RFregressor.fit(train_x[feature_set], train_y)

    # RFR feature importances: forest-level values and their spread across the individual trees
    local_results = pd.DataFrame([RFregressor.feature_importances_, # feature importances
                                  np.std([tree.feature_importances_ for tree in RFregressor.estimators_], axis=0), # std deviations
                                 ], index=["imp", "std"], columns=feature_set).unstack()

    # RFR performance results, appended as two extra rows
    local_results.loc[("score", "train")] = m.score(train_x[feature_set], train_y)
    local_results.loc[("score", "test")] = m.score(test_x[feature_set], test_y)

    local_results = pd.DataFrame(local_results, columns=[local_column_name])

    return m, local_results

In [None]:
def R2_figure(name, feature_sets, data_obs, data_pred, data_colour,):
    """Save a 2x2 grid of prediction-versus-observation scatter plots.

    One panel is drawn per feature set; panel ``k`` plots ``data_pred[:, k]``
    against ``data_obs``, coloured by ``data_colour``, over a grey identity
    line. The figure is written to ``<cwd>/rfml/plot_RFR_R2/<name>.png`` and
    then closed.

    Note: the axis limits come from the module-level names ``xmin`` and
    ``xmax``, and the output directory from ``cwd``; all three must be set
    before this function is called.

    Raises
    ------
    AssertionError
        If ``feature_sets`` does not hold exactly four entries.
    """

    assert len(feature_sets) == 4

    fig = plt.figure(figsize=(8,8), constrained_layout=True)
    grid = fig.add_gridspec(2, 2)
    fig.suptitle(name)
    fig.supxlabel(r"Test data")
    fig.supylabel(r"Prediction")

    # identity line, extended one unit beyond the visible range on both ends
    diagonal = [xmin-1, xmax+1]

    for panel, feature_set in enumerate(feature_sets):
        row, col = divmod(panel, 2)
        ax = fig.add_subplot(grid[row, col])
        ax.plot(diagonal, diagonal, c="k", alpha=0.3)
        ax.scatter(data_obs, data_pred[:,panel], s=3, c=data_colour) # test dataset
        ax.set_xlim(xmin, xmax)
        ax.set_ylim(xmin, xmax)
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_ylabel(feature_set)

    # write the figure to disk and release it
    savename = "{}/rfml/plot_RFR_R2/{}.png".format(cwd, name)
    plt.savefig(savename, facecolor='w', edgecolor='w', transparent=False, bbox_inches="tight")
    plt.close()

In [None]:
def rho_v_t_figure(name, feature_sets, data_time, data_obs, data_pred, data_colour,):
    """Save a 2x2 grid of rho-versus-time panels, one per feature set.

    In panel ``k`` the observations ``data_obs`` are drawn against
    ``data_time`` as a faint black line, and the predictions
    ``data_pred[:, k]`` as small dots coloured by ``data_colour``. The figure
    is written to ``<cwd>/rfml/plot_RFR_rhovt/<name>.png`` and then closed.

    Note: the output directory is built from the module-level name ``cwd``.

    Raises
    ------
    AssertionError
        If ``feature_sets`` does not hold exactly four entries.
    """
    # [rho v. t]-figure

    assert len(feature_sets) == 4

    fig = plt.figure(figsize=(8,8), constrained_layout=True)
    fig.suptitle(name)
    # The shared axes are time (x) and rho (y); the previous labels
    # ("Test data" / "Prediction") belonged to the R2 scatter plot.
    fig.supxlabel(r"$t$")
    fig.supylabel(r"$\rho$")
    gs = fig.add_gridspec(2, 2)

    for fts, feature_set in enumerate(feature_sets): # partial [rho v. t]-plotting
        ax = fig.add_subplot(gs[fts//2, fts%2])
        ax.plot(data_time, data_obs, alpha=0.1, c="k") # observed test data
        ax.scatter(data_time, data_pred[:,fts], s=3, c=data_colour) # predictions for the test data
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_ylabel(feature_set)

    # saving [rho v. t]-plot
    savename = "{}/rfml/plot_RFR_rhovt/{}.png".format(cwd, name)
    plt.savefig(savename, facecolor='w', edgecolor='w', transparent=False, bbox_inches="tight")
    plt.close()

## Random-forest regression loop

In [None]:
# Hyper-parameters of the regression run.
n_estimators = 100 # number of trees; also sizes the third axis of RFR_trees
max_depth = 9 # max_depth for random-forest regressor trees; also used as suffix of the saved result files

# Every testing_part-th concentration (by rank) is held out as test data.
testing_part = 4 # testing 1/4, training based on 3/4
assert isinstance(testing_part, int)

In [None]:
# Single regressor instance; the regression loop re-fits it for every dataset and feature set.
# NOTE(review): no random_state is passed, so reruns are presumably not exactly reproducible - confirm this is intended.
f = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)

# Predictor column combinations that are compared against each other.
feature_sets = [
    ["N", "C"],
    ["N", "C", "t"],
    ["Nc", "C"],
    ["Nc", "C", "t"],
]

# Row labels of the results table: the two scores, plus an importance ("imp")
# and a spread ("std") entry for every feature occurring in any feature set.
index = {("score", "train"), ("score", "test")}
index |= {
    (feature, val)
    for feature_set in feature_sets
    for feature in feature_set
    for val in ("imp", "std")
}

index = pd.MultiIndex.from_tuples(index, names=["feat", "val"])

In [None]:
# Main loop: for every data file and every feature set, fit the regressor,
# collect importances/scores and per-tree structure, and save diagnostic figures.
# RFR_results gains one column per (file, feature set) pair.
RFR_results = pd.DataFrame(index=index)
RFR_trees = np.zeros((len(files), len(feature_sets), n_estimators, 4)) # (node count, capacity, maxdepth, n leaves)

for fil, file in enumerate(files):
    
    # dataset name = file name up to ".csv" (raises ValueError for entries without ".csv")
    name = file[:file.index(".csv")]
    # progress display: the carriage return makes the next name overwrite this one
    print(name + "     ", end="\r")
    data = pd.read_csv("{}/data/{}".format(cwd, file), sep=",")

    # data splitting into train/test sets
    # "colour" is the rank of each row's concentration among the sorted unique values of column "C"
    concs = sorted(list(data["C"].unique()))
    data["colour"] = data["C"].apply(lambda x: concs.index(x))
    mask_test = data["colour"] % testing_part == 0 # concentration values used for testing the RFR
    data_train, data_test = data[~mask_test], data[mask_test]
    
    # predictions for every row of the file, one column per feature set
    R2_ys = np.zeros((data.shape[0], len(feature_sets)))
    # xmin/xmax are module-level names that R2_figure reads as its axis limits
    xmin, xmax = data["rho"].min(), data["rho"].max()
    
    for fts, feature_set in enumerate(feature_sets):
        # f is re-fitted on each call; m is what the helper returns from fit
        m, local_results = random_forest_regression(f, name, feature_set,
                                                    data_train, data_train["rho"],
                                                    data_test, data_test["rho"],)
        
        # structural traits of every tree in the fitted forest, shape (n_estimators, 4)
        tree_traits = [[tree.tree_.node_count, tree.tree_.capacity, tree.tree_.max_depth, tree.tree_.n_leaves,] for tree in m.estimators_]
        RFR_trees[fil,fts] = np.array(tree_traits)
        
        RFR_results = pd.concat([RFR_results, local_results], axis=1)
        
        R2_ys[:,fts] = m.predict(data[feature_set]) # prediction on the whole dataset
        
    # widen the limits so that all predictions fit; must happen before R2_figure is called
    xmin, xmax = min(xmin, np.min(R2_ys)), max(xmax, np.max(R2_ys))
    
    # both figures show the held-out (test) rows only
    R2_figure(name, feature_sets, data_test["rho"], R2_ys[mask_test], data_test["colour"],)
    rho_v_t_figure(name, feature_sets, data_test["t"], data_test["rho"], R2_ys[mask_test], data_test["colour"],)

# file name carries max_depth as suffix, or no suffix when max_depth is None
np.save("{}/rfml/RFR_trees{}.npy".format(cwd, "" if max_depth is None else max_depth), RFR_trees)

In [None]:
# Sort rows and columns, turn the tuple column labels into a named
# four-level MultiIndex, then write the table next to the tree statistics.
RFR_results = RFR_results.sort_index(axis=0).sort_index(axis=1)
RFR_results.columns = pd.MultiIndex.from_tuples(
    RFR_results.columns, names=["code", "resc", "strn", "feat"]
)
depth_suffix = "" if max_depth is None else max_depth  # no suffix for unlimited depth
RFR_results.to_csv("{}/rfml/RFR_results{}.csv".format(cwd, depth_suffix), sep=",", index=True)

## Visualisation of feature importances Fig. S3, S4

In [None]:
# Selects which saved results file the following cells load and how the figure file is named
# (None -> file names without a depth suffix).
max_depth = None # max_depth for random-forest regressor trees

In [None]:
# Load the saved regression results for the selected max_depth.
RFR_results = pd.read_csv("{}/rfml/RFR_results{}.csv".format(cwd, "" if max_depth is None else max_depth), sep=",", header=[0,1,2,3], index_col=[0,1], )

# Feature sets in the stringified form used as column labels of the results file.
feats = ["['N', 'C']",
         "['N', 'C', 't']",
         "['Nc', 'C']",
         "['Nc', 'C', 't']",
        ]

# One row per results column; entries missing for a feature set count as 0.
df1 = RFR_results.fillna(0).T

# Test score split into the share attributed to N / Nc and the share attributed to C / t.
df = pd.DataFrame()
df["R2NorNc"] = df1[[("N", "imp"), ("Nc", "imp")]].sum(axis=1) * df1[("score", "test")]
df["R2other"] = df1[[("C", "imp"), ("t", "imp")]].sum(axis=1) * df1[("score", "test")]
df["train_score"] = df1[("score", "train")]
df["std"] = df1[[("N", "std"), ("Nc", "std")]].sum(axis=1) * df1[("score", "test")]

df = df.reorder_levels(["feat","resc","code","strn"]).sort_index()

# reordering experimental strains and creating a dictionary of {resc:codes}
# NOTE(review): this path points into 2024SRMO while cwd points into 2025SRMO - confirm this is intended.
freqorder = pd.read_csv("/scratch/project_2000746/anthosun/2024SRMO/raws/Frequencies.csv", sep=",", index_col=0)["index"]
freqorder = freqorder.to_dict()

indices = []
codes = {}
# The loop variables are deliberately not called `index`: that name holds the
# row MultiIndex which the regression loop needs when it is re-run.
for feat_key, resc_key, code_key, strn_key in df.index:
    # strains of code "R" are renumbered through freqorder, all others only become integers
    strn_new = freqorder[int(strn_key)] if code_key == "R" else int(strn_key)
    indices.append((feat_key, resc_key, code_key, strn_new))

    # remember each code once per resc, in order of first appearance
    resc_codes = codes.setdefault(resc_key, [])
    if code_key not in resc_codes:
        resc_codes.append(code_key)

df.index = pd.MultiIndex.from_tuples(indices, names = df.index.names)
df.sort_index(inplace=True)

In [None]:
nstrns = 16  # number of bars drawn per (resc, code) block in the figure below
rescs = sorted(df.index.get_level_values("resc").unique())

In [None]:
cmap = plt.colormaps["viridis"]

# y-axis label of each panel, keyed by the stringified feature set
ylabels = {
    "['N', 'C']": r"{$C$, $N$}",
    "['N', 'C', 't']": r"{$C$, $N$, $t$}",
    "['Nc', 'C']": r"{$C$, $N_c$}",
    "['Nc', 'C', 't']": r"{$C$, $N_c$, $t$}",
}
# stacked bar segments: (column of df, opacity)
segments = (("R2NorNc", 1), ("R2other", 0.5))

# figure: one panel per feature set, stacked vertically
fig = plt.figure(figsize=(10,10), constrained_layout=True)
#fig.suptitle(r"Population feature importance ($N$ or $N_c$) $\times$ test score ($R^2$) for the random-forest regression{}".format("" if max_depth is None else " (max depth: {})".format(max_depth)))
fig.supxlabel(r"data set")
fig.supylabel(r"feature set")

gs = fig.add_gridspec(len(feats), 1)

for fts, feat in enumerate(feats):

    ax = fig.add_subplot(gs[fts,:])

    # horizontal layout: unit-wide bars, half a bar of space between blocks
    width = 1
    gap = width / 2
    xoffset = width / 2
    blocks = []

    for resc in rescs:
        for code in codes[resc]:
            blocks.append("{}s{}".format(code, resc))
            block_df = df.xs((feat, resc, code), level=["feat", "resc", "code"])

            x = xoffset + np.arange(nstrns) * width
            colour = cmap( float(resc) / 6)

            # stacked bars: N/Nc share at the bottom, remaining share on top
            yoffset = np.zeros(len(block_df.index))
            for heights, opacity in segments:
                ax.bar(x, block_df[heights],
                       width=width, bottom=yoffset, color=colour, alpha=opacity, )
                yoffset += block_df[heights]

            # white lines spanning one "std" either side of the top of the N/Nc segment
            centre = block_df["R2NorNc"].to_numpy()
            spread = block_df["std"].to_numpy()
            ax.vlines(x, centre - spread, centre + spread, colors = nstrns*["w"])

            # train score marked as a small dot
            ax.scatter(x, block_df["train_score"], s=1, c=nstrns*[colour])

            xoffset += nstrns * width + gap

    ax.set(xlim = (0, xoffset - gap - width / 2), ylim= (0, 1.05), xticks=[],
           ylabel= ylabels[feat], )

# block names as tick labels, on the last (bottom) panel only
ticks = np.arange(len(blocks)) * (nstrns + gap) + nstrns / 2
ax.set_xticks(ticks, labels= blocks)

### SAVING
name = "{}/plot/RFR_feature_imp{}.png".format(cwd, "" if max_depth is None else max_depth)
plt.savefig(name, facecolor='w', edgecolor='w', transparent=False, bbox_inches="tight")
plt.show()

## Visualisation of complexity (max depth) Fig. 2, S2

In [None]:
import seaborn as sns

In [None]:
# max_depth values whose saved result files are loaded below; None selects the file without a depth suffix.
xrange = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, None]

In [None]:
# x positions for four markers per depth value, placed at -3/2, -1/2, +1/2
# and +3/2 steps around the integer position of that depth.
step = 0.2
X = [
    position + multiple * step / 2
    for position in range(len(xrange))
    for multiple in (-3, -1, 1, 3)
]

In [None]:
# Collect test and train scores of every saved run, side by side.
# Columns end up labelled (depth, feat), with the depth stored as a string.
RFR_test = pd.DataFrame()
RFR_train = pd.DataFrame()

for max_depth in xrange:
    depth_label = str(max_depth)
    RFR_results = pd.read_csv("{}/rfml/RFR_results{}.csv".format(cwd, "" if max_depth is None else max_depth), sep=",", header=[0,1,2,3], index_col=[0,1])

    # test scores: rows (code, resc, strn), one column per feature set
    test_scores = RFR_results.xs(("score", "test")).unstack()
    test_scores = pd.concat({depth_label: test_scores}, names=["depth"], axis=1)
    RFR_test = pd.concat([RFR_test, test_scores], axis=1)

    # train scores, same layout
    train_scores = RFR_results.xs(("score", "train")).unstack()
    train_scores = pd.concat({depth_label: train_scores}, names=["depth"], axis=1)
    RFR_train = pd.concat([RFR_train, train_scores], axis=1)

In [None]:
# Sort rank of every column label: feature sets in their canonical order,
# depth strings by numeric value, with "None" (unlimited depth) last.
column_rank = {
    "['N', 'C']": 1,
    "['N', 'C', 't']": 2,
    "['Nc', 'C']": 3,
    "['Nc', 'C', 't']": 4,
    "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9,
    "10": 10, "20": 20, "30": 30, "40": 40, "50": 50, "None": 9999,
}


def _rank_labels(labels):
    """Map the labels of one column level to their sort rank."""
    return labels.map(column_rank)


# Only RFR_test has its column levels swapped; RFR_train keeps its level order.
RFR_test = RFR_test.swaplevel(axis=1)

RFR_test.sort_index(axis=1, inplace=True, key=_rank_labels)
RFR_train.sort_index(axis=1, inplace=True, key=_rank_labels)

In [None]:
# Selection of the data shown and texts of the figure.
code = "R"
resc = 0
title = "Random Forest regression on experimental data"

abcisses = "model complexity: maximal tree depth"
ordonnees = "performance"
couleurs = "feature set"

# legend text for each stringified feature set
feature_labels = {
    "['N', 'C']": r"{$C$, $N$}",
    "['N', 'C', 't']": r"{$C$, $N$, $t$}",
    "['Nc', 'C']": r"{$C$, $N_c$}",
    "['Nc', 'C', 't']": r"{$C$, $N_c$, $t$}",
}

# long-format table of test scores for the chosen (code, resc)
df = RFR_test.xs((code, str(resc)), level=["code", "resc"]).melt()
df = df.rename(columns={"depth": abcisses, "feat": couleurs, "value": ordonnees}) # variable-names
df[couleurs] = df[couleurs].apply(lambda key: feature_labels[key])

fig = plt.figure(figsize=(8,5), constrained_layout=True)
gs = fig.add_gridspec(1, 1)

ax = fig.add_subplot(gs[:,:],)
ax = sns.boxplot(data=df, y=ordonnees, x=abcisses, hue=couleurs, showfliers=False, ax=ax, )

# Median train score per column, drawn as stars at the positions in X.
# NOTE(review): the train scores are filtered by `code` only, whereas the test
# scores above are filtered by `code` and `resc` - confirm this is intended.
train_medians = RFR_train.xs(code, level="code").median()
ax.scatter(X, train_medians, marker="*", c="k", zorder=4,)

ax.set_ylim(0,1)
ax.set_title(title)

### SAVING
name = "{}/plot/RFR_max_depths_{}s{}.png".format(cwd, code, resc)
plt.savefig(name, facecolor='w', edgecolor='w', transparent=False, bbox_inches="tight")
plt.show()