# First Steps: Basic Symbolic Regression Experiment

This notebook implements the **early foundation steps** laid out in the project's Markdown planning documents:

1. Build an expression tree representation.
2. Evaluate candidate symbolic expressions.
3. Run a basic symbolic regression experiment on synthetic data.
4. Display the selected model as a tree and format numeric values to **3 decimal places**.


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from dataclasses import dataclass
from typing import Optional

# Seed NumPy's legacy global RNG so the np.random draws made later in the
# notebook produce the same numbers on every run.
np.random.seed(42)


In [None]:
def format_value(v: float) -> str:
    """Return ``v`` rendered as fixed-point text with exactly three decimals.

    The argument is first coerced with ``float()``, so anything that
    ``float()`` accepts (ints, NumPy scalars, numeric strings) is allowed.
    """
    as_float = float(v)
    return format(as_float, ".3f")


@dataclass
class ExpressionNode:
    """One node of a symbolic expression tree.

    The meaning of ``value`` depends on ``node_type``:

    * ``'const'``  - a number (converted with ``float()`` when used).
    * ``'var'``    - a column index into the input matrix.
    * ``'unary'``  - a callable applied to the evaluated ``left`` child.
    * ``'binary'`` - a callable applied to the evaluated ``left`` and
      ``right`` children, in that order.

    ``name`` is the operator/function symbol used when rendering unary and
    binary nodes as text.
    """

    node_type: str  # one of 'const', 'var', 'unary', 'binary'
    value: object
    left: Optional['ExpressionNode'] = None
    right: Optional['ExpressionNode'] = None
    name: Optional[str] = None

    def evaluate(self, X: np.ndarray) -> np.ndarray:
        """Evaluate the subtree rooted here on the rows of ``X``.

        ``X`` is indexed as ``X[:, column]``, so it must be two-dimensional.
        Constants are broadcast to one value per row of ``X``.

        Raises:
            ValueError: if ``node_type`` is not one of the four known kinds.
        """
        kind = self.node_type

        if kind == 'const':
            n_rows = X.shape[0]
            return np.full(n_rows, float(self.value))

        if kind == 'var':
            column = int(self.value)
            return X[:, column]

        if kind == 'unary':
            operand = self.left.evaluate(X)
            return self.value(operand)

        if kind == 'binary':
            lhs = self.left.evaluate(X)
            rhs = self.right.evaluate(X)
            return self.value(lhs, rhs)

        raise ValueError(f"Unknown node_type: {self.node_type}")

    def to_infix(self) -> str:
        """Render the subtree as a string.

        Constants go through ``format_value``; variables become ``x_<index>``;
        unary nodes become ``name(child)``; binary nodes become a
        parenthesised ``(left name right)``.

        Raises:
            ValueError: if ``node_type`` is not one of the four known kinds.
        """
        kind = self.node_type

        if kind == 'const':
            return format_value(self.value)

        if kind == 'var':
            index = int(self.value)
            return f"x_{index}"

        if kind == 'unary':
            symbol = self.name
            inner = self.left.to_infix()
            return f"{symbol}({inner})"

        if kind == 'binary':
            lhs_text = self.left.to_infix()
            symbol = self.name
            rhs_text = self.right.to_infix()
            return f"({lhs_text} {symbol} {rhs_text})"

        raise ValueError(f"Unknown node_type: {self.node_type}")


# Small factory helpers for building expression trees. They are plain
# functions rather than lambdas bound to names so that they carry a real
# __name__ in tracebacks and can be documented.
def const(c) -> 'ExpressionNode':
    """Return a constant leaf holding ``float(c)``."""
    return ExpressionNode('const', float(c))


def var(i) -> 'ExpressionNode':
    """Return a variable leaf referring to input column ``int(i)``."""
    return ExpressionNode('var', int(i))


def unary(fn, a, name) -> 'ExpressionNode':
    """Return a unary node applying callable ``fn`` to child ``a``.

    ``name`` is the symbol used when the node is rendered as text.
    """
    return ExpressionNode('unary', fn, left=a, name=name)


def binary(fn, a, b, name) -> 'ExpressionNode':
    """Return a binary node applying callable ``fn`` to children ``a`` and ``b``.

    ``a`` becomes the left child and ``b`` the right child; ``name`` is the
    operator symbol used when the node is rendered as text.
    """
    return ExpressionNode('binary', fn, left=a, right=b, name=name)


def print_tree(node: ExpressionNode, prefix: str = "", is_left: bool = True):
    """Print the expression tree rooted at ``node`` using box-drawing glyphs.

    Each node is printed on its own line as ``prefix + connector + label``.
    Constants are labelled via ``format_value``, variables as ``x_<value>``,
    and operator nodes by their ``name``.

    Args:
        node: Root of the subtree to print.
        prefix: Text already accumulated for the current indentation level.
        is_left: Despite its name, recursive calls pass True for the *last*
            child of a parent (the root uses the default True). It selects
            the "└── " connector and a blank continuation for descendants;
            False selects "┌── " and a "│   " continuation.
    """
    branch = "└── " if is_left else "┌── "

    kind = node.node_type
    if kind == 'const':
        text = format_value(node.value)
    elif kind == 'var':
        text = f"x_{node.value}"
    else:
        text = node.name

    print(prefix + branch + text)

    # Only existing children are drawn; leaves stop the recursion here.
    kids = [kid for kid in (node.left, node.right) if kid is not None]
    if not kids:
        return

    continuation = "    " if is_left else "│   "
    child_prefix = prefix + continuation
    last_position = len(kids) - 1
    for position, kid in enumerate(kids):
        print_tree(kid, prefix=child_prefix, is_left=(position == last_position))


In [None]:
# Synthetic data (single-feature example)
# Draw n_samples inputs uniformly between -2 and 2, shaped (n_samples, 1) so
# that X[:, 0] selects the single feature column.
n_samples = 250
X = np.random.uniform(-2.0, 2.0, size=(n_samples, 1))
# Gaussian noise with mean 0 and standard deviation 0.2, one value per sample.
noise = np.random.normal(0, 0.2, size=n_samples)

# Target: the quadratic 1.5*x^2 - 0.8*x + 0.3 plus the noise drawn above.
y = 1.5 * X[:, 0]**2 - 0.8 * X[:, 0] + 0.3 + noise

# Scatter plot of the noisy targets against the single feature.
plt.figure(figsize=(6, 4))
plt.scatter(X[:, 0], y, s=15, alpha=0.6)
plt.title('Synthetic data for symbolic regression')
plt.xlabel('x_0')
plt.ylabel('y')
plt.grid(alpha=0.2)
plt.show()


In [None]:
# Display names for the candidate features, in the same order as `candidates`.
candidate_names = ['x', 'x^2', 'x^3', 'sin(x)']

# Deliberately tiny search space for the demo: x, x*x, (x*x)*x and sin(x),
# each expressed as a tree over the single input column x_0.
candidates = [
    var(0),
    binary(np.multiply, var(0), var(0), '*'),
    binary(
        np.multiply,
        binary(np.multiply, var(0), var(0), '*'),
        var(0),
        '*',
    ),
    unary(np.sin, var(0), 'sin'),
]


def fit_linear_on_feature(phi: np.ndarray, y: np.ndarray):
    """Least-squares fit of ``y`` as a scaled and shifted copy of ``phi``.

    Solves for ``a`` and ``b`` minimising the squared error of
    ``a * phi + b`` against ``y``.

    Returns:
        A tuple ``(a, b, y_hat, mse)``: the slope, the intercept, the fitted
        values ``a * phi + b``, and the mean squared error as a Python float.
    """
    # Design matrix: the feature in the first column, a column of ones for
    # the intercept in the second.
    design = np.column_stack([phi, np.ones_like(phi)])
    solution = np.linalg.lstsq(design, y, rcond=None)[0]
    a, b = solution

    y_hat = a * phi + b
    residual = y - y_hat
    mse = float(np.mean(residual**2))
    return a, b, y_hat, mse


# For each candidate: evaluate its feature on X, fit y ≈ a*phi + b to it, and
# record (name, expression, a, b, predictions, mse).
results = []
for name, expr in zip(candidate_names, candidates):
    phi = expr.evaluate(X)
    a, b, y_hat, mse = fit_linear_on_feature(phi, y)
    results.append((name, expr, a, b, y_hat, mse))

# Rank by the last tuple element (the MSE), lowest first; the head of the
# sorted list is the selected model.
results_sorted = sorted(results, key=lambda t: t[-1])
best_name, best_expr, best_a, best_b, best_pred, best_mse = results_sorted[0]

# Report every candidate in ranked order, with values formatted to 3 decimals.
print('Top candidates by MSE:')
for name, _, a, b, _, mse in results_sorted:
    print(f"  {name:<6} | a={format_value(a)}, b={format_value(b)}, mse={format_value(mse)}")


In [None]:
print('Best symbolic feature:', best_name)
print('Feature expression:', best_expr.to_infix())

# Wrap the winning feature in the fitted affine map: (best_a * feature) + best_b.
final_model = binary(
    np.add,
    binary(np.multiply, const(best_a), best_expr, '*'),
    const(best_b),
    '+',
)

# Heading and rendered expression are emitted on consecutive lines.
print('\nFinal model (infix, 3-decimal formatting):', final_model.to_infix(), sep='\n')

print('\nFinal model tree:')
print_tree(final_model)


In [None]:
# Indices that sort the samples by x_0, so the fitted curve below is drawn
# with increasing x rather than in the random order the samples were generated.
order = np.argsort(X[:, 0])

# Overlay the best candidate's fitted values (line) on the observed data (points).
plt.figure(figsize=(7, 4))
plt.scatter(X[:, 0], y, s=14, alpha=0.5, label='Observed data')
plt.plot(X[order, 0], best_pred[order], color='crimson', lw=2, label='Best basic SR model')
# The title reports which candidate won and its MSE formatted to 3 decimals.
plt.title(f'Basic symbolic regression result (best={best_name}, MSE={format_value(best_mse)})')
plt.xlabel('x_0')
plt.ylabel('y')
plt.legend()
plt.grid(alpha=0.2)
plt.show()
