#### Analytical Function Estimator: Reinforcement Learning


In [73]:
from packages.func import func
import numpy as np
import random
from math import cos
import matplotlib.pyplot as plt
import sympy as sp

In [74]:

def get_eval_points(num_samples=100, start=-15, end=15):
    """Return the evenly spaced grid on which functions are sampled and scored.

    Args:
        num_samples: Number of grid points to generate.
        start: First grid point (inclusive).
        end: Upper bound of the grid (exclusive).

    Returns:
        A 1-D float array with exactly ``num_samples`` points spaced
        ``(end - start) / num_samples`` apart, beginning at ``start``.
    """
    # linspace pins the number of points; arange with a non-integer step can
    # gain or lose an element to floating-point rounding.
    return np.linspace(start, end, num_samples, endpoint=False)

def generate_sample(f: func) -> np.ndarray:
    """Evaluate ``f`` on the shared evaluation grid and add Gaussian noise.

    Args:
        f: Object whose ``eval`` method is called with the array returned by
            ``get_eval_points()``.
            NOTE(review): presumably returns one value per grid point - confirm
            against ``packages.func``.

    Returns:
        A single array holding ``f.eval(grid) + noise`` (not a tuple).
    """
    eval_points = get_eval_points()

    # Modeling noise: zero-mean Gaussian whose standard deviation is drawn
    # once per sample, uniformly from [0.1, 0.3).
    mean = 0
    std = np.random.uniform(0.1, 0.3)
    noise = np.random.normal(mean, std, size=len(eval_points))

    return np.asarray(f.eval(eval_points)) + noise

In [75]:
# State Representation
def initial_state():
    """Build ``0*x`` on the symbol ``x``; used as the empty starting expression."""
    symbol = sp.symbols('x')
    empty_expression = 0 * symbol  # Empty tree to start with
    return empty_expression

"""
Action Space: all operations on the expression tree that the agent can
choose from. Make sure each of them is implemented in the class.
"""

# Environment Simulation
def step(x, f, action):
    """Apply one edit action to the current expression and return the result.

    Args:
        x: The variable that ``add_x`` / ``del_x`` add to or subtract from ``f``.
        f: The current expression (the state).
        action: One of ``"add_const"``, ``"add_x"``, ``"del_x"``, ``"del_const"``.

    Returns:
        The new expression: ``f + 1``, ``f + x``, ``f - x`` or ``f - 1``.

    Raises:
        ValueError: If ``action`` is not one of the four supported names.
    """
    if action == "add_const":
        return f + 1
    if action == "add_x":
        return f + x
    if action == "del_x":
        return f - x
    if action == "del_const":
        return f - 1
    # Fail loudly instead of hitting an unbound local on a mistyped action.
    raise ValueError(f"Unknown action: {action!r}")

In [76]:
# TODO: include a Fourier-analysis term in the reward

def reward(x, f, data: np.ndarray, eval_points=None):
    """Score a candidate expression by its distance to the observed data.

    The expression is evaluated at every grid point, the Euclidean norm of the
    residuals against ``data`` is printed, and a reward of ``-0.1 * error`` is
    returned (closer to zero is better).

    Args:
        x: Symbol to substitute in ``f``.
        f: Candidate expression; ``f.subs(x, value)`` must yield something
            convertible with ``float``.
        data: Observed values, one per evaluation point.
        eval_points: Optional grid to evaluate on. Defaults to
            ``get_eval_points()``.

    Returns:
        ``-0.1`` times the root of the summed squared residuals.
    """
    if eval_points is None:
        eval_points = get_eval_points()

    # Convert the symbolic results to floats once so the rest is plain numpy.
    predictions = np.array([float(f.subs(x, value)) for value in eval_points])
    residuals = predictions - np.asarray(data, dtype=float)
    error = np.sqrt(np.sum(np.square(residuals)))
    print("Error ", error)
    #complexity_penalty = state.get_size() # Simple complexity measure
    #print("Compl Penalty ", complexity_penalty)

    return -0.1 * error


In [77]:

# Symbol shared by the cells below; q_learning_episode() passes it to step() and reward().
x = sp.symbols('x')
# Example symbolic expression. Note that the Q-learning cell rebinds `f` to a func instance.
f = 3*x + x

# Names of the edit actions the agent chooses from; each one is handled in step().
actions = ["add_const", "add_x", "del_x", "del_const"]

In [78]:

# Q-Learning
Q = {}  # Q-table keyed by (state, action); missing entries are read as 0
learning_rate = 0.1  # weight given to the new target in each Q update
discount_factor = 0.5  # weight of the best next-state Q-value in the update target
epsilon = 0.5  # Exploration rate

# Target function and one noisy sample of it; reward() compares candidates against `data`.
f = func("3*x + 1")
data = generate_sample(f)
rewarded_values = []  # reward of every step taken, appended by q_learning_episode()
# NOTE(review): no seeding of `random` / `np.random` is visible in this notebook, so runs
# are presumably not reproducible - consider seeding both before sampling.

def q_learning_episode(max_steps=1000, reward_threshold=-5):
    """Run one epsilon-greedy Q-learning episode over candidate expressions.

    Starting from ``initial_state()``, the agent repeatedly picks an action
    (random with probability ``epsilon``, otherwise the first action with the
    highest Q-value), applies it with ``step``, scores the result with
    ``reward`` and updates the module-level table ``Q``. Every reward is
    appended to the module-level list ``rewarded_values``.

    Args:
        max_steps: Maximum number of actions taken before giving up.
        reward_threshold: The episode succeeds as soon as a reward strictly
            greater than this value is observed.

    Returns:
        ``True`` if the threshold was beaten (the reward history is plotted
        first), ``False`` if ``max_steps`` actions were taken without success.
    """
    state = initial_state()
    for _ in range(max_steps):
        if random.uniform(0, 1) < epsilon:
            action = random.choice(actions)  # Explore
        else:
            q_values = [Q.get((state, a), 0) for a in actions]
            action = actions[q_values.index(max(q_values))]  # Exploit (first max wins ties)

        new_state = step(x, state, action)
        reward_value = reward(x, new_state, data)
        print(f"We chose {action} with rewarded value {reward_value} and {new_state}")
        rewarded_values.append(reward_value)

        # Update Q-value. Done before the success check so that the winning
        # transition is also recorded in the table instead of being dropped.
        old_q = Q.get((state, action), 0)
        max_future_q = max(Q.get((new_state, a), 0) for a in actions)
        target = reward_value + discount_factor * max_future_q
        Q[(state, action)] = (1 - learning_rate) * old_q + learning_rate * target

        if reward_value > reward_threshold:
            print("Good accuracy achieved with fct ", new_state)
            #plt.plot(get_eval_points(), X_est)
            #plt.plot(get_eval_points(), data)
            plt.plot(rewarded_values)
            plt.show()
            return True

        state = new_state
        print("\n\n")

    # Step budget exhausted without reaching the reward threshold.
    return False
        

# Run episodes, stopping at the first one that reports success
for _ in range(1):
    if q_learning_episode():
        break


Error  259.8116478734768
We chose add_const with rewarded value -25.98116478734768 and 1



Error  260.16669279181417
We chose add_const with rewarded value -26.016669279181418 and 2



Error  260.9048173294792
We chose add_const with rewarded value -26.09048173294792 and 3



Error  262.02278407273604
We chose add_const with rewarded value -26.202278407273607 and 4



Error  263.5157586193404
We chose add_const with rewarded value -26.351575861934037 and 5



Error  265.37741182742957
We chose add_const with rewarded value -26.53774118274296 and 6



Error  267.6000492819908
We chose add_const with rewarded value -26.76000492819908 and 7



Error  270.17476203972603
We chose add_const with rewarded value -27.017476203972606 and 8



Error  273.0915921640967
We chose add_const with rewarded value -27.30915921640967 and 9



Error  276.33970648138876
We chose add_const with rewarded value -27.633970648138877 and 10



Error  279.907572326507
We chose add_const with rewarded value -27.99