In [1]:
import gymnasium as gym
from adaptive_time.features import Fourier_Features
import numpy as np

In [6]:
# Build the CartPole environment that the rollouts below are generated from.
env = gym.make('CartPole-v1')
# Value handed to the environment's `stepTime` call on the next line.
tau = 0.00002
# NOTE(review): `stepTime` is not part of the standard Gymnasium Env API —
# presumably a locally patched CartPole that sets its simulation time step; confirm.
env.stepTime(tau)
def generate_trajectory(env, policy=None, max_steps=None):
    """Roll out one episode and record every transition.

    Args:
        env: Environment exposing ``reset() -> (observation, info)`` and
            ``step(action) -> (observation, reward, terminated, truncated, info)``.
        policy: Optional callable mapping the current observation to an
            action. When omitted the constant action ``0`` is used, which is
            the original behaviour of this function.
        max_steps: Optional cap on the number of recorded transitions.
            ``None`` (the default) keeps the original behaviour of running
            until the environment reports ``terminated``.

    Returns:
        A list of ``[observation, action, reward, next_observation]`` entries
        in the order they were experienced.
    """
    observation, _ = env.reset()
    trajectory = []
    terminated = False
    while not terminated:
        # Safety valve: without it an environment that never terminates
        # would keep this loop running forever.
        if max_steps is not None and len(trajectory) >= max_steps:
            break
        action = 0 if policy is None else policy(observation)
        # `truncated` is intentionally not a stop condition, matching the
        # original loop; use `max_steps` to bound the rollout instead.
        next_observation, reward, terminated, _truncated, _info = env.step(action)
        trajectory.append([observation, action, reward, next_observation])
        observation = next_observation

    return trajectory

# Collect one rollout from `env`; `generate_trajectory` always picks action 0.
trajectory = generate_trajectory(env)

In [7]:
# Number of transitions recorded in the rollout stored in `trajectory`.
len(trajectory)

12171

In [27]:
# Feature map used below (via `get_fourier_feature`) to turn observations into feature vectors.
phi = Fourier_Features()
# NOTE(review): arguments are presumably (state dimension, Fourier order) — confirm
# against adaptive_time.features.
phi.init_fourier_features(4,4)
# NOTE(review): the two arrays are element-wise negatives of each other, presumably
# per-dimension upper/lower bounds used to normalize observations — confirm the
# argument order and meaning.
phi.init_state_normalizers(np.array([4.8,2.0,0.418,1]), np.array([-4.8,-2.0,-0.418,-1]))

In [28]:


def ols_monte_carlo(env, phi, weights, targets, features, x0, gamma = 0.999):
    """Run one episode and refit linear value weights by least squares.

    A trajectory is generated with ``generate_trajectory(env)`` and walked
    backwards so the discounted return ``G`` of every step can be built
    incrementally. For each step the feature vector ``x`` contributes
    ``outer(x, x)`` to ``features`` and ``G * x`` to ``targets``; the weights
    are then the solution of ``features @ w = targets``.

    Args:
        env: Environment passed through to ``generate_trajectory``.
        phi: Object providing ``get_fourier_feature(state)``.
        weights: Ignored on input (kept for signature compatibility); the
            returned weights come solely from the solve.
        targets: Running sum of ``G * x``; updated IN PLACE and returned.
        features: Running sum of ``outer(x, x)``; updated IN PLACE and returned.
        x0: Reference feature vector; returns of steps whose features are
            within 1e-5 (Euclidean norm) of ``x0`` are averaged and reported.
        gamma: Discount factor.

    Returns:
        ``(weights, targets, features, mean_return)`` where ``mean_return`` is
        the average return over steps matching ``x0``, or NaN if none matched.
    """
    trajectory = generate_trajectory(env)
    G = 0
    returns = []
    # Backward pass: G_t = reward_t + gamma * G_{t+1}.
    for state, _, reward, _ in reversed(trajectory):
        G = gamma*G + reward
        x = phi.get_fourier_feature(state)
        if np.linalg.norm(x-x0) < 0.00001:
            returns.append(G)
        features += np.outer(x,x)
        targets += G * x
    try:
        weights = np.linalg.solve(features, targets)
    except np.linalg.LinAlgError:
        # The accumulated matrix is singular until enough distinct feature
        # vectors have been seen; fall back to the minimum-norm least-squares
        # solution instead of aborting the run.
        weights = np.linalg.lstsq(features, targets, rcond=None)[0]
    # np.mean([]) would emit a RuntimeWarning; report "no match" as NaN explicitly.
    mean_return = np.mean(returns) if returns else float("nan")
    return weights, targets, features, mean_return


def gradient_monte_carlo(env, phi, weights, x0, gamma = 1, alpha = 0.001):
    """Run one episode and update linear value weights by gradient steps.

    A trajectory is generated with ``generate_trajectory(env)`` and walked
    backwards so the discounted return ``G`` of every step can be built
    incrementally. Each step moves the weights along its feature vector ``x``
    by ``alpha * (G - x . weights)``.

    Args:
        env: Environment passed through to ``generate_trajectory``.
        phi: Object providing ``get_fourier_feature(state)``.
        weights: Current weight vector; not modified in place, a new array is
            returned.
        x0: Reference feature vector; returns of steps whose features are
            within 1e-5 (Euclidean norm) of ``x0`` are averaged and reported.
        gamma: Discount factor.
        alpha: Step size.

    Returns:
        ``(weights, mean_return)`` where ``mean_return`` is the average return
        over steps matching ``x0``, or NaN if none matched.
    """
    trajectory = generate_trajectory(env)
    G = 0
    returns = []
    # Backward pass: G_t = reward_t + gamma * G_{t+1}.
    for state, _, reward, _ in reversed(trajectory):
        G = gamma * G + reward
        x = phi.get_fourier_feature(state)
        if np.linalg.norm(x-x0) < 0.00001:
            returns.append(G)
        prediction_error = G - np.inner(x, weights)
        weights = weights + alpha * prediction_error * x
    # np.mean([]) would emit a RuntimeWarning; report "no match" as NaN explicitly.
    mean_return = np.mean(returns) if returns else float("nan")
    return weights, mean_return




# Repeatedly fit the least-squares value estimate and compare its prediction
# at the reference state with the empirical average return from that state.
num_episodes = 10000
Returns = []
# One reset is only needed to discover the feature-vector length.
observation, _ = env.reset()
d = len(phi.get_fourier_feature(observation))
features = np.zeros((d,d))
targets = np.zeros(d)
weights = np.zeros(d)
# Reference point: features of the all-zero state.
x = phi.get_fourier_feature([0,0,0,0])
for episode in range(num_episodes):
    weights, targets, features, returns = ols_monte_carlo(env, phi, weights, targets, features, x)
    # `ols_monte_carlo` reports NaN when no visited state matched `x`.
    # A plain truthiness test would let NaN through (it is truthy) and
    # would drop a genuine return of 0.0, so test for NaN explicitly.
    if not np.isnan(returns):
        Returns.append(returns)
    if Returns:
        print('episode:',episode, ' empirical returns:' , np.mean(Returns), ' predicted returns:' , np.inner(x,weights))

episode: 0  empirical returns: 12.922285286285284  predicted returns: 12.922285286285303
episode: 1  empirical returns: 12.922285286285284  predicted returns: 12.922285286285291
episode: 2  empirical returns: 12.922285286285282  predicted returns: 12.922285286285254
episode: 3  empirical returns: 12.922285286285284  predicted returns: 12.922285286285842
episode: 4  empirical returns: 12.922285286285284  predicted returns: 12.922285286285295
episode: 5  empirical returns: 12.922285286285286  predicted returns: 12.922285286284762
episode: 6  empirical returns: 12.922285286285286  predicted returns: 12.92228528628531
episode: 7  empirical returns: 12.922285286285284  predicted returns: 12.922285286285272
episode: 8  empirical returns: 12.922285286285284  predicted returns: 12.92228528628513
episode: 9  empirical returns: 12.922285286285284  predicted returns: 12.922285286285282
episode: 10  empirical returns: 12.922285286285284  predicted returns: 12.922285286285131
episode: 11  empirical

KeyboardInterrupt: 

In [42]:
def ols_monte_carlo_control(env, phi, weights, targets, features, x0, gamma = 199/200):
    """Run one episode and refit per-action linear weights by least squares.

    A trajectory is generated with ``generate_trajectory(env)`` and walked
    backwards so the discounted return ``G`` of every step can be built
    incrementally. Each step contributes ``outer(x, x)`` to
    ``features[action]`` and ``G * x`` to ``targets[action]``; afterwards one
    linear system is solved per action.

    Args:
        env: Environment passed through to ``generate_trajectory``.
        phi: Object providing ``get_fourier_feature(state)``.
        weights: Ignored on input (kept for signature compatibility).
        targets: Per-action running sums of ``G * x``, indexed by action;
            updated IN PLACE and returned.
        features: Per-action running sums of ``outer(x, x)``, indexed by
            action; updated IN PLACE and returned. Its length determines how
            many actions are solved for.
        x0: Unused; kept so the signature parallels ``ols_monte_carlo``.
        gamma: Discount factor.

    Returns:
        ``(weights, targets, features, G)`` where ``weights`` stacks one
        solution per action and ``G`` is the discounted return accumulated
        back to the first step of the trajectory (0 for an empty trajectory).
    """
    trajectory = generate_trajectory(env)
    G = 0
    # Backward pass: G_t = reward_t + gamma * G_{t+1}.
    for state, action, reward, _ in reversed(trajectory):
        G = gamma*G + reward
        x = phi.get_fourier_feature(state)
        features[action] += np.outer(x,x)
        targets[action] += G * x
    per_action_weights = []
    for action in range(len(features)):
        try:
            solution = np.linalg.solve(features[action], targets[action])
        except np.linalg.LinAlgError:
            # An action that was never (or rarely) taken has a singular
            # matrix; use the minimum-norm least-squares solution, which is
            # all zeros when there is no data for that action.
            solution = np.linalg.lstsq(features[action], targets[action], rcond=None)[0]
        per_action_weights.append(solution)
    weights = np.array(per_action_weights)
    return weights, targets, features, G

In [12]:
# Preview only the first few transitions; a rollout can hold thousands of
# entries and dumping all of them floods the notebook output.
trajectory[:5]

[[array([1., 0., 0.], dtype=float32),
  array([1.683155], dtype=float32),
  -0.002833010554083572,
  array([0.9999203 , 0.01262333, 0.25247324], dtype=float32)],
 [array([0.9999203 , 0.01262333, 0.25247324], dtype=float32),
  array([-1.1154854], dtype=float32),
  -0.007777938334370556,
  array([0.99984944, 0.01735369, 0.09461792], dtype=float32)],
 [array([0.99984944, 0.01735369, 0.09461792], dtype=float32),
  array([-0.27207616], dtype=float32),
  -0.0012704612358089731,
  array([0.99978584, 0.02069417, 0.06682176], dtype=float32)],
 [array([0.99978584, 0.02069417, 0.06682176], dtype=float32),
  array([-1.2429843], dtype=float32),
  -0.002419834533924224,
  array([ 0.99988   ,  0.01548976, -0.10410526], dtype=float32)],
 [array([ 0.99988   ,  0.01548976, -0.10410526], dtype=float32),
  array([0.7708461], dtype=float32),
  -0.001917946138216929,
  array([0.9998614 , 0.01664656, 0.02313899], dtype=float32)],
 [array([0.9998614 , 0.01664656, 0.02313899], dtype=float32),
  array([1.530998