In [8]:
import gymnasium as gym
from adaptive_time.features import Fourier_Features
import numpy as np
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import random
import adaptive_time.utils

seed = 13

In [9]:
gym.register(
    id="CartPole-OURS-v0",
    entry_point="adaptive_time.environments.cartpole:CartPoleEnv",
    vector_entry_point="adaptive_time.environments.cartpole:CartPoleVectorEnv",
    max_episode_steps=500,
    reward_threshold=475.0,
)

def reset_randomness(seed, env):
    random.seed(seed)
    np.random.seed(seed)
    # env.seed(seed)
    env.action_space.seed(seed)

In [10]:
env = gym.make('CartPole-OURS-v0')
tau = 0.02
env.stepTime(tau)

def generate_trajectory(env, policy=None):
    observation, _ = env.reset()
    trajectory = []
    terminated = False
    steps = 0
    if policy is None:
        policy = lambda x: env.action_space.sample()
    while not terminated:
        steps += 1
        action = policy(observation)
        observation_, reward, terminated, truncated, info = env.step(action)
        trajectory.append([observation, action, reward, observation_])
        observation = observation_

        if steps % 5000 == 0:
            print('Good trajectory!', steps)


    return trajectory

reset_randomness(seed, env)
trajectory = generate_trajectory(env)

In [11]:
def phi_sa(phi_x, a, prev_phi_sa=None):
    """Form the (state, action) feature, potentially reusing memory.
    
    - phi_x: the state feature
    - a: the action
    - prev_phi_sa: the previous state,action feature, which can be
      reused to avoid memory allocation.

    Returns the feature as a (2, d) array. Use a flat copy.
    """
    if prev_phi_sa is not None:
        prev_phi_sa.fill(0)
        phi_sa = prev_phi_sa
    else:
        phi_sa = np.zeros((2, phi_x.size))
    phi_sa[a] = phi_x
    return phi_sa


def ols_monte_carlo_q(
        env, phi, weights, targets, features, x0, policy=None, print_trajectory=False, gamma = 0.999):
    trajectory = generate_trajectory(env, policy=policy)
    if print_trajectory:
        print("trajectory-len: ", len(trajectory), "; trajectory:")
        for idx, (o, a, r, o_) in enumerate(trajectory):
            # * ignore reward, as it is always the same here.
            # * o_ is the same as the next o.
            print(f"* {idx:4d}: o: {o}\n\t --> action: {a}")
    N = len(trajectory)
    G = 0
    x_sa = np.zeros((2, phi.num_parameters))
    returns_a0 = []  # from x0 (the initial state), action 0
    returns_a1 = []  # from x0 (the initial state), action 1
    for t in tqdm(range(N-1,-1,-1)):
        state, action, reward, _ = trajectory[t]
        G = gamma*G + reward
        x = phi.get_fourier_feature(state)
        # Record empirical returns.
        if np.linalg.norm(x-x0) < 0.00001:
            if action == 0:
                returns_a0.append(G)
                returns_a1.append(-0)
            elif action == 1:
                returns_a1.append(G)
                returns_a0.append(-0)

        x_sa = phi_sa(x, action, x_sa)
        x_sa_flat = x_sa.flatten()

        features += np.outer(x_sa_flat, x_sa_flat)
        targets += G * x_sa_flat
    try:
        weights = np.linalg.solve(features, targets)
    except np.linalg.LinAlgError:
        print("Singular matrix in OLS. Using previous weights.")
    return weights, targets, features, (np.mean(returns_a0), np.mean(returns_a1))

In [12]:
print(len(trajectory))
print(trajectory[-1])

16
[array([ 0.19256485,  1.381852  , -0.20756072, -1.        ], dtype=float32), 0, 0.008930331461485835, array([ 0.22020188,  1.1900171 , -0.22756071, -0.7790094 ], dtype=float32)]


In [13]:
phi = Fourier_Features()
phi.init_fourier_features(4,4)
x_thres = 4.8
theta_thres = 0.418
phi.init_state_normalizers(np.array([x_thres,2.0,theta_thres,1]), np.array([-x_thres,-2.0,-theta_thres,-1]))
phi.num_parameters

625

In [14]:
num_episodes = 500
epsilon = 0.1

tau = 0.002
env.stepTime(tau)

# We record:
returns_per_episode_q = np.zeros((2, num_episodes))
average_returns_q = np.zeros((2, num_episodes))  # the cumulative average of the above
predicted_returns_q = np.zeros((2, num_episodes))

reset_randomness(seed, env)

observation, _ = env.reset()
d = len(phi.get_fourier_feature(observation))
assert d == phi.num_parameters
features = np.identity(2 * d)   # An estimate of A = xx^T
targets = np.zeros(2 * d)  # An estimate of b = xG
weights = np.zeros(2 * d)   # The weights that approximate A^{-1} b

x_0 = phi.get_fourier_feature([0,0,0,0])  # the initial state
x_sa0 = phi_sa(x_0, 0)
x_sa1 = phi_sa(x_0, 1)

for episode in range(num_episodes):
    def policy(state):
        if random.random() < epsilon:
            return env.action_space.sample()
        # Otherwise calculate the best action.
        x = phi.get_fourier_feature(state)
        qs = np.zeros(2)
        for action in [0, 1]:
            x_sa = phi_sa(x, action)
            qs[action] = np.inner(x_sa.flatten(), weights)
        # adaptive_time.utils.softmax(qs, 1)
        return adaptive_time.utils.argmax(qs)

    weights, targets, features, cur_avr_returns = ols_monte_carlo_q(
        env, phi, weights, targets, features, x_0, policy=policy, print_trajectory=False)
    
    # Store the empirical and predicted returns. For any episode, we may
    # or may not have empirical returns for both actions. When we don't have an
    # estimate, `nan` is returned.
    returns_per_episode_q[:, episode] = cur_avr_returns
    average_returns_q[:, episode] = np.nanmean(returns_per_episode_q[:, :episode+1], axis=1)

    predicted_returns_q[0, episode] = np.inner(x_sa0.flatten(), weights)
    predicted_returns_q[1, episode] = np.inner(x_sa1.flatten(), weights)
    print(
        'episode:', episode,
        ' empirical returns:' , returns_per_episode_q[:, episode],
        ' predicted returns:' , predicted_returns_q[:, episode])

episode: 0  empirical returns: [215.00707851   0.        ]  predicted returns: [216.30639074 220.64720704]
episode: 1  empirical returns: [ 0.         60.09221578]  predicted returns: [180.4547449  104.04889367]
episode: 2  empirical returns: [248.76047141   0.        ]  predicted returns: [206.91793325 117.49730633]
episode: 3  empirical returns: [256.08153032   0.        ]  predicted returns: [220.46624006 125.4473577 ]
Good trajectory! 5000
Good trajectory! 10000
Good trajectory! 15000
Good trajectory! 20000
Good trajectory! 25000
Good trajectory! 30000
Good trajectory! 35000
Good trajectory! 40000
Good trajectory! 45000
Good trajectory! 50000
Good trajectory! 55000
Good trajectory! 60000
Good trajectory! 65000
Good trajectory! 70000
Good trajectory! 75000
Good trajectory! 80000
Good trajectory! 85000
Good trajectory! 90000
Good trajectory! 95000
Good trajectory! 100000
Good trajectory! 105000
Good trajectory! 110000
Good trajectory! 115000
Good trajectory! 120000
Good trajectory! 1

In [50]:
12 * 2 * np.pi / 360

0.20943951023931953