# Plots of Trade-offs

In this notebook we compare different time discretization methods. First, we collect
trajectory data from the environment at a fine discretization level (this is also
the discretization level we run the policy at -- right now, anyway). Then we compare:

1. Using uniform discretization at different granularities, e.g. updating with every
    1st, 10th, 100th, ... interaction.
2. Using the adaptive method with different tolerances.

In order to average out randomness, we repeat each setting several times with different seeds (`num_runs`, currently 5).

In [1]:
import gymnasium as gym
from adaptive_time.features import Fourier_Features
import numpy as np
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import random

import adaptive_time.utils
from adaptive_time import environments
from adaptive_time import mc2
from adaptive_time import samplers

# Base random seed for the notebook; the experiment loop offsets it per run (seed + run).
seed = 13

In [2]:
# Register our own CartPole implementation under a custom id so that
# `gym.make("CartPole-OURS-v0")` below constructs it.
# NOTE(review): `max_episode_steps` / `reward_threshold` look copied from the
# stock CartPole-v1 registration -- confirm they are what we want for this env.
gym.register(
    id="CartPole-OURS-v0",
    entry_point="adaptive_time.environments.cartpole:CartPoleEnv",
    vector_entry_point="adaptive_time.environments.cartpole:CartPoleVectorEnv",
    max_episode_steps=500,
    reward_threshold=475.0,
)

def reset_randomness(seed, env):
    """Seed every source of randomness this notebook controls directly.

    Applies `seed` to Python's `random` module, NumPy's global RNG, and the
    action space of `env`. The environment itself is not seeded here.

    Args:
        seed: The seed value handed to each generator.
        env: An environment exposing `action_space.seed(...)`.
    """
    seeders = (random.seed, np.random.seed, env.action_space.seed)
    for apply_seed in seeders:
        apply_seed(seed)

In [3]:
# Demo: roll out the same constant policy at two step sizes and check that the
# undiscounted returns roughly agree.
print(
    "We run the same environment and simple policy twice,\n"
    "with different time discretizations. The policy we use\n"
    "will always go left, so the time discretization does not\n"
    "make a difference to the behaviour, and the total return\n"
    "will be the same.")
print()


def policy(obs):
    """Constant policy: ignores the observation and always picks action 0."""
    return 0


demo_returns = []
for tau in (0.02, 0.002):
    # Fresh environment per step size, seeded identically each time.
    env = gym.make('CartPole-OURS-v0')
    env.stepTime(tau)

    reset_randomness(seed, env)
    traj = environments.generate_trajectory(env, seed, policy)
    # Index 2 of every transition tuple is the reward.
    demo_returns.append(sum(step[2] for step in traj))
    print("Total undiscounted return: ", demo_returns[-1])

total_return_1, total_return_2 = demo_returns

# decimal=0 tolerates a difference of up to 1.5 between the two returns.
np.testing.assert_almost_equal(total_return_1, total_return_2, decimal=0)

print()
print(
    "We can expect some difference because we may get an extra\n"
    "timesteps in the more fine-grained discretization, but the\n"
    "difference should be smallish.")

We run the same environment and simple policy twice,
with different time discretizations. The policy we use
will always go left, so the time discretization does not
make a difference to the behaviour, and the total return
will be the same.

Total undiscounted return:  10.589912009424973
Total undiscounted return:  10.017508472458736

We can expect some difference because we may get an extra
timesteps in the more fine-grained discretization, but the
difference should be smallish.


  logger.warn(


**NOTE** you must adjust the discount factor if changing time-scales!

In [5]:
# Bounds used to normalize the 4-dimensional state before computing features.
x_thres = 4.8
theta_thres = 0.418
state_upper_bounds = np.array([x_thres, 2.0, theta_thres, 1])
state_lower_bounds = np.array([-x_thres, -2.0, -theta_thres, -1])

# Shared feature map used by the experiments below.
# NOTE(review): (4, 4) is presumably (state dims, Fourier order), which would
# match the 625 = 5**4 parameters displayed below -- confirm against Fourier_Features.
phi = Fourier_Features()
phi.init_fourier_features(4, 4)
phi.init_state_normalizers(state_upper_bounds, state_lower_bounds)
phi.num_parameters

625

In [8]:

def run_experiment(
        seed, env, sampler, epsilon, num_episodes, gamma, tqdm=None, print_trajectory=False):
    """Run epsilon-greedy control with `mc2.ols_monte_carlo` value updates.

    Each episode generates one trajectory with the current epsilon-greedy
    policy and then updates the linear action-value weights from it. The run
    stops early as soon as `environments.generate_trajectory` returns `None`
    (i.e. the episode hit the `max_steps` limit).

    Relies on the notebook-level feature map `phi` and on `reset_randomness`.

    Args:
        seed: Seed applied to the RNGs (via `reset_randomness`) and to `env.reset`.
        env: The environment to interact with.
        sampler: Sampler forwarded to `mc2.ols_monte_carlo`.
        epsilon: Probability of taking a uniformly random action.
        num_episodes: Maximum number of episodes to run.
        gamma: Discount factor, forwarded to `mc2.ols_monte_carlo`.
        tqdm: Optional progress-bar wrapper (a callable applied to an iterable)
            forwarded to `mc2.ols_monte_carlo`. `None` means no progress bar.
        print_trajectory: If true, print every observation/action of each trajectory.

    Returns:
        A tuple `(episode, total_pivots)`. `episode` is the index of the
        episode whose trajectory hit the step limit, or -1 if all
        `num_episodes` episodes completed without that happening.
        `total_pivots` is the sum of the pivot counts reported by
        `mc2.ols_monte_carlo` over all processed episodes.
    """
    # Always bind `tqdm_use`; previously it was only defined when `tqdm` was
    # None, so passing a real progress bar raised a NameError.
    tqdm_use = tqdm if tqdm is not None else (lambda x: x)
    total_pivots = 0

    # We record (one row per action, one column per episode):
    returns_per_episode_q = np.zeros((2, num_episodes))
    average_returns_q = np.zeros((2, num_episodes))  # the cumulative average of the above
    predicted_returns_q = np.zeros((2, num_episodes))

    reset_randomness(seed, env)

    observation, _ = env.reset(seed=seed)
    d = len(phi.get_fourier_feature(observation))
    assert d == phi.num_parameters
    features = np.identity(2 * d)   # An estimate of A = xx^T
    targets = np.zeros(2 * d)  # An estimate of b = xG
    weights = np.zeros(2 * d)   # The weights that approximate A^{-1} b

    x_0 = phi.get_fourier_feature([0,0,0,0])  # the initial state
    x_sa0 = mc2.phi_sa(x_0, 0)
    x_sa1 = mc2.phi_sa(x_0, 1)

    def policy(state):
        """Epsilon-greedy w.r.t. the current `weights` (looked up at call time)."""
        if random.random() < epsilon:
            return env.action_space.sample()
        # Otherwise calculate the best action.
        x = phi.get_fourier_feature(state)
        qs = np.zeros(2)
        for action in [0, 1]:
            x_sa = mc2.phi_sa(x, action)
            qs[action] = np.inner(x_sa.flatten(), weights)
        return adaptive_time.utils.argmax(qs)

    for episode in range(num_episodes):
        trajectory = environments.generate_trajectory(env, policy=policy, max_steps=100_000)
        if trajectory is None:
            print("episode:", episode)
            print("Did not drop it for a long time, returning!")
            return episode, total_pivots

        if print_trajectory:
            print("trajectory-len: ", len(trajectory), "; trajectory:")
            for idx, (o, a, r, o_) in enumerate(trajectory):
                # * ignore reward, as it is always the same here.
                # * o_ is the same as the next o.
                print(f"* {idx:4d}: o: {o}\n\t --> action: {a}")

        weights, targets, features, cur_avr_returns, num_pivots = mc2.ols_monte_carlo(
            trajectory, sampler, tqdm_use, phi, weights, targets, features, x_0, gamma)
        total_pivots += num_pivots

        # Store the empirical and predicted returns. For any episode, we may
        # or may not have empirical returns for both actions. When we don't have an
        # estimate, `nan` is returned.
        # NOTE(review): the recorded outputs show `0.` (not `nan`) for the
        # action without an estimate -- confirm what `ols_monte_carlo`
        # returns, since zeros would be averaged in by `nanmean` below.
        returns_per_episode_q[:, episode] = cur_avr_returns
        average_returns_q[:, episode] = np.nanmean(returns_per_episode_q[:, :episode+1], axis=1)

        predicted_returns_q[0, episode] = np.inner(x_sa0.flatten(), weights)
        predicted_returns_q[1, episode] = np.inner(x_sa1.flatten(), weights)
        print(
            'episode:', episode,
            ' empirical returns:' , returns_per_episode_q[:, episode],
            ' predicted returns:' , predicted_returns_q[:, episode])

    return -1, total_pivots

In [23]:
# Experiment settings.
num_episodes = 100
epsilon = 0.1
num_runs = 5

# Reuse the environment created in the demo cell above, at the fine step size.
tau = 0.002
env.stepTime(tau)

# Samplers under comparison: adaptive quadrature at several tolerances
# (q*) and uniform samplers with several settings (u*).
samplers_tried = {
    "q0_3": samplers.AdaptiveQuadratureSampler2(tolerance=0.3),
    "q0_1": samplers.AdaptiveQuadratureSampler2(tolerance=0.1),
    "q0_03": samplers.AdaptiveQuadratureSampler2(tolerance=0.03),
    "q0_0": samplers.AdaptiveQuadratureSampler2(tolerance=0.0),
    "u1": samplers.UniformSampler2(1),
    "u5": samplers.UniformSampler2(5),
    "u10": samplers.UniformSampler2(10),
    "u20": samplers.UniformSampler2(20),
}

# results[name] holds one (episode, total_pivots) tuple per run.
results = {}
for name, sampler in samplers_tried.items():
    runs_for_sampler = results.setdefault(name, [])
    for run in range(num_runs):
        print()
        print(f"=============      Running experiment with sampler {name}, run={run}     =============")
        # Each run gets its own seed, offset from the base seed.
        outcome = run_experiment(
            seed+run, env, sampler, epsilon, num_episodes, gamma=0.999, tqdm=None)
        runs_for_sampler.append(outcome)

print()
print("DONE!")

  logger.warn(



Using 29/548 samples.
episode: 0  empirical returns: [39.07399847  0.        ]  predicted returns: [38.64580775 12.42227583]
Did 20_000 steps! 20000
Did 20_000 steps! 40000
Did 20_000 steps! 60000
Did 20_000 steps! 80000
Did 20_000 steps! 100000
Max steps reached! 100001
episode: 1
Did not drop it for a long time, returning!

Using 21/311 samples.
episode: 0  empirical returns: [ 0.         24.42192221]  predicted returns: [12.43981449 24.19385996]
Did 20_000 steps! 20000
Did 20_000 steps! 40000
Did 20_000 steps! 60000
Did 20_000 steps! 80000
Did 20_000 steps! 100000
Max steps reached! 100001
episode: 1
Did not drop it for a long time, returning!

Using 23/345 samples.
episode: 0  empirical returns: [26.27823599  0.        ]  predicted returns: [26.14393409 10.8751014 ]
Using 14/123 samples.
episode: 1  empirical returns: [9.54183256 0.        ]  predicted returns: [17.86718439 11.11295213]
Using 16/129 samples.
episode: 2  empirical returns: [9.80062088 0.        ]  predicted returns

In [30]:
unfinished_episodes_num = 500

# Summarize, per sampler, the stopping episode and pivot count across runs as
# mean +- standard error (population std / sqrt(n)), followed by the raw values.
print()
print("Results, a list of num_episodes, and a list of num_pivots for the different seeds:")
for name, sub_results in results.items():
    # Loop-local names: the notebook-level `num_episodes` setting must not be
    # overwritten by the per-sampler tuple.
    episodes_per_run, pivots_per_run = zip(*sub_results)
    print(f"* {name}")
    if -1 in episodes_per_run:
        # -1 marks a run that used up all its episodes without stopping early,
        # so a mean over the episode counts would be meaningless.
        num_eps_stats = "?? +- ??"
    else:
        std_err_episodes = np.std(episodes_per_run) / np.sqrt(len(episodes_per_run))
        num_eps_stats = f"{np.mean(episodes_per_run):.2f} +- {std_err_episodes:.2f}"

    std_err_pivots = np.std(pivots_per_run) / np.sqrt(len(pivots_per_run))
    print(f"    * num_episodes: {num_eps_stats}                deets: {episodes_per_run}")
    print(f"    * num_pivots:   {np.mean(pivots_per_run):.2f} +- {std_err_pivots:.2f}           deets: {pivots_per_run}")




Results, a list of num_episodes, and a list of num_pivots for the different seeds:
* q0_3
    * num_episodes: 8.60 +- 6.36            deets: (1, 1, 37, 3, 1)
    * num_pivots:   311.80 +- 251.87           deets: (29, 21, 1438, 50, 21)
* q0_1
    * num_episodes: ?? +- ??            deets: (7, -1, -1, 60, 1)
    * num_pivots:   1896.40 +- 665.87           deets: (168, 3305, 2648, 3332, 29)
* q0_03
    * num_episodes: 9.40 +- 3.69            deets: (24, 2, 11, 9, 1)
    * num_pivots:   662.00 +- 308.60           deets: (1922, 78, 817, 450, 43)
* q0_0
    * num_episodes: 7.20 +- 3.87            deets: (24, 2, 7, 2, 1)
    * num_pivots:   4851.80 +- 2567.67           deets: (15160, 645, 7155, 999, 300)
* u1
    * num_episodes: 11.40 +- 4.51            deets: (24, 23, 7, 2, 1)
    * num_pivots:   8300.40 +- 3209.27           deets: (15174, 17872, 7157, 999, 300)
* u5
    * num_episodes: 4.20 +- 1.00            deets: (4, 4, 8, 4, 1)
    * num_pivots:   917.80 +- 373.43           deets: (445