# Plots of Trade-offs

In this notebook we compare different time discretization methods. First, we collect
a trajectory data from the environment at a fine discretization level (this is also
the discretization level we run the policy at -- right now, anyway). Then we compare:

1. Using uniform discretization at different granularities, e.g. updating with every
    1st, 10th, 100th, ...? interactions.
2. Using the adaptive method with different tolarances.

In order to average out randomness, we'll repeat each setting 3 times for now.

In [1]:
import gymnasium as gym
from adaptive_time.features import Fourier_Features
import numpy as np
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import random
from joblib import Parallel, delayed

import adaptive_time.utils
from adaptive_time import environments
from adaptive_time import mc2
from adaptive_time import samplers

seed = 13

In [2]:
gym.register(
    id="CartPole-OURS-v0",
    entry_point="adaptive_time.environments.cartpole:CartPoleEnv",
    vector_entry_point="adaptive_time.environments.cartpole:CartPoleVectorEnv",
    max_episode_steps=500,
    reward_threshold=475.0,
)

def reset_randomness(seed, env):
    random.seed(seed)
    np.random.seed(seed)
    # env.seed(seed)
    env.action_space.seed(seed)

In [3]:
# Sample usage of the environment.
print(
    "We run the same environment and simple policy twice,\n"
    "with different time discretizations. The policy we use\n"
    "will always go left, so the time discretization does not\n"
    "make a difference to the behaviour, and the total return\n"
    "will be the same.")
print()

policy = lambda obs: 0

env = gym.make('CartPole-OURS-v0')
tau = 0.02
env.stepTime(tau)

reset_randomness(seed, env)
traj, early_term = environments.generate_trajectory(env, seed, policy)
total_return_1 = sum(ts[2] for ts in traj)
print("Total undiscounted return: ", total_return_1)

env = gym.make('CartPole-OURS-v0')
tau = 0.002
env.stepTime(tau)

reset_randomness(seed, env)
traj, early_term = environments.generate_trajectory(env, seed, policy)
total_return_2 = sum(ts[2] for ts in traj)
print("Total undiscounted return: ", total_return_2)

#np.testing.assert_almost_equal(total_return_1, total_return_2, decimal=0)

print()
print(
    "We can expect some difference because we may get an extra\n"
    "timesteps in the more fine-grained discretization, but the\n"
    "difference should be smallish.")

We run the same environment and simple policy twice,
with different time discretizations. The policy we use
will always go left, so the time discretization does not
make a difference to the behaviour, and the total return
will be the same.

Total undiscounted return:  10.589912009424973
Total undiscounted return:  100.17508472458734

We can expect some difference because we may get an extra
timesteps in the more fine-grained discretization, but the
difference should be smallish.


  logger.warn(


**NOTE** you must adjust the discount factor if changing time-scales!

In [4]:
phi = Fourier_Features()
phi.init_fourier_features(4,3)
x_thres = 4.8
theta_thres = 0.418
phi.init_state_normalizers(
    np.array([x_thres,2.0,theta_thres,1]),
    np.array([-x_thres,-2.0,-theta_thres,-1]))
phi.num_parameters

256

In [5]:

def run_experiment(
        seed, env, sampler, epsilon, num_episodes, gamma, tqdm=None, print_trajectory=False):
    """Returns the number of episodes it took to solve the environment."""
    if tqdm is None:
        tqdm_use = lambda x: x
    total_pivots = 0
    total_interactions = 0

    # We record:
    returns_per_episode_q = np.zeros((2, num_episodes))
    average_returns_q = np.zeros((2, num_episodes))  # the cumulative average of the above
    predicted_returns_q = np.zeros((2, num_episodes))

    reset_randomness(seed, env)

    observation, _ = env.reset(seed=seed)
    d = len(phi.get_fourier_feature(observation))
    assert d == phi.num_parameters
    features = np.identity(2 * d)   # An estimate of A = xx^T
    targets = np.zeros(2 * d)  # An estimate of b = xG
    weights = np.zeros(2 * d)   # The weights that approximate A^{-1} b

    x_0 = phi.get_fourier_feature([0,0,0,0])  # the initial state
    x_sa0 = mc2.phi_sa(x_0, 0)
    x_sa1 = mc2.phi_sa(x_0, 1)

    for episode in range(num_episodes):
        def policy(state):
            if random.random() < epsilon:
                return env.action_space.sample()
            # Otherwise calculate the best action.
            x = phi.get_fourier_feature(state)
            qs = np.zeros(2)
            for action in [0, 1]:
                x_sa = mc2.phi_sa(x, action)
                qs[action] = np.inner(x_sa.flatten(), weights)
            # adaptive_time.utils.softmax(qs, 1)
            return adaptive_time.utils.argmax(qs)

        trajectory, early_term = environments.generate_trajectory(env, policy=policy, max_steps=100_000)
        if early_term:
            #print("episode:", episode)
            #print("Did not drop it for a long time, returning!")
            return episode, total_pivots, total_interactions

        total_interactions += len(trajectory)
        print_trajectory = False
        if print_trajectory:
            print("trajectory-len: ", len(trajectory), "; trajectory:")
            for idx, (o, a, r, o_) in enumerate(trajectory):
                # * ignore reward, as it is always the same here.
                # * o_ is the same as the next o.
                print(f"* {idx:4d}: o: {o}\n\t --> action: {a}")

        weights, targets, features, cur_avr_returns, num_pivots = mc2.ols_monte_carlo(
            trajectory, sampler, tqdm_use, phi, weights, targets, features, x_0, gamma)
        total_pivots += num_pivots
        
        # Store the empirical and predicted returns. For any episode, we may
        # or may not have empirical returns for both actions. When we don't have an
        # estimate, `nan` is returned.
        returns_per_episode_q[:, episode] = cur_avr_returns
        average_returns_q[:, episode] = np.nanmean(returns_per_episode_q[:, :episode+1], axis=1)

        predicted_returns_q[0, episode] = np.inner(x_sa0.flatten(), weights)
        predicted_returns_q[1, episode] = np.inner(x_sa1.flatten(), weights)
        #print(
        #    'episode:', episode,
        #    ' empirical returns:' , returns_per_episode_q[:, episode],
        #    ' predicted returns:' , predicted_returns_q[:, episode])
    
    return -1, total_pivots, total_interactions

In [10]:
num_episodes = 100
epsilon = 0.0

num_runs = 45

tau = 0.002
env.stepTime(tau)

# tqdm_use = tqdm
# tqdm_use = lambda x: x

# sampler = samplers.AdaptiveQuadratureSampler2(tolerance=0.1)
# sampler = samplers.AdaptiveQuadratureSampler2(tolerance=0.0)

samplers_tried = dict(
    q_20=samplers.AdaptiveQuadratureSampler2(tolerance=20)
    q0_10=samplers.AdaptiveQuadratureSampler2(tolerance=10),
    q0_5=samplers.AdaptiveQuadratureSampler2(tolerance=5),
    q0_1=samplers.AdaptiveQuadratureSampler2(tolerance=1),
    u5=samplers.UniformSampler2(5),
    u10=samplers.UniformSampler2(10),
    u20=samplers.UniformSampler2(20),
    u40=samplers.UniformSampler2(40),
)

results = {}
for name, sampler in tqdm(samplers_tried.items()):
    print(name, sampler)
    #results[name] = []
    results[name] = Parallel(n_jobs = num_runs)(delayed(run_experiment)(seed+run, env, sampler, epsilon, num_episodes, gamma=0.999, tqdm=None) for run in range(num_runs))

'''
for name, sampler in samplers_tried.items():
    results[name] = []
    for run in range(num_runs):
        print()
        print(f"=============      Running experiment with sampler {name}, run={run}     =============")
        results[name].append(
            run_experiment(seed+run, env, sampler, epsilon, num_episodes, gamma=0.999, tqdm=None))
'''
print()
print("DONE!")

  logger.warn(


  0%|          | 0/8 [00:00<?, ?it/s]

q0_1 <adaptive_time.samplers.AdaptiveQuadratureSampler2 object at 0x7fad1fb535e0>
q0_05 <adaptive_time.samplers.AdaptiveQuadratureSampler2 object at 0x7fad1fb95c00>
q0_005 <adaptive_time.samplers.AdaptiveQuadratureSampler2 object at 0x7fad1fb95690>
q0_0 <adaptive_time.samplers.AdaptiveQuadratureSampler2 object at 0x7fad1fb960b0>
u1 <adaptive_time.samplers.UniformSampler2 object at 0x7fad1fb96080>
u5 <adaptive_time.samplers.UniformSampler2 object at 0x7fad1fb96530>
u10 <adaptive_time.samplers.UniformSampler2 object at 0x7fad1fb96050>
u20 <adaptive_time.samplers.UniformSampler2 object at 0x7fad1fb95ed0>

DONE!


In [12]:

print()
print("Results, a list of num_episodes, and a list of num_pivots for the different seeds:")
for name, sub_results in results.items():
    num_episodes, num_pivots, num_interactions = zip(*sub_results)
    print(f"* {name}")
    if -1 in num_episodes:
        num_eps_stats = f"?? +- ??"
    else:
        mean_num_episodes = np.mean(num_episodes)
        std_err_episodes = np.std(num_episodes) / np.sqrt(len(num_episodes))
        num_eps_stats = f"{np.mean(num_episodes):.2f} +- {std_err_episodes:.2f}"

    std_err_pivots = np.std(num_pivots) / np.sqrt(len(num_pivots))
    std_err_num_interactions = np.std(num_interactions) / np.sqrt(len(num_interactions))
    print(f"    * num_episodes: {num_eps_stats}                full list: {num_episodes}")
    print(f"    * num_pivots:   {np.mean(num_pivots):.2f} +- {std_err_pivots:.2f}           full list: {num_pivots}")
    print(f"    * num_interactions:   {np.mean(num_interactions):.2f} +- {std_err_num_interactions:.2f}"
          f"           full list: {num_interactions}")




Results, a list of num_episodes, and a list of num_pivots for the different seeds:
* q0_1
    * num_episodes: ?? +- ??                full list: (1, -1, 8, -1, 6, -1, 1, 8, 4, -1, 1, 1, 1, 9, 5, -1, 1, -1, -1, 2, 7, 15, -1, -1, 1, 2, -1, -1, 4, 9, -1, 9, -1, 1, -1, -1, 1, 1, 2, 26, -1, 4, -1, 10, 18)
    * num_pivots:   1678.31 +- 1084.09           full list: (16, 1748, 129, 1207, 91, 1321, 16, 127, 54, 1205, 16, 17, 22, 141, 70, 1752, 16, 1208, 2695, 29, 92, 237, 1589, 1204, 20, 73, 1207, 1448, 107, 157, 1409, 175, 1205, 17, 1207, 1204, 25, 16, 33, 319, 1713, 55, 49698, 129, 305)
    * num_interactions:   15544.40 +- 4629.65           full list: (239, 71282, 1629, 12517, 1561, 13876, 227, 1749, 634, 12491, 212, 320, 423, 1732, 942, 25345, 255, 12559, 77138, 417, 1171, 2731, 104161, 12494, 401, 2352, 12517, 71608, 2630, 2416, 16762, 2194, 12482, 260, 12628, 12411, 755, 323, 477, 3480, 28442, 752, 154960, 1428, 4115)
* q0_05
    * num_episodes: ?? +- ??                full list: (4, -1

In [None]:
print(epsilon)

0.1
