# Value Estimation -- quadrature vs uniform

In [33]:
import math
from importlib import reload

import matplotlib.pyplot as plt
import pickle
import numpy as np

from tqdm.notebook import tqdm

from adaptive_time import plot_utils
from adaptive_time import utils
from adaptive_time import run_lib
from adaptive_time.environments import cartpole2
from adaptive_time import value_est
from adaptive_time.value_est import approx_integrators

from pprint import pprint

approx_integrators = reload(approx_integrators)
run_lib = reload(run_lib)
cartpole2 = reload(cartpole2)
value_est = reload(value_est)
plot_utils = reload(plot_utils)
utils = reload(utils)

  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


In [7]:
utils.set_directory_in_project()

Changed working directory to /Users/szepi1991/Code/adaptive_time


'/Users/szepi1991/Code/adaptive_time'

## Load a goodish policy, generate data to process

Recall, we want:

1. The policy to stay up for ~10k steps, while we interact for 20k steps.
2. To generate trajectories from 100 different initial states.

We just need to store the fine grained rewards for each of these trajectories, all processing will happen on this after. 

In [8]:
GENERATE_NEW_DATA = False
SAVE_TRAJECTORIES = False
load_data_from = "many_good_trajs.pkl"


In [9]:
if GENERATE_NEW_DATA:
    seed = 13
    STEPS_MAX = 20_000
    STEPS_BREAK = 9_000
    NUM_TRAJS = 100

    from adaptive_time import mc2
    import adaptive_time.utils
    import gymnasium as gym
    import random

    env = gym.make('CartPole-OURS-v2', discrete_reward=True)
    _NUM_ACTIONS = 2

    phi = run_lib.make_features()

    weights_good_policy = np.load("cartpole_weights_20240227-102913_ret92516.44719752521.npy")

    # implement epsilon-greedy action sampling. 
    def policy(state, num_step, weights, epsilon):
        """Returns the action to take, and maybe the prob of all actions"""
        if num_step >= STEPS_BREAK:
            # If we are the the failing case, make this much more likely.
            epsilon = 0.06
        if random.random() < epsilon:
        # if random.random() < epsilon:
            action = env.action_space.sample()
            return action

        # Otherwise calculate the best action.
        x = phi.get_fourier_feature(state)
        qs = np.zeros(_NUM_ACTIONS)
        for action in range(_NUM_ACTIONS):
            x_sa = mc2.phi_sa(x, action)
            qs[action] = np.inner(x_sa.flatten(), weights)
        # adaptive_time.utils.softmax(qs, 1)
        
        return adaptive_time.utils.argmax(qs)

    run_lib.reset_randomness(seed, env)

    def _random_start_state(num):
        rand = np.random.standard_normal((num, 4))
        rand *= np.array([[0.01, 0.01, 0.001, 0.001]])
        return rand

    start_states = _random_start_state(NUM_TRAJS)
    print("shape", start_states.shape)
    print("max", np.max(start_states, axis=0))

    total_rewards = []
    reward_sequences = []
    traj_lengths = []
    for idx in range(NUM_TRAJS):
        start_state = tuple(start_states[idx])
        # Tuple[float, float, float, float]
        trajectory, early_term = value_est.generate_trajectory(
                env, start_state=start_state,
                policy=lambda st, sn: policy(st, sn, weights_good_policy, 0.0),
                termination_prob=0.0, max_steps=STEPS_MAX)

        traj_lengths.append(len(trajectory))
        rewards = [r for _, _, r, _ in trajectory]
        reward_sequences.append(rewards)
        total_rewards.append(sum(rewards))

    total_rewards = np.array(total_rewards)
    reward_sequences = np.array(reward_sequences)
    traj_lengths = np.array(traj_lengths)

    if SAVE_TRAJECTORIES:
        with open(load_data_from, "wb") as f:
            pickle.dump((total_rewards, reward_sequences, traj_lengths), f)
        print("Saved data to", load_data_from)

else:
    with open(load_data_from, "rb") as f:
        data = pickle.load(f)
    total_rewards, reward_sequences, traj_lengths = data

    print("Loaded data from", load_data_from)


print("total_rewards for each traj:", total_rewards)
print("all rewards shape:", reward_sequences.shape)
num_trajs = len(total_rewards)

Loaded data from many_good_trajs.pkl
total_rewards for each traj: [ 9564. 11828.  9723. 12029.  9441.  9283.  9515. 11781. 11517.  9112.
  9193. 10553.  9231. 13828.  9831. 11665.  9726.  9243.  9342.  9321.
  9844. 10532.  9465. 10929. 11433. 10190. 10488.  9660.  9078. 11085.
 10102.  9302.  9712. 10843.  9751. 10436. 10490. 10579.  9970. 10050.
 10328.  9921.  9765. 10185.  9596.  9825.  9045. 11476.  9417. 10730.
  9381. 11138.  9417.  9532.  9187. 11204.  9341.  9449. 10203. 10600.
 11353. 10876. 13146. 10028. 11178.  9332. 10300.  9229.  9746.  9073.
 10724.  9575. 10833.  9437.  9299. 12054.  9100.  9292. 10665.  9482.
 10504. 10365. 11089.  9422.  9607.  9611. 12038.  9754.  9168.  9108.
  9368. 10059.  9300. 11889. 10222. 10455.  9102.  9229.  9114.  9912.]
all rewards shape: (100, 20001)


## Data setup 2: the weights across initial states


In [10]:
seed = 43
run_lib.reset_randomness(seed, env=None)

weights = np.random.random((num_trajs,))
weights /= np.sum(weights)

print("weights:", weights)
print("sum:", np.sum(weights))

weights: [2.22313387e-03 1.17686459e-02 2.57743764e-03 4.64877622e-03
 6.32112168e-03 1.66006245e-02 1.28704819e-02 1.04565693e-02
 5.60617607e-04 1.41778005e-02 7.63139430e-03 1.54974997e-02
 4.91603453e-03 1.09915524e-03 1.67457582e-02 4.27081742e-03
 7.82538048e-03 6.10775211e-03 1.48130970e-03 1.62931506e-02
 1.64035677e-02 1.87710668e-02 7.44641866e-03 1.84430305e-02
 8.61313497e-03 1.29407081e-02 1.59410148e-03 1.73341250e-02
 5.75815194e-03 5.06836672e-03 9.91153555e-05 1.04959929e-02
 9.18967777e-03 1.22962862e-02 1.89012814e-02 1.75575730e-02
 1.75863696e-02 1.01492162e-02 2.00989893e-03 3.49570970e-03
 1.84150535e-02 7.95993291e-03 1.67142332e-02 1.29880988e-02
 1.21496699e-02 5.32446532e-03 1.73273368e-02 3.99764415e-03
 7.81409222e-03 1.91982355e-02 1.42160354e-02 8.59966823e-03
 1.08333752e-02 7.94645562e-03 1.40471750e-02 7.71345410e-03
 1.29488334e-02 1.36168184e-02 1.17781783e-02 1.04347784e-02
 3.98200225e-03 3.84828380e-03 1.53756089e-02 5.60993493e-03
 1.26747962e-02

## Learning

### Ground truth value

In [11]:
true_value = total_rewards @ weights
print("true value:", true_value)

true value: 10076.089908727237


### Learn from samples; with diff samplers

In [38]:
samplers_tried = dict(
    q100=approx_integrators.AdaptiveQuadratureIntegrator(tolerance=100),
    q10=approx_integrators.AdaptiveQuadratureIntegrator(tolerance=10),
    q1=approx_integrators.AdaptiveQuadratureIntegrator(tolerance=1),
    u1=approx_integrators.UniformlySpacedIntegrator(1),
    u10=approx_integrators.UniformlySpacedIntegrator(10),
    u100=approx_integrators.UniformlySpacedIntegrator(100),
    u1000=approx_integrators.UniformlySpacedIntegrator(1000),
    u10000=approx_integrators.UniformlySpacedIntegrator(10000),
)


Since everything other than the start state is deterministic, we can just calculate the approximate integrals for each trajectory with each integrator and store these.

In [39]:
approx_integrals = {}
num_pivots = {}

for sampler_name, sampler in tqdm(samplers_tried.items()):
    print("sampler_name:", sampler_name)
    approx_integrals[sampler_name] = []
    num_pivots[sampler_name] = []
    for idx, reward_seq in enumerate(reward_sequences[:1]):
        integral, all_pivots = sampler.integrate(reward_seq)
        approx_integrals[sampler_name].append(integral)
        num_pivots[sampler_name].append(len(all_pivots))


  0%|          | 0/8 [00:00<?, ?it/s]

sampler_name: q100
sampler_name: q10
sampler_name: q1
sampler_name: u1
sampler_name: u10
sampler_name: u100
sampler_name: u1000
sampler_name: u10000


In [40]:
approx_integrals

{'q100': [9559.0],
 'q10': [9559.0],
 'q1': [9559.0],
 'u1': [9564.0],
 'u10': [1911.0],
 'u100': [191.0],
 'u1000': [19.0],
 'u10000': [1.0]}

In [41]:
num_pivots

{'q100': [53],
 'q10': [53],
 'q1': [53],
 'u1': [20001],
 'u10': [4001],
 'u100': [401],
 'u1000': [41],
 'u10000': [5]}

In [31]:
assert False

AssertionError: 

In [15]:
approx_integrals

{'q100': 9838.0,
 'q10': 9838.0,
 'q1': 9838.0,
 'u1': 9912.0,
 'u10': 1977.0,
 'u100': 197.0,
 'u1000': 19.0,
 'u10000': 1.0}

In [16]:
num_pivots

{'q100': 48,
 'q10': 48,
 'q1': 48,
 'u1': 20001,
 'u10': 4001,
 'u100': 401,
 'u1000': 41,
 'u10000': 5}

Now we just keep sampling from the initial states and observe empirical 