# Value Estimation -- quadrature vs uniform

In [31]:
import math
from importlib import reload

import matplotlib.pyplot as plt
import pickle
import numpy as np

from adaptive_time import plot_utils
from adaptive_time import utils
from adaptive_time import run_lib
from adaptive_time.environments import cartpole2
from adaptive_time import value_est

from pprint import pprint

# Reload the project modules imported above and re-bind each name to the
# module object returned by `reload` (presumably so that on-disk edits are
# picked up without restarting the kernel).
# NOTE(review): the order may matter if these modules import one another.
cartpole2 = reload(cartpole2)
value_est = reload(value_est)
plot_utils = reload(plot_utils)
utils = reload(utils)

  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


In [32]:
# Change the working directory via the project helper; as the last expression
# in the cell, its return value (the new directory) is displayed.
# Presumably this is what makes the relative file names used below resolve.
utils.set_directory_in_project()

Changed working directory to /Users/szepi1991/Code/adaptive_time


'/Users/szepi1991/Code/adaptive_time'

## Load a goodish policy, generate data to process

Recall, we want:

1. The policy to stay up for ~10k steps, while we interact for 20k steps.
2. To generate trajectories from 100 different initial states.

We only need to store the fine-grained reward sequence for each of these trajectories; all later processing operates on this stored data.

In [75]:
# True: roll out fresh trajectories in the next cell; False: load saved ones.
GENERATE_NEW_DATA = True
# When generating, also pickle the results to `load_data_from`.
SAVE_TRAJECTORIES = True
# Pickle file used both as the save target and as the load source.
load_data_from = "many_good_trajs.pkl"


In [78]:
# Imports used by this cell. They sit outside the `if` so that the environment
# can be created on both the generate path and the load path.
from adaptive_time import mc2
import adaptive_time.utils
import gymnasium as gym
import random

# A later cell calls `run_lib.reset_randomness(seed, env)`, so `env` has to
# exist even when the trajectories are loaded from disk instead of generated.
env = gym.make('CartPole-OURS-v2', discrete_reward=True)

if GENERATE_NEW_DATA:
    seed = 13
    STEPS_MAX = 20_000    # Maximum number of steps per trajectory.
    STEPS_BREAK = 9_000   # From this step on, the policy starts acting randomly.
    EPSILON_AFTER_BREAK = 0.06  # Exploration rate used once STEPS_BREAK is reached.
    NUM_TRAJS = 100       # Number of start states / trajectories.

    _NUM_ACTIONS = 2

    phi = run_lib.make_features()

    weights_good_policy = np.load("cartpole_weights_20240227-102913_ret92516.44719752521.npy")

    def policy(state, num_step, weights, epsilon):
        """Epsilon-greedy action selection over linear action values.

        Args:
            state: environment state, passed to `phi.get_fourier_feature`.
            num_step: index of the current step within the trajectory. Once it
                reaches `STEPS_BREAK`, `epsilon` is replaced by
                `EPSILON_AFTER_BREAK`, whatever value was passed in.
            weights: weight vector; its inner product with the flattened
                state-action features gives each action's value.
            epsilon: probability of sampling an action from
                `env.action_space` instead of acting greedily.

        Returns:
            The selected action: either a random sample from the action space
            or the result of `adaptive_time.utils.argmax` over the action
            values.
        """
        if num_step >= STEPS_BREAK:
            # Late in the trajectory we want the policy to start failing, so
            # make random actions much more likely.
            epsilon = EPSILON_AFTER_BREAK
        if random.random() < epsilon:
            return env.action_space.sample()

        # Otherwise calculate the best action.
        x = phi.get_fourier_feature(state)
        qs = np.zeros(_NUM_ACTIONS)
        for action in range(_NUM_ACTIONS):
            x_sa = mc2.phi_sa(x, action)
            qs[action] = np.inner(x_sa.flatten(), weights)

        return adaptive_time.utils.argmax(qs)

    run_lib.reset_randomness(seed, env)

    def _random_start_state(num):
        """Returns a (num, 4) array of small random start states.

        Each row is a standard-normal draw scaled column-wise by
        [0.01, 0.01, 0.001, 0.001].
        """
        rand = np.random.standard_normal((num, 4))
        rand *= np.array([[0.01, 0.01, 0.001, 0.001]])
        return rand

    def _behaviour_policy(st, sn):
        """The loaded policy with epsilon 0 (greedy until STEPS_BREAK)."""
        return policy(st, sn, weights_good_policy, 0.0)

    start_states = _random_start_state(NUM_TRAJS)
    print("shape", start_states.shape)
    print("max", np.max(start_states, axis=0))

    total_rewards = []
    reward_sequences = []
    traj_lengths = []
    for idx in range(NUM_TRAJS):
        # One 4-tuple of floats per trajectory.
        start_state = tuple(start_states[idx])
        trajectory, early_term = value_est.generate_trajectory(
                env, start_state=start_state,
                policy=_behaviour_policy,
                termination_prob=0.0, max_steps=STEPS_MAX)

        traj_lengths.append(len(trajectory))
        # Each trajectory element is a 4-tuple whose third entry is the reward.
        rewards = [r for _, _, r, _ in trajectory]
        reward_sequences.append(rewards)
        total_rewards.append(sum(rewards))

    total_rewards = np.array(total_rewards)
    # NOTE(review): this yields a regular 2-D array only if every trajectory
    # has the same length; recent NumPy versions raise on ragged input.
    # Confirm that `generate_trajectory` always returns `STEPS_MAX` steps here.
    reward_sequences = np.array(reward_sequences)
    traj_lengths = np.array(traj_lengths)

    if SAVE_TRAJECTORIES:
        with open(load_data_from, "wb") as f:
            pickle.dump((total_rewards, reward_sequences, traj_lengths), f)
        print("Saved data to", load_data_from)

else:
    # CAUTION: unpickling can execute arbitrary code; only load files that
    # were produced by this notebook.
    with open(load_data_from, "rb") as f:
        data = pickle.load(f)
    total_rewards, reward_sequences, traj_lengths = data

    print("Loaded data from", load_data_from)


shape (100, 4)
max [0.01849613 0.03401106 0.00259208 0.00238675]
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000
Did 20_000 steps! 20000

In [81]:
# Display the total reward of each trajectory (one entry per trajectory).
total_rewards

array([ 9564., 11828.,  9723., 12029.,  9441.,  9283.,  9515., 11781.,
       11517.,  9112.,  9193., 10553.,  9231., 13828.,  9831., 11665.,
        9726.,  9243.,  9342.,  9321.,  9844., 10532.,  9465., 10929.,
       11433., 10190., 10488.,  9660.,  9078., 11085., 10102.,  9302.,
        9712., 10843.,  9751., 10436., 10490., 10579.,  9970., 10050.,
       10328.,  9921.,  9765., 10185.,  9596.,  9825.,  9045., 11476.,
        9417., 10730.,  9381., 11138.,  9417.,  9532.,  9187., 11204.,
        9341.,  9449., 10203., 10600., 11353., 10876., 13146., 10028.,
       11178.,  9332., 10300.,  9229.,  9746.,  9073., 10724.,  9575.,
       10833.,  9437.,  9299., 12054.,  9100.,  9292., 10665.,  9482.,
       10504., 10365., 11089.,  9422.,  9607.,  9611., 12038.,  9754.,
        9168.,  9108.,  9368., 10059.,  9300., 11889., 10222., 10455.,
        9102.,  9229.,  9114.,  9912.])

## Data setup 2: the weights across initial states


In [None]:
# Use a new seed for this section; `reset_randomness` presumably re-seeds the
# random number generators and the environment.
# NOTE(review): `env` is not created in this cell; it must already exist in
# the kernel from an earlier cell.
seed = 43
run_lib.reset_randomness(seed, env)