# Create the ground truth for policy evaluation

We want a good and a bad policy, and we'll evaluate the mixture of these policies.

At the start of each episode we pick one of the policies with prob `p` (and
the other with prob `1-p`), and stick to it for the whole episode. Rinse and repeat.

We will evaluate different value estimation methods on this mixed policy. We will
see how well they approximate the true value (in the start state) of this mixed policy.
But for this, we need to know the true value of the mixed policy; thankfully this is
easy to calculate in our setting.

Each of these policies is deterministic, and the environment is deterministic. Therefore
we can easily find, by running the policies once, $V^{\pi_g}$, the value of the good policy in the start state, and $V^{\pi_b}$, the value of the bad policy in the start state.

The value of the mixed policy will be $p V^{\pi_g} + (1-p) V^{\pi_b}$.


In [1]:
import matplotlib.pyplot as plt
import pickle
import numpy as np
import random
import gymnasium as gym

import adaptive_time.utils
from adaptive_time import run_lib
from adaptive_time import environments
from adaptive_time import mc2

from pprint import pprint

In [5]:
# Seed shared by every reset_randomness / generate_trajectory call below.
seed = 13

# Register the project's custom environments so that the
# 'CartPole-OURS-v0' id can be resolved by gym.make.
run_lib.register_gym_envs()
env = gym.make('CartPole-OURS-v0')
# NOTE(review): presumably sets the simulation time step to 0.02 -- confirm
# against the environment implementation.
env.stepTime(0.02)

# Reset the random state before anything random is drawn; the exact
# generators that get seeded are decided inside run_lib.reset_randomness.
run_lib.reset_randomness(seed, env)
phi = run_lib.make_features()

# NOTE(review): absolute, machine-specific path -- this cell only runs on a
# machine where this file exists at exactly this location.
weights_good_policy = np.load(
    '/Users/alexayoub/adaptive_time/code/adaptive_time/notebooks/cartpole_weights_20240227-102913_ret92516.44719752521.npy')
# The "bad" policy uses weights drawn uniformly from [0, 1), with one weight
# per entry of the good policy's weights.
weights_bad_policy = np.random.uniform(size = len(weights_good_policy))


# Logs of the actions chosen by each policy; `policy` appends to whichever
# list it is handed as `pi_for_storing`.
pi_good = []
pi_bad = []

def policy(state, weights, pi_for_storing, epsilon=0.02):
    """Pick an epsilon-greedy action for `state` and record it.

    With probability `epsilon` an action is sampled from
    `env.action_space`; otherwise the action is selected by
    `adaptive_time.utils.argmax` over the linear value estimates of the two
    actions (0 and 1).

    Args:
        state: Environment state, passed to `phi.get_fourier_feature`.
        weights: Weight vector; its inner product with the flattened
            state-action features is the value estimate of an action.
        pi_for_storing: List that the chosen action is appended to
            (mutated in place).
        epsilon: Probability of taking a randomly sampled action instead of
            the greedy one. Defaults to 0.02.

    Returns:
        The chosen action.
    """
    if random.random() < epsilon:
        # Exploratory step: sample an action from the environment.
        a = env.action_space.sample()
    else:
        # Greedy step: score both actions with the linear value function.
        x = phi.get_fourier_feature(state)
        qs = np.zeros(2)
        for action in (0, 1):
            x_sa = mc2.phi_sa(x, action)
            qs[action] = np.inner(x_sa.flatten(), weights)
        a = adaptive_time.utils.argmax(qs)
    # Every chosen action is logged, whether exploratory or greedy.
    pi_for_storing.append(a)
    return a

# Single-argument wrappers around `policy`: each one binds its own weight
# vector and its own action log, so it can be called with just a state.
policy_good = lambda s: policy(state=s, weights=weights_good_policy, pi_for_storing=pi_good)
policy_bad = lambda s: policy(state=s, weights=weights_bad_policy, pi_for_storing=pi_bad)

# Randomness is reset with the same seed before each rollout, so both
# policies start from the same random state.
print('running good policy')
run_lib.reset_randomness(seed, env)
traj_good, early_term = environments.generate_trajectory(env, seed, policy_good)

print('running bad policy')
run_lib.reset_randomness(seed, env)
# NOTE: `early_term` from the good-policy rollout is overwritten here; it is
# not read anywhere in this part of the notebook.
traj_bad, early_term = environments.generate_trajectory(env, seed, policy_bad)

# Persist the actions each policy chose during its rollout.
np.save('policy_to_eval_good.npy', pi_good)
np.save('policy_to_eval_bad.npy', pi_bad)

def get_returns(trajectory, x0, gamma = 0.99999):
    """Collect the discounted return from every visit to a reference state.

    The trajectory is walked backwards while the discounted return is
    accumulated. Each time the Fourier features of the visited state lie
    within 1e-4 (Euclidean norm) of `x0`, the return from that step onwards
    is recorded.

    Args:
        trajectory: Sequence of 4-tuples whose first entry is the state and
            whose third entry is the reward.
        x0: Feature vector of the reference state.
        gamma: Discount factor.

    Returns:
        List of discounted returns, one per matching visit, ordered from the
        latest visit in the trajectory to the earliest.
    """
    discounted = 0.0
    matched = []
    # Walk from the final transition back to the first one.
    for state, _, reward, _ in reversed(trajectory):
        discounted = gamma * discounted + reward
        features = phi.get_fourier_feature(state)
        distance = np.linalg.norm(features - x0)
        if distance <= 0.0001:
            matched.append(discounted)
    return matched

# Features of the all-zero state, used as the reference state whose returns
# are collected (presumably the start state of each rollout -- confirm).
x0 = phi.get_fourier_feature([0,0,0,0])
# Discounted returns observed from the reference state under each policy.
returns_good = get_returns(traj_good, x0)
returns_bad = get_returns(traj_bad, x0)

print(returns_good, returns_bad)

running good policy
running bad policy
[13811.823415783701] [26.83463996869155]


trajectory_lengths: 14874 37
