In [1]:
import gymnasium as gym
from adaptive_time.features import Fourier_Features
import numpy as np
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import random
import adaptive_time.utils

seed = 13

In [2]:
gym.register(
    id="CartPole-OURS-v0",
    entry_point="adaptive_time.environments.cartpole:CartPoleEnv",
    vector_entry_point="adaptive_time.environments.cartpole:CartPoleVectorEnv",
    max_episode_steps=500,
    reward_threshold=475.0,
)

def reset_randomness(seed, env):
    random.seed(seed)
    np.random.seed(seed)
    # env.seed(seed)
    env.action_space.seed(seed)

In [3]:
env = gym.make('CartPole-OURS-v0')
tau = 0.02
env.stepTime(tau)

def generate_trajectory(env, policy=None):
    observation, _ = env.reset()
    trajectory = []
    terminated = False
    steps = 0
    if policy is None:
        policy = lambda x: env.action_space.sample()
    while not terminated:
        steps += 1
        action = policy(observation)
        observation_, reward, terminated, truncated, info = env.step(action)
        trajectory.append([observation, action, reward, observation_])
        observation = observation_

        if steps % 5000 == 0:
            print('Good trajectory!', steps)


    return trajectory

reset_randomness(seed, env)
trajectory = generate_trajectory(env)

In [12]:
def integrate(xs, tol, idxes):
    N = len(xs[0])
    if N > 2:
        Q = N * (xs[0,0] + xs[0,-1]) / 2
        idxes[int(xs[1,0])] = 1
        idxes[int(xs[1,-1])] = 1
    else:
        idxes[int(xs[1,0])] = 1
        idxes[int(xs[1,-1])] = 1
        return sum(xs[0])
    truth = sum(xs[0])
    if np.abs(Q - truth) > tol:
        c = int(np.floor(N / 2))
        Q = integrate(xs[:,:c], tol / 2, idxes) + integrate(xs[:,c:], tol / 2, idxes)
    return Q
    


def get_return(pivots, rewards):
    sums = 0
    for i in range(0,len(pivots)-1):
        idx1 = pivots[i]
        idx2 = pivots[i+1]
        sums += len(rewards[idx1:idx2+1]) * (rewards[idx1])
    return sums
    


def phi_sa(phi_x, a, prev_phi_sa=None):
    """Form the (state, action) feature, potentially reusing memory.
    
    - phi_x: the state feature
    - a: the action
    - prev_phi_sa: the previous state,action feature, which can be
      reused to avoid memory allocation.

    Returns the feature as a (2, d) array. Use a flat copy.
    """
    if prev_phi_sa is not None:
        prev_phi_sa.fill(0)
        phi_sa = prev_phi_sa
    else:
        phi_sa = np.zeros((2, phi_x.size))
    phi_sa[a] = phi_x
    return phi_sa


def ols_monte_carlo_quad(
        env, phi, weights, targets, features, x0, policy=None, print_trajectory=False, gamma = 0.999):
    trajectory = generate_trajectory(env, policy=policy)
    if print_trajectory:
        print("trajectory-len: ", len(trajectory), "; trajectory:")
        for idx, (o, a, r, o_) in enumerate(trajectory):
            # * ignore reward, as it is always the same here.
            # * o_ is the same as the next o.
            print(f"* {idx:4d}: o: {o}\n\t --> action: {a}")
    N = len(trajectory)
    rewards = np.zeros((2,N))
    for idx, traj in enumerate(trajectory):
        rewards[0, idx] = traj[2]
        rewards[1, idx] = idx
    idxes = {}
    _ = integrate(rewards, 0.1, idxes)
    pivots = list(idxes.keys())
    print(len(idxes), N)
    G = 0
    x_sa = np.zeros((2, phi.num_parameters))
    returns_a0 = []  # from x0 (the initial state), action 0
    returns_a1 = []  # from x0 (the initial state), action 1
    for t in tqdm(range(N-1,-1,-1)):
        if t in pivots:
            state, action, reward, _ = trajectory[t]
            G = gamma * G + reward
            x = phi.get_fourier_feature(state)
            # Record empirical returns.
            if np.linalg.norm(x-x0) < 0.00001:
                if action == 0:
                    returns_a0.append(G)
                    returns_a1.append(-0)
                elif action == 1:
                    returns_a1.append(G)
                    returns_a0.append(-0)

            x_sa = phi_sa(x, action, x_sa)
            x_sa_flat = x_sa.flatten()

            features += np.outer(x_sa_flat, x_sa_flat)
            targets += G * x_sa_flat
        try:
            weights = np.linalg.solve(features, targets)
        except np.linalg.LinAlgError:
            print("Singular matrix in OLS. Using previous weights.")
    else:
        G = gamma * G + reward
    return weights, targets, features, (np.mean(returns_a0), np.mean(returns_a1))


def ols_monte_carlo_q(
        env, phi, weights, targets, features, x0, policy=None, print_trajectory=False, gamma = 0.999):
    trajectory = generate_trajectory(env, policy=policy)
    if print_trajectory:
        print("trajectory-len: ", len(trajectory), "; trajectory:")
        for idx, (o, a, r, o_) in enumerate(trajectory):
            # * ignore reward, as it is always the same here.
            # * o_ is the same as the next o.
            print(f"* {idx:4d}: o: {o}\n\t --> action: {a}")
    N = len(trajectory)
    G = 0
    x_sa = np.zeros((2, phi.num_parameters))
    returns_a0 = []  # from x0 (the initial state), action 0
    returns_a1 = []  # from x0 (the initial state), action 1
    for t in tqdm(range(N-1,-1,-1)):
        state, action, reward, _ = trajectory[t]
        G = gamma*G + reward
        x = phi.get_fourier_feature(state)
        # Record empirical returns.
        if np.linalg.norm(x-x0) < 0.00001:
            if action == 0:
                returns_a0.append(G)
                returns_a1.append(-0)
            elif action == 1:
                returns_a1.append(G)
                returns_a0.append(-0)

        x_sa = phi_sa(x, action, x_sa)
        x_sa_flat = x_sa.flatten()

        features += np.outer(x_sa_flat, x_sa_flat)
        targets += G * x_sa_flat
    try:
        weights = np.linalg.solve(features, targets)
    except np.linalg.LinAlgError:
        print("Singular matrix in OLS. Using previous weights.")
    return weights, targets, features, (np.mean(returns_a0), np.mean(returns_a1))

In [13]:
print(len(trajectory))
print(trajectory[0])

16
[array([0., 0., 0., 0.], dtype=float32), 1, 1.0, array([ 0.        ,  0.19512194,  0.        , -0.29268292], dtype=float32)]


In [14]:
phi = Fourier_Features()
phi.init_fourier_features(4,4)
x_thres = 4.8
theta_thres = 0.418
phi.init_state_normalizers(np.array([x_thres,2.0,theta_thres,1]), np.array([-x_thres,-2.0,-theta_thres,-1]))
phi.num_parameters

625

In [16]:
num_episodes = 500
epsilon = 0.1

tau = 0.002
env.stepTime(tau)

# We record:
returns_per_episode_q = np.zeros((2, num_episodes))
average_returns_q = np.zeros((2, num_episodes))  # the cumulative average of the above
predicted_returns_q = np.zeros((2, num_episodes))

reset_randomness(seed, env)

observation, _ = env.reset()
d = len(phi.get_fourier_feature(observation))
assert d == phi.num_parameters
features = np.identity(2 * d)   # An estimate of A = xx^T
targets = np.zeros(2 * d)  # An estimate of b = xG
weights = np.zeros(2 * d)   # The weights that approximate A^{-1} b

x_0 = phi.get_fourier_feature([0,0,0,0])  # the initial state
x_sa0 = phi_sa(x_0, 0)
x_sa1 = phi_sa(x_0, 1)

for episode in range(num_episodes):
    def policy(state):
        if random.random() < epsilon:
            return env.action_space.sample()
        # Otherwise calculate the best action.
        x = phi.get_fourier_feature(state)
        qs = np.zeros(2)
        for action in [0, 1]:
            x_sa = phi_sa(x, action)
            qs[action] = np.inner(x_sa.flatten(), weights)
        # adaptive_time.utils.softmax(qs, 1)
        return adaptive_time.utils.argmax(qs)

    weights, targets, features, cur_avr_returns = ols_monte_carlo_quad(
        env, phi, weights, targets, features, x_0, policy=policy, print_trajectory=False)
    
    # Store the empirical and predicted returns. For any episode, we may
    # or may not have empirical returns for both actions. When we don't have an
    # estimate, `nan` is returned.
    returns_per_episode_q[:, episode] = cur_avr_returns
    average_returns_q[:, episode] = np.nanmean(returns_per_episode_q[:, :episode+1], axis=1)

    predicted_returns_q[0, episode] = np.inner(x_sa0.flatten(), weights)
    predicted_returns_q[1, episode] = np.inner(x_sa1.flatten(), weights)
    print(
        'episode:', episode,
        ' empirical returns:' , returns_per_episode_q[:, episode],
        ' predicted returns:' , predicted_returns_q[:, episode])

92 400


  0%|          | 0/400 [00:00<?, ?it/s]

episode: 0  empirical returns: [70.91348277  0.        ]  predicted returns: [70.26398947 59.70663005]
40 123


  0%|          | 0/123 [00:00<?, ?it/s]

episode: 1  empirical returns: [27.32949402  0.        ]  predicted returns: [52.61432151 59.70663005]
37 144


  0%|          | 0/144 [00:00<?, ?it/s]

episode: 2  empirical returns: [ 0.         22.88289545]  predicted returns: [52.65025234 25.65344876]
93 345


  0%|          | 0/345 [00:00<?, ?it/s]

episode: 3  empirical returns: [71.62932932  0.        ]  predicted returns: [58.96130128 28.32850007]
363 1048


  0%|          | 0/1048 [00:00<?, ?it/s]

episode: 4  empirical returns: [235.76468763   0.        ]  predicted returns: [103.90599528  43.26457479]
52 178


  0%|          | 0/178 [00:00<?, ?it/s]

episode: 5  empirical returns: [37.39840699  0.        ]  predicted returns: [87.45499411 19.87490025]
48 189


  0%|          | 0/189 [00:00<?, ?it/s]

episode: 6  empirical returns: [33.94225455  0.        ]  predicted returns: [76.96004832  7.52358356]
121 428


  0%|          | 0/428 [00:00<?, ?it/s]

episode: 7  empirical returns: [97.02129755  0.        ]  predicted returns: [ 84.0366012  -27.11737255]
281 1030


  0%|          | 0/1030 [00:00<?, ?it/s]

episode: 8  empirical returns: [208.71095121   0.        ]  predicted returns: [100.31733236  17.60979711]
55 257


  0%|          | 0/257 [00:00<?, ?it/s]

episode: 9  empirical returns: [38.87036898  0.        ]  predicted returns: [94.6025822  14.46917879]
281 956


  0%|          | 0/956 [00:00<?, ?it/s]

episode: 10  empirical returns: [205.04284347   0.        ]  predicted returns: [113.26439191  93.39009755]
129 1026


  0%|          | 0/1026 [00:00<?, ?it/s]

In [87]:
def foo2(xs,tol,level):
    c=int(np.floor(len(xs)/2))
    #print(xs, c)
    f = lambda xs: len(xs)*(xs[0]+xs[-1])/2 if len(xs) else 0
    if abs(f(xs) - (r:=f(xs[:c]) + f(xs[c:]))) < tol: return 1, r, [level+c]
    else: 
        x, a, cs_a = foo2(xs[:c],tol/2, level)
        y, b, cs_b = foo2(xs[c:],tol/2, level+c)
        return x+y+1, a+b, cs_a + cs_b + [level+c]

In [88]:
tau = 0.0002
env.stepTime(tau)
reset_randomness(seed, env)
trajectory = generate_trajectory(env)

Good trajectory! 5000


In [165]:
rewards = []
for traj in trajectory:
    rewards.append(traj[2])

In [223]:
#xs = [1,2]
rewards = [1,2,3,4,5,6,7]
x,y,z= foo2(rewards,0.01,0)
z.append(0)
z.sort()
if len(rewards)-1 not in z:
    z.append(len(rewards)-1)

In [224]:
sums = 0
for i in range(0,len(z)-1):
    idx1 = z[i]
    idx2 = z[i+1]
    sums += len(rewards[idx1:idx2+1]) * (rewards[idx1])

In [225]:
y - sum(rewards)

0.0

In [226]:
z

[0, 3, 6]

In [227]:
y

28.0

In [228]:
sum(rewards)

28

In [229]:
sums - y

-8.0

In [230]:
sums

20

In [222]:
z

[0, 2, 3]

In [231]:
pivot = [0, 50, 60, 80]

for i in range(100):
    if i in pivot:
        print(i)

0
50
60
80


In [261]:
trajectory = generate_trajectory(env, policy=policy)

Good trajectory! 5000
Good trajectory! 10000
Good trajectory! 15000
Good trajectory! 20000
Good trajectory! 25000


In [279]:
rewards = []
for traj in trajectory:
    rewards.append(traj[2])


def foo2(xs,tol,level):
    c=int(np.floor(len(xs)/2))
    #print(xs, c)
    f = lambda xs: len(xs)*(xs[0]+xs[-1])/2 if len(xs) else 0
    if abs(f(xs) - (r:=f(xs[:c]) + f(xs[c:]))) < tol: return 0, r, []
    else: 
        x, a, cs_a = foo2(xs[:c],tol/2, level)
        y, b, cs_b = foo2(xs[c:],tol/2, level+c)
        return x+y+1, a+b, cs_a + cs_b + [level+c]

In [291]:
x,y,z = foo2(rewards, 0.5, 0)

In [292]:
x

5147

In [293]:
y-sum(rewards)

7.76025212035529

In [294]:
z.sort()

In [295]:
any(z.count(x) > 1 for x in z)

False

In [296]:
if 0 not in z:
    z.append(0)
if len(rewards)-1 not in z:
    z.append(len(rewards)-1)

In [297]:
z.sort()

In [298]:
z

[0,
 1,
 3,
 4,
 6,
 7,
 9,
 10,
 12,
 13,
 15,
 16,
 18,
 19,
 21,
 22,
 24,
 25,
 27,
 28,
 30,
 31,
 33,
 34,
 36,
 37,
 39,
 40,
 42,
 43,
 45,
 47,
 49,
 50,
 52,
 53,
 55,
 56,
 58,
 59,
 61,
 62,
 64,
 65,
 67,
 68,
 70,
 71,
 73,
 74,
 76,
 77,
 79,
 80,
 82,
 83,
 85,
 86,
 88,
 89,
 91,
 92,
 94,
 96,
 98,
 99,
 101,
 102,
 104,
 105,
 107,
 108,
 110,
 111,
 113,
 114,
 116,
 117,
 119,
 120,
 122,
 123,
 125,
 126,
 128,
 129,
 131,
 132,
 134,
 135,
 137,
 138,
 140,
 141,
 143,
 145,
 147,
 148,
 150,
 151,
 153,
 154,
 156,
 157,
 159,
 160,
 162,
 163,
 165,
 166,
 168,
 169,
 171,
 172,
 174,
 175,
 177,
 178,
 180,
 181,
 183,
 184,
 186,
 187,
 189,
 190,
 192,
 194,
 196,
 197,
 199,
 200,
 202,
 203,
 205,
 206,
 208,
 209,
 211,
 212,
 214,
 215,
 217,
 218,
 220,
 221,
 223,
 224,
 226,
 229,
 230,
 232,
 233,
 235,
 236,
 238,
 239,
 241,
 243,
 245,
 246,
 248,
 249,
 251,
 252,
 254,
 255,
 257,
 258,
 260,
 261,
 263,
 264,
 266,
 267,
 269,
 270,
 272,
 273,