# The purpose of this notebook is for experimenting with code snippets

Monte Carlo in MC

In [1]:
from adaptive_time.environment import MountainCar, Corridor
from adaptive_time.features import Fourier_Features
import numpy as np
from adaptive_time.utils import argmax

In [2]:
phi = Fourier_Features()
phi.init_fourier_features(2,2)
phi.init_state_normalizers(np.array([0.6,0.07]), np.array([-1.2,-0.07]))

phi.get_fourier_feature([-0.5,0.0])

array([ 1.00000000e+00,  6.12323400e-17, -1.00000000e+00,  3.42020143e-01,
       -9.39692621e-01, -3.42020143e-01, -7.66044443e-01, -6.42787610e-01,
        7.66044443e-01])

In [3]:
def generate_trajectory_control(env, weights, features, episode, num_actions = 3):
    s = env.reset(episode)
    s_len = len(s)
    rows = int(env.horizon_sec)
    columns = int(s_len + 1 + 1 + s_len)
    trajectory = np.zeros((rows,columns))
    done = False
    t = 0
    while not done:

        #state_feature = phi.get_fourier_feature(s)
        #action_value = np.zeros(num_actions) #For mountain car
        #for action in range(num_actions):
        #    action_value[action] = np.inner(state_feature, weights[action])
        #a = argmax(action_value)
        a = np.random.choice(3)
        r, s_, _, done = env.step(a)
        trajectory[t,0:s_len], trajectory[t, s_len], trajectory[t,s_len + 1], trajectory[t,s_len+2:] = s, a, r, s_
        s = s_
        t = t + 1

    return trajectory



def gradient_monte_carlo(env, phi, weights, num_actions, episode, alpha = 0.01):
    s = env.reset()
    s_len = len(s)
    rewards = []
    trajectory = generate_trajectory(env, weights, episode, num_actions)
    N = len(trajectory)
    G = 0
    for t in range(N):
        state, _, reward, _ = trajectory[t,0:s_len], trajectory[t, s_len], trajectory[t,s_len + 1], trajectory[t,s_len+2:]
        G = np.sum(trajectory[t:,-2])
        x = phi.get_fourier_feature(state)

        weights = weights + alpha * (G - np.inner(x, weights)) * x 

        rewards.append(reward)
    return weights, sum(rewards)


def ols_monte_carlo(env, phi, weights, targets, features, episode):
    s = env.reset()
    s_len = len(s)
    rewards = []
    trajectory = generate_trajectory(env, weights, episode)
    N = len(trajectory)
    G = 0
    for t in range(N):
        state, _, reward, _ = trajectory[t,0:s_len], trajectory[t, s_len], trajectory[t,s_len + 1], trajectory[t,s_len+2:]
        G = np.sum(trajectory[t:,s_len + 1])
        x = phi.get_fourier_feature(state)
        features += np.outer(x,x)
        targets += G * x
        rewards.append(reward)
    weights = np.linalg.solve(features, targets)
    return weights, targets, features, sum(rewards)






num_actions = 3
num_episodes = 10000
env = MountainCar(es=0)
phi = Fourier_Features()
phi.init_fourier_features(2,2)
phi.init_state_normalizers(np.array([0.6,0.07]), np.array([-1.2,-0.07]))
d = len(phi.get_fourier_feature([0.0,0.0]))
Returns = []
features = np.zeros((d,d))
targets = np.zeros(d)
weights = np.zeros(d)
x = phi.get_fourier_feature([-0.5,0.0])
for episode in range(num_episodes):
    weights, targets, features, returns = ols_monte_carlo(env, phi, weights, targets, features, episode)
    Returns.append(returns)
    print(episode, np.mean(Returns), np.inner(x,weights))



    

Mountain Car
0 0.0 0.0
1 0.0 0.0
2 0.0 0.0
3 0.0 0.0
4 0.0 0.0
5 0.0 0.0
6 0.0 0.0
7 0.0 0.0
8 0.0 0.0
9 0.0 0.0
10 0.0 0.0
11 0.0 0.0
12 0.0 0.0
13 0.0 0.0
14 0.0 0.0
15 0.0 0.0
16 0.0 0.0
17 0.0 0.0
18 0.0 0.0
19 0.0 0.0
20 0.0 0.0
21 0.0 0.0
22 0.0 0.0
23 0.0 0.0
24 0.0 0.0
25 0.0 0.0
26 0.0 0.0
27 0.0 0.0
28 0.0 0.0
29 0.0 0.0
30 0.0 0.0
31 0.0 0.0
32 0.0 0.0
33 0.0 0.0
34 0.0 0.0
35 0.0 0.0
36 0.0 0.0
37 0.0 0.0
38 0.0 0.0
39 0.0 0.0
40 0.0 0.0
41 0.0 0.0
42 0.0 0.0
43 0.0 0.0
44 0.0 0.0
45 0.0 0.0
46 0.0 0.0
47 0.0 0.0
48 0.0 0.0
49 0.0 0.0
50 0.0 0.0
51 0.0 0.0
52 0.0 0.0
53 0.0 0.0
54 0.0 0.0
55 0.0 0.0
56 0.0 0.0
57 0.0 0.0
58 0.0 0.0
59 0.0 0.0
60 0.0 0.0
61 0.0 0.0
62 0.0 0.0
63 0.0 0.0
64 0.0 0.0
65 0.0 0.0
66 0.0 0.0
67 0.0 0.0
68 0.0 0.0
69 0.0 0.0
70 0.0 0.0
71 0.0 0.0
72 0.0 0.0
73 0.0 0.0
74 0.0 0.0
75 0.0 0.0
76 0.0 0.0
77 0.0 0.0
78 0.0 0.0
79 0.0 0.0
80 0.0 0.0
81 0.0 0.0
82 0.0 0.0
83 0.0 0.0
84 0.0 0.0
85 0.0 0.0
86 0.0 0.0
87 0.0 0.0
88 0.0 0.0
89 0.0 0.0
90 0.0 

KeyboardInterrupt: 

# Quadrature

In [49]:
def generate_trajectory_control(env, weights, episode, ensemble = 5, num_actions = 3):
    s = env.reset(episode)
    s_len = len(s)
    rows = int(env.horizon_sec)
    columns = int(s_len + 1 + 1 + s_len)
    trajectory = np.zeros((rows,columns))
    done = False
    t = 0
    while not done:
        state_feature = phi.get_fourier_feature(s)
        action_value = np.zeros((ensemble, num_actions))
        for M in range(ensemble):
            for action in range(num_actions):
                action_value[M,action] = np.inner(state_feature, weights[M,action]) 
        a = np.argmax(np.max(action_value, axis=0))
        r, s_, _, done = env.step(a)
        trajectory[t,0:s_len], trajectory[t, s_len], trajectory[t,s_len + 1], trajectory[t,s_len+2:] = s, a, r, s_
        s = s_
        t = t + 1
    return trajectory

def ols_monte_carlo_control(env, phi, weights, targets, features, episode, ensemble = 5, d = 9):
    s = env.reset()
    s_len = len(s)
    rewards = []
    trajectory = generate_trajectory_control(env, weights, episode, ensemble)
    N = len(trajectory)
    G = 0
    for t in range(N):
        state, action, reward, _ = trajectory[t,0:s_len], int(trajectory[t, s_len]), trajectory[t,s_len + 1], trajectory[t,s_len+2:]
        G = np.sum(trajectory[t:,s_len + 1])
        x = phi.get_fourier_feature(state)
        features[action] = features[action] + np.outer(x,x)
        targets[action] += G * x
        rewards.append(reward)
    for M in range(ensemble):
        for action in range(num_actions):
            weights[M,action] = 100*np.random.multivariate_normal(np.linalg.solve(features[action], targets[action]), np.linalg.inv(features[action]))
    return weights, targets, features, sum(rewards)


num_actions = 3
num_episodes = 10000
ensemble = 5
env = MountainCar(es=0)
phi = Fourier_Features()
phi.init_fourier_features(2,2)
phi.init_state_normalizers(np.array([0.6,0.07]), np.array([-1.2,-0.07]))
d = len(phi.get_fourier_feature([0.0,0.0]))
Returns = []
features = np.zeros((num_actions,d,d))
for i in range(num_actions):
    features[i] = np.identity(d)
targets = np.zeros((num_actions,d))
weights = np.random.normal(size=(ensemble, num_actions, d))
x = phi.get_fourier_feature([-0.5,0.0])
for episode in range(num_episodes):
    weights, targets, features, returns = ols_monte_carlo_control(env, phi, weights, targets, features, episode, ensemble, d)
    print(episode, returns)

Mountain Car
0 0.0
1 0.0
2 0.0
3 0.0
4 0.0
5 0.0
6 0.0
7 0.0
8 0.0
9 0.0
10 0.0
11 0.0
12 0.0
13 0.0
14 0.0
15 0.0
16 0.0
17 0.0
18 0.0
19 0.0
20 0.0
21 0.0
22 0.0
23 0.0
24 0.0
25 0.0
26 0.0
27 0.0
28 0.0
29 0.0
30 0.0
31 0.0
32 0.0
33 0.0
34 0.0
35 0.0
36 0.0
37 0.0
38 0.0
39 0.0
40 0.0
41 0.0
42 0.0
43 0.0
44 0.0
45 0.0
46 0.0
47 0.0
48 0.0
49 0.0
50 0.0
51 0.0
52 0.0
53 0.0
54 0.0
55 0.0
56 0.0
57 0.0
58 0.0
59 0.0
60 0.0
61 0.0
62 0.0
63 0.0
64 0.0
65 0.0
66 0.0
67 0.0
68 0.0
69 0.0
70 0.0
71 0.0
72 0.0
73 0.0
74 0.0
75 0.0
76 0.0
77 0.0
78 0.0
79 0.0
80 0.0
81 0.0
82 0.0
83 0.0
84 0.0
85 0.0
86 0.0
87 0.0
88 0.0
89 0.0
90 0.0
91 0.0
92 0.0
93 0.0
94 0.0
95 0.0
96 0.0
97 0.0
98 0.0
99 0.0
100 0.0
101 0.0
102 0.0
103 0.0
104 0.0
105 0.0
106 0.0
107 0.0
108 0.0
109 0.0
110 0.0
111 0.0
112 0.0
113 0.0
114 0.0
115 0.0
116 0.0
117 0.0
118 0.0
119 0.0
120 0.0
121 0.0
122 0.0
123 0.0
124 0.0
125 0.0
126 0.0
127 0.0
128 0.0
129 0.0
130 0.0
131 0.0
132 0.0
133 0.0
134 0.0
135 0.0
136 0.0
1

In [48]:
weights[5,2]

IndexError: index 5 is out of bounds for axis 0 with size 5

In [42]:
traj = generate_trajectory(env, np.zeros(d), episode, num_actions = 3)
traj[0]

array([-0.1423003 ,  0.03741954,  1.        ,  0.        , -0.10715639,
        0.03514391])

In [None]:
def trapeziod_rule(a,b):
    return (b-a) * (function(a) + function(b)) / 2.0

def integral_rule(a,b):
    return trapeziod_rule(a,b)

def function(x):
    return x**5

def adaptive_quadrature(a0, b0, tol0):
    sums = 0.0
    n = 1
    a = np.zeros(100000)
    b = np.zeros(100000)
    tol = np.zeros(100000)
    app = np.zeros(100000)
    iters = 0
    
    a[1] = a0
    b[1] = b0
    tol[1] = tol0
    app[1] = integral_rule(a0,b0)
    
    while n > 0:
        iters += 1
        c = (a[n] + b[n]) / 2
        oldapp = app[n]
        app[n] = integral_rule(a[n], c)
        app[n+1] = integral_rule(c, b[n])
        
        if np.abs(oldapp - (app[n]+app[n+1])) < 3 * tol[n]:
            sums = sums + app[n] + app[n+1] #success
            n = n - 1 #done with interval
            
        else:    #divide into two intervals
            b[n+1] = b[n] #setup new intervals
            b[n] = c  #setup new intervals
            a[n+1] = c #setup new intervals
            tol[n] = tol[n] / 2
            tol[n+1] = tol[n]
            n = n + 1
    return sums,iters
        
        

In [None]:
b = 15
truth = b**5/5

quad,iters = adaptive_quadrature(0, b, 0.00005)

h = iters
x = np.linspace(0, b, num=h)
y = function(x)
trap = np.trapz(y,x)

In [None]:
print(np.abs(truth - quad))
print(np.abs(truth - trap))

print(iters - h)

1746562.5000270708
1746562.5000363498
0


In [16]:
import math
def adaptive_sum(traj, tol0=10**(-2)):
    steps = {}
    N = len(traj)
    app = np.zeros(N+1000)
    tol = np.zeros(N+1000)
    N_begin = np.ones(N+1000,dtype=np.int8) * int(-1)
    N_end = np.ones(N+1000,dtype=np.int8) * int(-1)

    sums = 0.0


    steps[N_begin[0]] = 1
    steps[N_end[0]] = 1

    n = 0
    N_begin[0] = 0
    N_end[0] = N - 1
    tol[0] = tol0
    app[0] = (N_end[0] - N_begin[0] + 1) / (2) * ( traj[N_begin[0]] + traj[N_end[0]] )
    iters = 0
    while n > -1: 
        iters += 1
        if (N_end[n] - N_begin[n] + 1) % 2 == 0:
            N_split = int( (N_end[n] - N_begin[n] + 1) / 2 )

        else:
            N_split = math.ceil((N_end[n] - N_begin[n] + 1) / 2)

        old_app = app[n]
        app[n] = (N_split - N_begin[n]) / (2) * ( traj[N_split - 1] + traj[N_begin[n]] )
        app[n+1] = (N_end[n] - N_split + 1) / (2) * ( traj[N_split] + traj[N_end[n]] )
        
        print(iters, traj[N_begin[n]], traj[N_split - 1], traj[N_split], traj[N_end[n]])

        if abs(old_app - (app[n] + app[n+1])) < 3 * tol[n]:
            sums = sums + app[n] + app[n+1]
            n = n - 1

        else:
            N_end[n+1] = N_end[n]
            N_end[n] = N_split - 1
            N_begin[n+1] = N_split 
            tol[n] = tol[n] / 2
            tol[n+1] = tol[n]
            n = n + 1
            
    return sums


# Quadrature for Discrete Integrals (i.e. sums)

In [53]:
from adaptive_time.samplers import AdaptiveQuadratureSampler

horizon = 20000
dt = 1
num_steps = horizon - 1
tolerance_init = 0.01

sampler = AdaptiveQuadratureSampler(
    dt=dt,
    num_steps=num_steps,
    tolerance_init=tolerance_init,
    update_when_best=False,
)

def foo2(xs,tol,level):
    c=int(np.floor(len(xs)/2))
    #print(xs, c)
    f = lambda xs: len(xs)*(xs[0]+xs[-1])/2 if len(xs) else 0
    if abs(f(xs) - (r:=f(xs[:c]) + f(xs[c:]))) < tol: return 1, r, [level+c]
    else: 
        x, a, cs_a = foo2(xs[:c],tol/2, level)
        y, b, cs_b = foo2(xs[c:],tol/2, level+c)
        return x+y+1, a+b, cs_a + cs_b + [level+c]

def foo(xs,tol):
    c = int(len(xs) / 2)
    f = lambda xs: len(xs) * ( xs[0] + xs[-1]) / 2 if len(xs) else 0
    if abs(f(xs) - (r := f(xs[:c]) + f(xs[c:]))) < tol: 
        return 1, r
    else: 
        x, fa = foo(xs[:c], tol / 2)
        y, fb = foo(xs[c:], tol / 2)
        return x + y, fa + fb
    

In [56]:
for _ in range(1000):
    traj = np.zeros(horizon)
    traj[0:np.random.randint(horizon)] = -1
    pivots = []
    calls, approx_sum = foo(traj, 0.01)
    print(calls, approx_sum - sum(traj))
    approx_sum, total_seg, num_calls = foo2(traj, 0.01, 0)
    print(total_seg - sum(traj))

14 0.0
0.0
14 0.0
0.0
15 0.0
0.0
11 0.0
0.0
15 0.0
0.0
14 0.0
0.0
11 0.0
0.0
11 0.0
0.0
14 0.0
0.0
15 0.0
0.0
13 0.0
0.0
9 0.0
0.0
15 0.0
0.0
15 0.0
0.0
14 0.0
0.0
14 0.0
0.0
11 0.0
0.0
15 0.0
0.0
14 0.0
0.0
15 0.0
0.0
13 0.0
0.0
15 0.0
0.0
15 0.0
0.0
15 0.0
0.0
15 0.0
0.0
14 0.0
0.0
15 0.0
0.0
12 0.0
0.0
14 0.0
0.0
15 0.0
0.0
15 0.0
0.0
14 0.0
0.0
14 0.0
0.0
12 0.0
0.0
14 0.0
0.0
14 0.0
0.0
14 0.0
0.0
11 0.0
0.0
15 0.0
0.0
14 0.0
0.0
14 0.0
0.0
14 0.0
0.0
14 0.0
0.0
14 0.0
0.0
14 0.0
0.0
14 0.0
0.0
15 0.0
0.0
14 0.0
0.0
15 0.0
0.0
14 0.0
0.0
15 0.0
0.0
12 0.0
0.0
14 0.0
0.0
14 0.0
0.0
14 0.0
0.0
15 0.0
0.0
15 0.0
0.0
15 0.0
0.0
14 0.0
0.0
14 0.0
0.0
14 0.0
0.0
15 0.0
0.0
14 0.0
0.0
14 0.0
0.0
15 0.0
0.0
14 0.0
0.0
15 0.0
0.0
15 0.0
0.0
14 0.0
0.0
15 0.0
0.0
12 0.0
0.0
14 0.0
0.0
14 0.0
0.0
15 0.0
0.0
12 0.0
0.0
14 0.0
0.0
3 0.0
0.0
14 0.0
0.0
14 0.0
0.0
15 0.0
0.0
15 0.0
0.0
15 0.0
0.0
14 0.0
0.0
9 0.0
0.0
12 0.0
0.0
15 0.0
0.0
15 0.0
0.0
14 0.0
0.0
14 0.0
0.0
14 0.0
0.0
14 0.0
0.0
14

In [7]:
s = np.ones((10,1))

In [9]:
s.flatten()

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [31]:
A = np.random.normal(size=(4,4))

In [32]:
np.argmax(np.max(A, axis=0))

0

In [33]:
A

array([[ 1.27057081,  0.30776986, -0.86828026, -1.92695095],
       [-0.38579   , -0.77708164, -0.01463299,  1.14070623],
       [-1.18097502, -0.60819445,  0.16693997,  0.98747807],
       [-1.99425564,  0.19283887,  0.12949453,  0.19000738]])

In [34]:
np.max(A)

1.270570809359154