# The purpose of this notebook is for experimenting with code snippets

Monte Carlo in MC

In [1]:
from adaptive_time.environment import MountainCar
from adaptive_time.features import MountainCarTileCoder
import numpy as np
from adaptive_time.utils import argmax

In [2]:
# [DO NOT CHANGE]
tests = [[-1.0, 0.01], [0.1, -0.01], [0.2, -0.05], [-1.0, 0.011], [0.2, -0.05]]

mctc = MountainCarTileCoder(iht_size=1024, num_tilings=8, num_tiles=8)

t = []
for test in tests:
    position, velocity = test
    tiles = mctc.get_tiles(position=position, velocity=velocity)
    t.append(tiles)

print("Your results:")
for tiles in t:
    print(tiles)

print()
print("Expected results:")
expected = """[0 1 2 3 4 5 6 7]
[ 8  9 10 11 12 13 14 15]
[16 17 18 19 20 21 22 23]
[ 0 24  2  3  4  5  6  7]
[16 17 18 19 20 21 22 23]
"""
print(expected)

np.random.seed(1)
mctc_test = MountainCarTileCoder(iht_size=1024, num_tilings=8, num_tiles=8)
test = [mctc_test.get_tiles(np.random.uniform(-1.2, 0.5), np.random.uniform(-0.07, 0.07)) for _ in range(10)]
np.save("tiles_test", test)

Your results:
[0 1 2 3 4 5 6 7]
[ 8  9 10 11 12 13 14 15]
[16 17 18 19 20 21 22 23]
[ 0 24  2  3  4  5  6  7]
[16 17 18 19 20 21 22 23]

Expected results:
[0 1 2 3 4 5 6 7]
[ 8  9 10 11 12 13 14 15]
[16 17 18 19 20 21 22 23]
[ 0 24  2  3  4  5  6  7]
[16 17 18 19 20 21 22 23]



In [3]:
num_actions = 3
iht_size = 1024
num_episodes = 1000
mctc = MountainCarTileCoder(iht_size, num_tilings=8, num_tiles=8)
env = MountainCar()
weights = np.zeros((num_actions, iht_size))
C = np.zeros((num_actions, iht_size))


def integral_rule(a, b, ra, rb):
    return (b-a) * (ra + rb) / 2.0

def adaptive_quadrature(traj, tol0=pow(10,-3), a0 = 0, b0 = 1):
    N = len(traj)
    sums = 0.0
    n = 1
    a = np.zeros(N+1)
    b = np.zeros(N+1)
    tol = np.zeros(N+1)
    app = np.zeros(N+1)
    iters = 0
    points = []
    
    a[1] = a0
    b[1] = b0
    tol[1] = tol0
    app[1] = integral_rule(a0,b0)
    while n > 0 or n < N:
        iters += 1
        c = int( n * (a[n] + b[n]) / 2 )
        points.append(c)
        oldapp = app[n]
        a = int(a[n] * N)
        b = int(b[n] * N)
        points.append(a,c,b)
        app[n] = integral_rule(a, c, traj[a][2], traj[c][2])
        app[n+1] = integral_rule(c, b, traj[c][2], traj[b][2])
        
        if np.abs(oldapp - (app[n]+app[n+1])) < 3 * tol[n]:
            sums = sums + app[n] + app[n+1] #success
            n = n - 1 #done with interval
            
        else:    #divide into two intervals
            b[n+1] = b[n] #setup new intervals
            b[n] = c  #setup new intervals
            a[n+1] = c #setup new intervals
            tol[n] = tol[n] / 2
            tol[n+1] = tol[n]
            n = n + 1
    return sums, iters


def generate_trajectory(env, weights, episode, num_actions = 3):
    trajectory = []
    s = env.reset(episode, num_episodes-1)
    position, velocity = s
    tiles = mctc.get_tiles(position, velocity)
    done = False
    while not done:
        action_value = np.zeros(num_actions) #For mountain car
        for action in range(num_actions):
            action_value[action] = np.sum(weights[action][tiles])
        a = argmax(action_value)
        r, s_, _, done = env.step(a)
        position, velocity = s_
        tiles_ = mctc.get_tiles(position, velocity)
        trajectory.append([tiles, a, r, tiles_])
        tiles = tiles_
    return trajectory

    

def monte_carlo(env, weights, num_actions, episode, alpha = 0.5 / 16, iht_size=1024):
    rewards = []
    trajectory = generate_trajectory(env, weights, episode, num_actions)
    N = len(trajectory)
    G, h = adaptive_quadrature(trajectory, N)
    for t in range(N-1,-1,-1):
        tile, action, reward, _ = trajectory[t]
        G = G + reward
        action_value = np.sum(weights[action][tile])
        weights[action][tile] = weights[action][tile]  +  alpha * (G - action_value)
        rewards.append(reward)
    return weights, sum(rewards)



for episode in range(1):
    weights, returns = monte_carlo(env, weights, num_actions, episode)
    print(episode, returns)



    

TypeError: reset() takes from 1 to 2 positional arguments but 3 were given

# Quadrature

In [4]:
np.sum(weights[1][mctc.get_tiles(-0.5, 0.0)])

0.0

In [5]:
def trapeziod_rule(a,b):
    return (b-a) * (function(a) + function(b)) / 2.0

def integral_rule(a,b):
    return trapeziod_rule(a,b)

def function(x):
    return x**5

def adaptive_quadrature(a0, b0, tol0):
    sums = 0.0
    n = 1
    a = np.zeros(100000)
    b = np.zeros(100000)
    tol = np.zeros(100000)
    app = np.zeros(100000)
    iters = 0
    
    a[1] = a0
    b[1] = b0
    tol[1] = tol0
    app[1] = integral_rule(a0,b0)
    
    while n > 0:
        iters += 1
        c = (a[n] + b[n]) / 2
        oldapp = app[n]
        app[n] = integral_rule(a[n], c)
        app[n+1] = integral_rule(c, b[n])
        
        if np.abs(oldapp - (app[n]+app[n+1])) < 3 * tol[n]:
            sums = sums + app[n] + app[n+1] #success
            n = n - 1 #done with interval
            
        else:    #divide into two intervals
            b[n+1] = b[n] #setup new intervals
            b[n] = c  #setup new intervals
            a[n+1] = c #setup new intervals
            tol[n] = tol[n] / 2
            tol[n+1] = tol[n]
            n = n + 1
    return sums,iters
        
        

In [6]:
b = 15
truth = b**5/5

quad,iters = adaptive_quadrature(0, b, 0.00005)

h = iters
x = np.linspace(0, b, num=h)
y = function(x)
trap = np.trapz(y,x)

In [7]:
print(np.abs(truth - quad))
print(np.abs(truth - trap))

print(iters - h)

1746562.5000270708
1746562.5000363498
0


In [11]:
import math
def adaptive_sum(traj, tol0=10**(-2)):
    steps = {}
    N = len(traj)
    app = np.zeros(N+1000)
    tol = np.zeros(N+1000)
    N_begin = np.ones(N+1000,dtype=np.int8) * int(-1)
    N_end = np.ones(N+1000,dtype=np.int8) * int(-1)

    sums = 0.0


    steps[N_begin[0]] = 1
    steps[N_end[0]] = 1

    n = 0
    N_begin[0] = 0
    N_end[0] = N - 1
    tol[0] = tol0
    app[0] = (N_end[0] - N_begin[0] + 1) / (2) * ( traj[N_begin[0]] + traj[N_end[0]] )
    iters = 0
    while n > -1: 
        iters += 1
        if (N_end[n] - N_begin[n] + 1) % 2 == 0:
            N_split = int( (N_end[n] - N_begin[n] + 1) / 2 )

        else:
            N_split = math.ceil((N_end[n] - N_begin[n] + 1) / 2)

        old_app = app[n]
        app[n] = (N_split - N_begin[n]) / (2) * ( traj[N_split - 1] + traj[N_begin[n]] )
        app[n+1] = (N_end[n] - N_split + 1) / (2) * ( traj[N_split] + traj[N_end[n]] )
        
        print(iters, traj[N_begin[n]], traj[N_split - 1], traj[N_split], traj[N_end[n]])

        if abs(old_app - (app[n] + app[n+1])) < 3 * tol[n]:
            sums = sums + app[n] + app[n+1]
            n = n - 1

        else:
            N_end[n+1] = N_end[n]
            N_end[n] = N_split - 1
            N_begin[n+1] = N_split 
            tol[n] = tol[n] / 2
            tol[n+1] = tol[n]
            n = n + 1
            
    return sums


In [71]:
def foo(xs,tol):
    c=int(len(xs)/2)
    #print(xs, c)
    f = lambda xs: len(xs)*(xs[0]+xs[-1])/2 if len(xs) else 0
    if abs(f(xs) - (r:=f(xs[:c]) + f(xs[c:]))) < tol: return 1, r
    else: 
        x, a = foo(xs[:c],tol/2)
        y, b = foo(xs[c:],tol/2)
        return x+y, a+b
    

In [75]:
for _ in range(1000):
    traj = np.zeros(20000)
    traj[0:np.random.randint(20000)] = -1
    calls, approx_sum = foo(traj,0.01)
    print(calls, approx_sum - sum(traj))

9 0.0
14 0.0
15 0.0
13 0.0
15 0.0
13 0.0
14 0.0
15 0.0
14 0.0
12 0.0
14 0.0
15 0.0
15 0.0
12 0.0
13 0.0
14 0.0
14 0.0
14 0.0
15 0.0
12 0.0
14 0.0
15 0.0
14 0.0
10 0.0
14 0.0
14 0.0
13 0.0
15 0.0
12 0.0
12 0.0
15 0.0
12 0.0
15 0.0
12 0.0
14 0.0
15 0.0
13 0.0
14 0.0
14 0.0
14 0.0
15 0.0
14 0.0
9 0.0
14 0.0
15 0.0
15 0.0
12 0.0
11 0.0
13 0.0
12 0.0
15 0.0
14 0.0
15 0.0
12 0.0
14 0.0
15 0.0
14 0.0
12 0.0
15 0.0
14 0.0
9 0.0
12 0.0
15 0.0
15 0.0
14 0.0
11 0.0
14 0.0
15 0.0
15 0.0
12 0.0
11 0.0
14 0.0
15 0.0
13 0.0
14 0.0
15 0.0
15 0.0
14 0.0
15 0.0
14 0.0
15 0.0
12 0.0
15 0.0
12 0.0
14 0.0
13 0.0
11 0.0
13 0.0
14 0.0
14 0.0
13 0.0
14 0.0
12 0.0
12 0.0
11 0.0
12 0.0
15 0.0
15 0.0
15 0.0
12 0.0
15 0.0
15 0.0
14 0.0
14 0.0
14 0.0
15 0.0
14 0.0
15 0.0
14 0.0
15 0.0
14 0.0
14 0.0
14 0.0
12 0.0
14 0.0
14 0.0
15 0.0
15 0.0
9 0.0
14 0.0
14 0.0
15 0.0
15 0.0
14 0.0
14 0.0
11 0.0
15 0.0
14 0.0
14 0.0
15 0.0
12 0.0
14 0.0
14 0.0
15 0.0
12 0.0
14 0.0
14 0.0
14 0.0
14 0.0
13 0.0
15 0.0
12 0.0
15 0.0
15 

-5000.0