# The purpose of this notebook is for experimenting with code snippets

Monte Carlo in MC

In [1]:
from environment import MountainCar
from features import MountainCarTileCoder
import numpy as np
from utils import argmax

In [2]:
# [DO NOT CHANGE]
tests = [[-1.0, 0.01], [0.1, -0.01], [0.2, -0.05], [-1.0, 0.011], [0.2, -0.05]]

mctc = MountainCarTileCoder(iht_size=1024, num_tilings=8, num_tiles=8)

t = []
for test in tests:
    position, velocity = test
    tiles = mctc.get_tiles(position=position, velocity=velocity)
    t.append(tiles)

print("Your results:")
for tiles in t:
    print(tiles)

print()
print("Expected results:")
expected = """[0 1 2 3 4 5 6 7]
[ 8  9 10 11 12 13 14 15]
[16 17 18 19 20 21 22 23]
[ 0 24  2  3  4  5  6  7]
[16 17 18 19 20 21 22 23]
"""
print(expected)

np.random.seed(1)
mctc_test = MountainCarTileCoder(iht_size=1024, num_tilings=8, num_tiles=8)
test = [mctc_test.get_tiles(np.random.uniform(-1.2, 0.5), np.random.uniform(-0.07, 0.07)) for _ in range(10)]
np.save("tiles_test", test)

Your results:
[0 1 2 3 4 5 6 7]
[ 8  9 10 11 12 13 14 15]
[16 17 18 19 20 21 22 23]
[ 0 24  2  3  4  5  6  7]
[16 17 18 19 20 21 22 23]

Expected results:
[0 1 2 3 4 5 6 7]
[ 8  9 10 11 12 13 14 15]
[16 17 18 19 20 21 22 23]
[ 0 24  2  3  4  5  6  7]
[16 17 18 19 20 21 22 23]



In [17]:
num_actions = 3
iht_size = 1024
num_episodes = 1000
mctc = MountainCarTileCoder(iht_size, num_tilings=8, num_tiles=8)
env = MountainCar()
weights = np.zeros((num_actions, iht_size))
C = np.zeros((num_actions, iht_size))


def generate_trajectory(env, weights, episode, num_actions = 3):
    trajectory = []
    s = env.reset(episode, num_episodes-1)
    position, velocity = s
    tiles = mctc.get_tiles(position, velocity)
    done = False
    while not done:
        action_value = np.zeros(num_actions) #For mountain car
        for action in range(num_actions):
            action_value[action] = np.sum(weights[action][tiles])
        a = argmax(action_value)
        r, s_, _, done = env.step(a)
        position, velocity = s_
        tiles_ = mctc.get_tiles(position, velocity)
        trajectory.append([tiles, a, r, tiles_])
        tiles = tiles_
    return trajectory

    

def monte_carlo(env, weights, num_actions, episode, alpha = 0.5 / 16, iht_size=1024):
    rewards = []
    trajectory = generate_trajectory(env, weights, episode, num_actions)
    T = len(trajectory)
    G = 0
    for t in range(T-1,-1,-1):
        tile, action, reward, _ = trajectory[t]
        G = G + reward
        action_value = np.sum(weights[action][tile])
        weights[action][tile] = weights[action][tile]  +  alpha * (G - action_value)
        rewards.append(reward)
    return weights, sum(rewards)



for episode in range(num_episodes):
    weights, returns = monte_carlo(env, weights, num_actions, episode)
    print(episode, returns)



    

0 -200.0
1 -200.0
2 -200.0
3 -200.0
4 -200.0
5 -200.0
6 -200.0
7 -200.0
8 -200.0
9 -200.0
10 -200.0
11 -200.0
12 -200.0
13 -200.0
14 -200.0
15 -200.0
16 -200.0
17 -200.0
18 -200.0
19 -200.0
20 -200.0
21 -200.0
22 -200.0
23 -200.0
24 -200.0
25 -200.0
26 -200.0
27 -200.0
28 -200.0
29 -200.0
30 -200.0
31 -200.0
32 -200.0
33 -200.0
34 -200.0
35 -200.0
36 -200.0
37 -200.0
38 -200.0
39 -200.0
40 -200.0
41 -200.0
42 -200.0
43 -200.0
44 -200.0
45 -200.0
46 -200.0
47 -200.0
48 -200.0
49 -200.0
50 -200.0
51 -200.0
52 -200.0
53 -200.0
54 -200.0
55 -200.0
56 -200.0
57 -200.0
58 -200.0
59 -200.0
60 -200.0
61 -200.0
62 -200.0
63 -200.0
64 -200.0
65 -200.0
66 -200.0
67 -200.0
68 -200.0
69 -200.0
70 -200.0
71 -200.0
72 -200.0
73 -200.0
74 -200.0
75 -200.0
76 -200.0
77 -200.0
78 -200.0
79 -200.0
80 -200.0
81 -200.0
82 -200.0
83 -200.0
84 -200.0
85 -200.0
86 -200.0
87 -200.0
88 -200.0
89 -200.0
90 -200.0
91 -200.0
92 -200.0
93 -200.0
94 -200.0
95 -200.0
96 -200.0
97 -200.0
98 -200.0
99 -200.0
100 -200.0

# Quadrature

In [4]:
np.sum(weights[1][mctc.get_tiles(-0.5, 0.0)])

-207.81231930852954

In [5]:
def trapeziod_rule(a,b):
    return (b-a) * (function(a) + function(b)) / 2.0

def integral_rule(a,b):
    return trapeziod_rule(a,b)

def function(x):
    return x**5

def adaptive_quadrature(a0, b0, tol0):
    sums = 0.0
    n = 1
    a = np.zeros(100000)
    b = np.zeros(100000)
    tol = np.zeros(100000)
    app = np.zeros(100000)
    iters = 0
    
    a[1] = a0
    b[1] = b0
    tol[1] = tol0
    app[1] = integral_rule(a0,b0)
    
    while n > 0:
        iters += 1
        c = (a[n] + b[n]) / 2
        oldapp = app[n]
        app[n] = integral_rule(a[n], c)
        app[n+1] = integral_rule(c, b[n])
        
        if np.abs(oldapp - (app[n]+app[n+1])) < 3 * tol[n]:
            sums = sums + app[n] + app[n+1] #success
            n = n - 1 #done with interval
            
        else:    #divide into two intervals
            b[n+1] = b[n] #setup new intervals
            b[n] = c  #setup new intervals
            a[n+1] = c #setup new intervals
            tol[n] = tol[n] / 2
            tol[n+1] = tol[n]
            n = n + 1
    return sums,iters
        
        

In [6]:
b = 15
truth = b**5/5

quad,iters = adaptive_quadrature(0, b, 0.00005)

h = iters
x = np.linspace(0, b, num=h)
y = function(x)
trap = np.trapz(y,x)

In [7]:
print(np.abs(truth - quad))
print(np.abs(truth - trap))

print(iters - h)

1746562.5000270708
1746562.5000363498
0


In [8]:
h

361341

In [9]:
iters

361341

In [10]:
[-1, 0, 1][2]

1