# The purpose of this notebook is for experimenting with code snippets

Monte Carlo in MC

In [30]:
from adaptive_time.environment import MountainCar
from adaptive_time.features import MountainCarTileCoder
from adaptive_time.utils import argmax

import numpy as np
import time
from tqdm.notebook import tqdm

In [28]:
# [DO NOT CHANGE]
tests = [[-1.0, 0.01], [0.1, -0.01], [0.2, -0.05], [-1.0, 0.011], [0.2, -0.05]]

mctc = MountainCarTileCoder(iht_size=1024, num_tilings=8, num_tiles=8)

t = []
for test in tests:
    position, velocity = test
    tiles = mctc.get_tiles(position=position, velocity=velocity)
    t.append(tiles)

print("Your results:")
for tiles in t:
    print(tiles)

print()
print("Expected results:")
expected = """[0 1 2 3 4 5 6 7]
[ 8  9 10 11 12 13 14 15]
[16 17 18 19 20 21 22 23]
[ 0 24  2  3  4  5  6  7]
[16 17 18 19 20 21 22 23]
"""
print(expected)

np.random.seed(1)
mctc_test = MountainCarTileCoder(iht_size=1024, num_tilings=8, num_tiles=8)
test = [mctc_test.get_tiles(np.random.uniform(-1.2, 0.5), np.random.uniform(-0.07, 0.07)) for _ in range(10)]
np.save("tiles_test", test)

Your results:
[0 1 2 3 4 5 6 7]
[ 8  9 10 11 12 13 14 15]
[16 17 18 19 20 21 22 23]
[ 0 24  2  3  4  5  6  7]
[16 17 18 19 20 21 22 23]

Expected results:
[0 1 2 3 4 5 6 7]
[ 8  9 10 11 12 13 14 15]
[16 17 18 19 20 21 22 23]
[ 0 24  2  3  4  5  6  7]
[16 17 18 19 20 21 22 23]



In [43]:
class SarsaAgent(object):

    def __init__(self):
        self.last_action = None
        self.last_state = None
        self.epsilon = None
        self.gamma = None
        self.iht_size = None
        self.w = None
        self.alpha = None
        self.num_tilings = None
        self.num_tiles = None
        self.mctc = None
        self.initial_weights = None
        self.num_actions = None
        self.previous_tiles = None


        """Setup for the agent called when the experiment first starts."""
        self.num_tilings =  8
        self.num_tiles =  8
        self.iht_size =  4096
        self.epsilon =  0.0
        self.gamma = 1.0
        self.alpha = 0.5 / self.num_tilings
        self.initial_weights =  0.0
        self.num_actions =  3
        
        
        
        # We initialize self.w to three times the iht_size. Recall this is because
        # we need to have one set of weights for each action.
        self.w = np.ones((self.num_actions, self.iht_size)) * self.initial_weights
        
        
        # We initialize self.mctc to the mountaincar verions of the 
        # tile coder that we created
        self.tc = MountainCarTileCoder(iht_size=self.iht_size, 
                                         num_tilings=self.num_tilings, 
                                         num_tiles=self.num_tiles)
        


    
    def select_action(self, tiles):
        """
        Selects an action using epsilon greedy
        Args:
        tiles - np.array, an array of active tiles
        Returns:
        (chosen_action, action_value) - (int, float), tuple of the chosen action
                                        and it's value
        """
        action_values = []
        chosen_action = None
        
        # First loop through the weights of each action and populate action_values
        # with the action value for each action and tiles instance
        
        # Use np.random.random to decide if an exploritory action should be taken
        # and set chosen_action to a random action if it is
        # Otherwise choose the greedy action using the given argmax 
        # function and the action values (don't use numpy's armax)
        
        ### START CODE HERE ###
        action_values = np.zeros(self.num_actions)
        for action in range(self.num_actions):
            action_values[action] = np.sum(self.w[action][tiles])
        chosen_action = argmax(action_values)
        ### END CODE HERE ###
        return chosen_action, action_values[chosen_action]
    

    def agent_start(self, state):
        """The first method called when the experiment starts, called after
        the environment starts.
        Args:
            state (Numpy array): the state observation from the
                environment's evn_start function.
        Returns:
            The first action the agent takes.
        """
        position, velocity = state
        
        # Use self.tc to set active_tiles using position and velocity
        # set current_action to the epsilon greedy chosen action using
        # the select_action function above with the active tiles
        
        ### START CODE HERE ###
        active_tiles = self.tc.get_tiles(position, velocity)
        current_action, _ = self.select_action(active_tiles)
        ### END CODE HERE ###
        
        self.last_action = current_action
        self.previous_tiles = np.copy(active_tiles)
        return self.last_action
    

    def agent_step(self, reward, state):
        """A step taken by the agent.
        Args:
            reward (float): the reward received for taking the last action taken
            state (Numpy array): the state observation from the
                environment's step based, where the agent ended up after the
                last step
        Returns:
            The action the agent is taking.
        """
        # choose the action here
        position, velocity = state
        
        # Use self.tc to set active_tiles using position and velocity
        # set current_action and action_value to the epsilon greedy chosen action using
        # the select_action function above with the active tiles
        
        # Update self.w at self.previous_tiles and self.previous action
        # using the reward, action_value, self.gamma, self.w,
        # self.alpha, and the Sarsa update from the textbook
        
        ### START CODE HERE ###
        active_tiles = self.tc.get_tiles(position, velocity)
        current_action, action_value = self.select_action(active_tiles)
        last_action_value = np.sum(self.w[self.last_action][self.previous_tiles])
        self.w[self.last_action][self.previous_tiles] += self.alpha * (reward + self.gamma * action_value - last_action_value) * 1
        
        
        self.last_action = current_action
        self.previous_tiles = np.copy(active_tiles)
        return self.last_action
    

    def agent_end(self, reward):
        """Run when the agent terminates. Same as above except action_value = 0.0
        Args:
            reward (float): the reward the agent received for entering the
                terminal state.
        """
        # Update self.w at self.previous_tiles and self.previous action
        # using the reward, self.gamma, self.w,
        # self.alpha, and the Sarsa update from the textbook
        # Hint - there is no action_value used here because this is the end
        # of the episode.
        
        ### START CODE HERE ###
        last_action_value = np.sum(self.w[self.last_action][self.previous_tiles])
        
        
        self.w[self.last_action][self.previous_tiles] += self.alpha * (reward - last_action_value) * 1
    

    


In [46]:
num_episodes = 200

agent = SarsaAgent()
env = MountainCar()
for episode in range(num_episodes):
    rewards = []
    s = env.reset()
    a = agent.agent_start(s)
    done = False
    while not done:
        r, s_, _, done = env.step(a)
        if done == True:
            agent.agent_end(r)
        else:
            s = s_
            a = agent.agent_step(r, s)
        rewards.append(r)
    print(episode, sum(rewards))
            


0 -200.0
1 -200.0
2 -200.0
3 -200.0
4 -200.0
5 -200.0
6 -200.0
7 -200.0
8 -200.0
9 -200.0
10 -200.0
11 -200.0
12 -200.0
13 -200.0
14 -200.0
15 -200.0
16 -200.0
17 -200.0
18 -200.0
19 -200.0
20 -200.0
21 -200.0
22 -185.0
23 -200.0
24 -200.0
25 -175.0
26 -200.0
27 -200.0
28 -200.0
29 -200.0
30 -200.0
31 -200.0
32 -169.0
33 -200.0
34 -200.0
35 -200.0
36 -173.0
37 -200.0
38 -200.0
39 -200.0
40 -200.0
41 -195.0
42 -191.0
43 -173.0
44 -189.0
45 -200.0
46 -184.0
47 -179.0
48 -177.0
49 -174.0
50 -176.0
51 -168.0
52 -166.0
53 -200.0
54 -200.0
55 -200.0
56 -200.0
57 -175.0
58 -200.0
59 -200.0
60 -200.0
61 -199.0
62 -198.0
63 -175.0
64 -172.0
65 -161.0
66 -159.0
67 -159.0
68 -200.0
69 -154.0
70 -200.0
71 -137.0
72 -200.0
73 -161.0
74 -169.0
75 -155.0
76 -200.0
77 -200.0
78 -200.0
79 -200.0
80 -200.0
81 -200.0
82 -165.0
83 -200.0
84 -200.0
85 -200.0
86 -200.0
87 -200.0
88 -200.0
89 -200.0
90 -200.0
91 -200.0
92 -200.0
93 -200.0
94 -118.0
95 -117.0
96 -116.0
97 -115.0
98 -115.0
99 -121.0
100 -155.0

In [None]:
def generate_episode(env, )

def gradient_MC(env, policy, ep, gamma, model, callback=None, trace=None):
    """Gradient Monte Carlo Algorithm
    
    Params:
        env    - environment
        policy - function in a form: policy(state)->action
        ep     - number of episodes to run
        gamma  - discount factor [0..1]
        model  - function approximator, already initialised, with method:
                     train(state, target) -> None
        callback - function in a form: callback(episode, model, trace) -> None
        trace  - passed to callback, so it can log data into it
    """
    for e_ in range(ep):
        traj, T = generate_episode(env, policy)
        Gt = 0
        for t in range(T-1,-1,-1):
            St, _, _, _ = traj[t]      # (st, rew, done, act)
            _, Rt_1, _, _ = traj[t+1]
            
            Gt = gamma * Gt + Rt_1
            model.train(St, Gt)
               
        if callback is not None:
            callback(e_, model, trace)

# Quadrature

In [173]:
def trapeziod_rule(a,b):
    return (b-a) * (function(a) + function(b)) / 2.0

def integral_rule(a,b):
    return trapeziod_rule(a,b)

def function(x):
    return x**5

def adaptive_quadrature(a0, b0, tol0):
    sums = 0.0
    n = 1
    a = np.zeros(100000)
    b = np.zeros(100000)
    tol = np.zeros(100000)
    app = np.zeros(100000)
    iters = 0
    
    a[1] = a0
    b[1] = b0
    tol[1] = tol0
    app[1] = integral_rule(a0,b0)
    
    while n > 0:
        iters += 1
        c = (a[n] + b[n]) / 2
        oldapp = app[n]
        app[n] = integral_rule(a[n], c)
        app[n+1] = integral_rule(c, b[n])
        
        if np.abs(oldapp - (app[n]+app[n+1])) < 3 * tol[n]:
            sums = sums + app[n] + app[n+1] #success
            n = n - 1 #done with interval
            
        else:    #divide into two intervals
            b[n+1] = b[n] #setup new intervals
            b[n] = c  #setup new intervals
            a[n+1] = c #setup new intervals
            tol[n] = tol[n] / 2
            tol[n+1] = tol[n]
            n = n + 1
    return sums,iters
        
        

In [174]:
b = 15
truth = b**5/5

quad,iters = adaptive_quadrature(0, b, 0.00005)

h = iters
x = np.linspace(0, b, num=h)
y = function(x)
trap = np.trapz(y,x)

AxisError: axis 10 is out of bounds for array of dimension 0

In [172]:
print(np.abs(truth - quad))
print(np.abs(truth - trap))

print(iters - h)

151738.20744045908
3.5363336792215705e-05
0


In [168]:
h

84605

In [169]:
iters

84605