In [134]:
# Import common libraries
import sys
import gym
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
import random 
import math
from bayes_opt  import BayesianOptimization

# Set plotting options
%matplotlib inline
plt.style.use('ggplot')
np.set_printoptions(precision=3, linewidth=120)

In [135]:
env = gym.make('Acrobot-v1')
env.seed(505);



In [136]:
def create_tilings(lows, highs, tiling_specs):
    # TODO: Implement this
    final_gird = []
    final_gird2 = []
    for j in range(len(tiling_specs)):
        final_gird = []
        for i in range(len(lows)):
            init = np.linspace(lows[i], highs[i], tiling_specs[j][0][0], endpoint=False)[1:] + tiling_specs[j][1][i]
            final_gird.append(init)
        final_gird2.append(final_gird)
    #return np.array(final_gird2,dtype=np.float32)
    return np.array(final_gird2)


# Tiling specs: [(<bins>, <offsets>), ...]
tiling_specs = [((10, 10, 10), (-0.099, -0.066, -0.033, -0.022, -0.011, -0.005)),
                ((10, 10, 10), (0.0, 0.0, 0.0, 0.0, 0.0, 0.0)),
                ((10, 10, 10), (0.099, 0.066, 0.033, 0.022, 0.011, 0.005))]
tilings = create_tilings(env.observation_space.low, env.observation_space.high, tiling_specs)
print(tilings)

[[[-8.990e-01 -6.990e-01 -4.990e-01 -2.990e-01 -9.900e-02  1.010e-01  3.010e-01  5.010e-01  7.010e-01]
  [-8.660e-01 -6.660e-01 -4.660e-01 -2.660e-01 -6.600e-02  1.340e-01  3.340e-01  5.340e-01  7.340e-01]
  [-8.330e-01 -6.330e-01 -4.330e-01 -2.330e-01 -3.300e-02  1.670e-01  3.670e-01  5.670e-01  7.670e-01]
  [-8.220e-01 -6.220e-01 -4.220e-01 -2.220e-01 -2.200e-02  1.780e-01  3.780e-01  5.780e-01  7.780e-01]
  [-1.006e+01 -7.551e+00 -5.038e+00 -2.524e+00 -1.100e-02  2.502e+00  5.016e+00  7.529e+00  1.004e+01]
  [-2.262e+01 -1.697e+01 -1.131e+01 -5.660e+00 -5.000e-03  5.650e+00  1.130e+01  1.696e+01  2.261e+01]]

 [[-8.000e-01 -6.000e-01 -4.000e-01 -2.000e-01  0.000e+00  2.000e-01  4.000e-01  6.000e-01  8.000e-01]
  [-8.000e-01 -6.000e-01 -4.000e-01 -2.000e-01  0.000e+00  2.000e-01  4.000e-01  6.000e-01  8.000e-01]
  [-8.000e-01 -6.000e-01 -4.000e-01 -2.000e-01  0.000e+00  2.000e-01  4.000e-01  6.000e-01  8.000e-01]
  [-8.000e-01 -6.000e-01 -4.000e-01 -2.000e-01  0.000e+00  2.000e-01  4

In [137]:
def discretize(sample, grid):
    # TODO: Implement this
    final_value = []
    for i in range(len(grid)):
        init = np.digitize(sample[i], grid[i], right = True)
        final_value.append(int(init))
    return final_value


def tile_encode(sample, tilings, flatten=False):
    # TODO: Implement this
    pass
    vector_bin = []
    for i in range(len(tilings)):
        value = discretize(sample, tilings[i])
        vector_bin.append(value)
    return vector_bin

In [138]:
class QTable:

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.q_table = np.zeros(state_size + (action_size,))
        print("QTable(): size =", self.q_table.shape)

class TiledQTable:
    """Composite Q-table with an internal tile coding scheme."""
    
    def __init__(self, low, high, tiling_specs, action_size):
        self.tilings = create_tilings(low, high, tiling_specs)
        self.state_sizes = [tuple(len(splits)+1 for splits in tiling_grid) for tiling_grid in self.tilings]
        self.action_size = action_size
        self.q_tables = [QTable(state_size, self.action_size) for state_size in self.state_sizes]
        print("TiledQTable(): no. of internal tables = ", len(self.q_tables))
    
    def get(self, state, action):
        # TODO: Encode state to get tile indices
        data = tile_encode(state, self.tilings)
        #print(state, action, data)
        val = 0
        # TODO: Retrieve q-value for each tiling, and return their average
        for i in range(len(data)):
            val += self.q_tables[i].q_table[tuple(data[i]) + (action,)]
        val /= len(data)
        return val
        
        
    def update(self, state, action, value, alpha=0.1):
        # TODO: Encode state to get tile indices
        data = tile_encode(state, self.tilings)
        # TODO: Update q-value for each tiling by update factor alpha
        for i in range(len(data)):
            temp = self.q_tables[i].q_table[tuple(data[i]) + (action,)]
            self.q_tables[i].q_table[tuple(data[i]) + (action,)] = alpha*value + (1.0 - alpha)*temp
        

In [139]:
high = env.observation_space.high
low = env.observation_space.low
tq = TiledQTable(low, high, tiling_specs, env.action_space.n)

QTable(): size = (10, 10, 10, 10, 10, 10, 3)
QTable(): size = (10, 10, 10, 10, 10, 10, 3)
QTable(): size = (10, 10, 10, 10, 10, 10, 3)
TiledQTable(): no. of internal tables =  3


In [140]:
import random
class Agent:
    
    def __init__(self, epsilon, alpha, gamma, eps_start):
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon;
        self.eps_decay = 0.9999
        self.eps_start = eps_start
        self.high = env.observation_space.high
        self.low = env.observation_space.low
        self.nA = env.action_space.n;
        self.tq = TiledQTable(low, high, tiling_specs, self.nA)
        
    def q_probs(self, state):
        policy = np.ones((env.action_space.n))*(self.epsilon/self.nA)
        posible_choice = []
        for i in range(self.nA):
            posible_choice.append(self.tq.get(state, i))
        best_pos = np.argmax(posible_choice)
        policy[best_pos] = (1 - self.epsilon) + self.epsilon/self.nA
        return policy
    
    def get_action(self,state):
        if random.uniform(0,1) < self.epsilon:
            return env.action_space.sample()
        else:
            return np.random.choice(np.arange(self.nA), p=self.q_probs(state))
        
    def update(self, state, action, reward, next_state, done):
        old_val = tq.get(state, action)
        if not done:
            next_action = self.get_action(next_state)
            #value = old_val + self.alpha*(reward + self.gamma*self.tq.get(next_state, next_action) - old_val)
            value = reward + self.gamma*self.tq.get(next_state, action) - old_val
        if done:
            #value = old_val + self.alpha*(reward - old_val)
            self.epsilon = max(self.epsilon*self.eps_decay, self.eps_start)
            value = reward - old_val
        #print(value)
        self.tq.update(state, action, value, self.alpha)
        #print("state {}, action {}, value {} :".format(state, action, self.tq.get(state,action)))


In [141]:
def interact(env, agent, num_episodes=1000, window=100):
    avg_rewards = deque(maxlen=num_episodes)
    best_avg_reward = -math.inf
    samp_rewards = deque(maxlen=window)
    for i_episode in range(1, num_episodes+1):
        state = env.reset()
        samp_reward = 0
        while True:
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            agent.update(state, action, reward, next_state, done)
            state = next_state
            samp_reward += reward
            if done:
                samp_rewards.append(samp_reward)
                break
        if (i_episode >= 100):
            avg_reward = np.mean(samp_rewards)
            avg_rewards.append(avg_reward)
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward
        print("\rEpisode {}/{} || Best average reward {}".format(i_episode, num_episodes, best_avg_reward), end="")
        sys.stdout.flush()
        if i_episode == num_episodes: print('\n')
    return avg_rewards, best_avg_reward

In [142]:
#avg_rewards, best_avg_reward = interact(env, agent)

In [143]:
num_episodes = 1000
def interact_wrapper(epsilon, alpha, gamma, eps_start):
    agent = Agent(epsilon=epsilon, alpha=alpha, gamma=gamma, eps_start=eps_start)
    avg_rewards, best_avg_reward = interact(env, agent, num_episodes)
    return best_avg_reward

In [144]:
pbounds = {'epsilon': (0.01, 0.1), 'alpha': (0.1, 0.5), 'gamma': (0.5, 1.0), 'eps_start': (0.01, 0.2)}

optimizer = BayesianOptimization(
    f=interact_wrapper,
    pbounds=pbounds,
    random_state=47
)

optimizer.probe(
    params={'epsilon': 0.1, 'alpha': 0.1, 'gamma': 0.9, 'eps_start': 0.1},
    lazy=True,
)

optimizer.maximize(
    init_points=4,
    n_iter=25
)

|   iter    |  target   |   alpha   | eps_start |  epsilon  |   gamma   |
-------------------------------------------------------------------------
QTable(): size = (10, 10, 10, 10, 10, 10, 3)
QTable(): size = (10, 10, 10, 10, 10, 10, 3)
QTable(): size = (10, 10, 10, 10, 10, 10, 3)
TiledQTable(): no. of internal tables =  3
Episode 1000/1000 || Best average reward -381.82

| [0m 1       [0m | [0m-381.8   [0m | [0m 0.1     [0m | [0m 0.1     [0m | [0m 0.1     [0m | [0m 0.9     [0m |
QTable(): size = (10, 10, 10, 10, 10, 10, 3)
QTable(): size = (10, 10, 10, 10, 10, 10, 3)
QTable(): size = (10, 10, 10, 10, 10, 10, 3)
TiledQTable(): no. of internal tables =  3
Episode 1000/1000 || Best average reward -329.86

| [95m 2       [0m | [95m-329.9   [0m | [95m 0.1454  [0m | [95m 0.1952  [0m | [95m 0.07559 [0m | [95m 0.6757  [0m |
QTable(): size = (10, 10, 10, 10, 10, 10, 3)
QTable(): size = (10, 10, 10, 10, 10, 10, 3)
QTable(): size = (10, 10, 10, 10, 10, 10, 3)
TiledQTable

QTable(): size = (10, 10, 10, 10, 10, 10, 3)
QTable(): size = (10, 10, 10, 10, 10, 10, 3)
TiledQTable(): no. of internal tables =  3
Episode 1000/1000 || Best average reward -236.18

| [95m 24      [0m | [95m-236.2   [0m | [95m 0.4322  [0m | [95m 0.02151 [0m | [95m 0.06262 [0m | [95m 0.6768  [0m |
QTable(): size = (10, 10, 10, 10, 10, 10, 3)
QTable(): size = (10, 10, 10, 10, 10, 10, 3)
QTable(): size = (10, 10, 10, 10, 10, 10, 3)
TiledQTable(): no. of internal tables =  3
Episode 1000/1000 || Best average reward -249.15

| [0m 25      [0m | [0m-249.2   [0m | [0m 0.3844  [0m | [0m 0.1263  [0m | [0m 0.05278 [0m | [0m 0.5438  [0m |
QTable(): size = (10, 10, 10, 10, 10, 10, 3)
QTable(): size = (10, 10, 10, 10, 10, 10, 3)
QTable(): size = (10, 10, 10, 10, 10, 10, 3)
TiledQTable(): no. of internal tables =  3
Episode 1000/1000 || Best average reward -310.57

| [0m 26      [0m | [0m-310.6   [0m | [0m 0.3789  [0m | [0m 0.05971 [0m | [0m 0.04352 [0m | [0m 0.51

KeyboardInterrupt: 

In [145]:
agent = Agent(epsilon=0.06262, alpha=0.4322, gamma=0.6768, eps_start=0.02151)
avg_rewards, best_avg_reward = interact(env, agent, 10000)

QTable(): size = (10, 10, 10, 10, 10, 10, 3)
QTable(): size = (10, 10, 10, 10, 10, 10, 3)
QTable(): size = (10, 10, 10, 10, 10, 10, 3)
TiledQTable(): no. of internal tables =  3
Episode 10000/10000 || Best average reward -271.02



In [147]:
tilings = create_tilings(low, high, tiling_specs)
entry = tile_encode([ 0.778, 0.628,  0.097,-0.995, 1.593, -1.5  ], tilings)
entry

[[9, 8, 5, 0, 5, 4], [8, 8, 5, 0, 5, 4], [8, 7, 5, 0, 5, 4]]

In [148]:
agent.tq.q_tables[0].q_table[tuple(entry[0])]

array([-3.09 , -3.092, -3.082])

In [149]:
reward2 = []
for i in range(3):
    state = env.reset()
    rewards = 0
    while True:
        env.render()
        posible_choice = []
        for i in range(env.action_space.n):
            posible_choice.append(agent.tq.get(state, i))
        action = np.argmax(posible_choice)
        next_state, reward, done, _ = env.step(action)
        state = next_state
        rewards += reward
        if done:
            reward2.append(rewards)
            break
env.close()
print(reward2)

[-500.0, -500.0, -500.0]
