In [1]:
import gym
import numpy as np
from bayes_opt  import BayesianOptimization
import numpy as np
from collections import defaultdict
import random
from collections import deque
import sys
import math

In [2]:
env = gym.make('Taxi-v3')
Q = defaultdict(lambda: np.zeros(env.nA))
num_episodes = 2000

In [3]:
class Agent:

    def __init__(self, epsilon, alpha, gamma, eps_start):
        global Q
        self.nA = 6
        self.Q = Q
        self.eps_start = eps_start
        self.eps_decay = 0.9999
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma

    def q_pros(self, Q_state):
        policy = np.ones(self.nA)*(self.epsilon/self.nA)
        best_arg = np.argmax(Q_state)
        policy[best_arg] = (1 - self.epsilon) + (self.epsilon/self.nA)
        return policy

    def select_action(self, state):
        if random.uniform(0,1) < self.epsilon:
            return random.randint(0,5)
        else:
            return np.argmax(self.Q[state])

    def step(self, state, action, next_state, next_action, next_next_state, next_next_action,reward, reward_next,done):
        if not done:
            reward = reward + self.gamma*reward_next
            new_value = Q[next_state][next_action] + self.gamma*Q[next_next_state][next_next_action]
            self.Q[state][action] = self.Q[state][action] + self.alpha*(reward + self.gamma*new_value -self.Q[state][action])
        else:
            self.epsilon = max(self.epsilon*self.eps_decay, self.eps_start)
            self.Q[state][action] = self.Q[state][action] + self.alpha*(reward - self.Q[state][action])

In [4]:
def interact(env, agent, num_episodes=20000, window=100):
    global Q
    avg_rewards = deque(maxlen=num_episodes)
    best_avg_reward = -math.inf
    samp_rewards = deque(maxlen=window)
    for i_episode in range(1, num_episodes+1):
        state = env.reset()
        samp_reward = 0
        while True:
            action = agent.select_action(state)
            next_state, reward, done, _ = env.step(action)
            next_action = np.random.choice(np.arange(env.nA), p=agent.q_pros(Q[next_state]))
            next_next_state, reward_next, done, _ = env.step(next_action)
            next_next_action = np.random.choice(np.arange(env.nA), p=agent.q_pros(Q[next_next_state]))
            agent.step(state, action, next_state, next_action, next_next_state, next_next_action, reward, reward_next, done)
            samp_reward += reward
            # update the state (s <- s') to next time step
            state = next_state
            if done:
                # save final sampled reward
                samp_rewards.append(samp_reward)
                break
        if (i_episode >= 100):
            # get average reward from last 100 episodes
            avg_reward = np.mean(samp_rewards)
            # append to deque
            avg_rewards.append(avg_reward)
            # update best average reward
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward
        # monitor progress
        print("\rEpisode {}/{} || Best average reward {}".format(i_episode, num_episodes, best_avg_reward), end="")
        sys.stdout.flush()
        # check if task is solved (according to OpenAI Gym)
        if best_avg_reward >= 9.7:
            print('\nEnvironment solved in {} episodes.'.format(i_episode), end="")
            break
        if i_episode == num_episodes: print('\n')
    return avg_rewards, best_avg_reward

In [5]:
def interact_wrapper(epsilon, alpha, gamma, eps_start):
    agent = Agent(epsilon=epsilon, alpha=alpha, gamma=gamma, eps_start=eps_start)
    avg_rewards, best_avg_reward = interact(env, agent, num_episodes)
    return best_avg_reward

In [6]:
pbounds = {'epsilon': (0.01, 0.1), 'alpha': (0.1, 0.5), 'gamma': (0.5, 1.0), 'eps_start': (0.01, 0.2)}

optimizer = BayesianOptimization(
    f=interact_wrapper,
    pbounds=pbounds,
    random_state=47
)

optimizer.probe(
    params={'epsilon': 0.1, 'alpha': 0.1, 'gamma': 0.9, 'eps_start': 0.1},
    lazy=True,
)

optimizer.maximize(
    init_points=4,
    n_iter=25
)

|   iter    |  target   |   alpha   | eps_start |  epsilon  |   gamma   |
-------------------------------------------------------------------------
Episode 2000/2000 || Best average reward -49.962

| [0m 1       [0m | [0m-49.96   [0m | [0m 0.1     [0m | [0m 0.1     [0m | [0m 0.1     [0m | [0m 0.9     [0m |
Episode 2000/2000 || Best average reward -22.24

| [95m 2       [0m | [95m-22.24   [0m | [95m 0.1454  [0m | [95m 0.1952  [0m | [95m 0.07559 [0m | [95m 0.6757  [0m |
Episode 2000/2000 || Best average reward -24.29

| [0m 3       [0m | [0m-24.29   [0m | [0m 0.383   [0m | [0m 0.1619  [0m | [0m 0.0681  [0m | [0m 0.7073  [0m |
Episode 2000/2000 || Best average reward -27.18

| [0m 4       [0m | [0m-27.18   [0m | [0m 0.3824  [0m | [0m 0.05686 [0m | [0m 0.03304 [0m | [0m 0.512   [0m |
Episode 2000/2000 || Best average reward -17.62

| [95m 5       [0m | [95m-17.6    [0m | [95m 0.1395  [0m | [95m 0.06708 [0m | [95m 0.06768 [0m | [95m



Episode 2000/2000 || Best average reward -34.23

| [0m 26      [0m | [0m-34.23   [0m | [0m 0.3614  [0m | [0m 0.08667 [0m | [0m 0.09662 [0m | [0m 0.9105  [0m |
Episode 2000/2000 || Best average reward -55.92

| [0m 27      [0m | [0m-55.92   [0m | [0m 0.1233  [0m | [0m 0.1238  [0m | [0m 0.04872 [0m | [0m 0.935   [0m |
Episode 2000/2000 || Best average reward -32.48

| [0m 28      [0m | [0m-32.4    [0m | [0m 0.3046  [0m | [0m 0.0351  [0m | [0m 0.01771 [0m | [0m 0.7338  [0m |
Episode 2000/2000 || Best average reward -31.67

| [0m 29      [0m | [0m-31.67   [0m | [0m 0.3078  [0m | [0m 0.03101 [0m | [0m 0.04453 [0m | [0m 0.7248  [0m |
Episode 2000/2000 || Best average reward -57.52

| [0m 30      [0m | [0m-57.52   [0m | [0m 0.488   [0m | [0m 0.1644  [0m | [0m 0.09396 [0m | [0m 0.8612  [0m |


In [9]:
import time
import IPython
IPython.display.clear_output()
num_episodes2 = 20000

In [10]:
agent = Agent(alpha = 0.3064 , eps_start = 0.0337, epsilon = 0.04042, gamma = 0.727)
avg_rewards, best_avg_reward = interact(env, agent, num_episodes2) 

Episode 20000/20000 || Best average reward -31.21



In [None]:
 0.3064   |  0.0337   |  0.04042  |  0.727    |

In [None]:
Q[0]