In [1]:
from collections import deque, defaultdict
import sys, math, random
import numpy as np
import gym

In [2]:
def interact(env, agent, num_episodes=20000, window=100, solved_threshold=9.7):
    """Run the agent in the environment and track a moving average of episode returns.

    Each episode uses an exploration rate of ``1 / i_episode`` (1-based), which is
    passed to both ``agent.select_action`` and ``agent.step``.

    Parameters
    ----------
    env : object
        Must expose ``reset()`` returning the initial state and ``step(action)``
        returning a 4-tuple ``(next_state, reward, done, info)``.
    agent : object
        Must expose ``select_action(state, epsilon)`` and
        ``step(state, action, reward, next_state, done, epsilon)``.
    num_episodes : int
        Maximum number of episodes to run.
    window : int
        Number of most recent episode returns that are averaged. No average is
        recorded until ``window`` episodes have completed.
    solved_threshold : float
        Training stops early as soon as the best moving average reaches this value.

    Returns
    -------
    avg_rewards : collections.deque
        One moving-average value per episode, starting at episode ``window``.
    best_avg_reward : float
        Largest moving average seen; ``-math.inf`` if none was ever recorded.
    """
    avg_rewards = deque(maxlen=num_episodes)
    best_avg_reward = -math.inf
    samp_rewards = deque(maxlen=window)
    for i_episode in range(1, num_episodes + 1):
        state = env.reset()
        samp_reward = 0
        epsilon = 1.0 / i_episode
        while True:
            action = agent.select_action(state, epsilon)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done, epsilon)
            samp_reward += reward
            state = next_state
            if done:
                samp_rewards.append(samp_reward)
                break
        # Start averaging only once a full window of returns is available.
        # (Previously hard-coded to 100, which ignored the `window` argument.)
        if i_episode >= window:
            avg_reward = np.mean(samp_rewards)
            avg_rewards.append(avg_reward)
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward
        # "\r" rewrites the same console line so progress does not flood the output.
        print("\rEpisode {}/{} || Best average reward {}".format(i_episode, num_episodes, best_avg_reward), end="")
        sys.stdout.flush()
        if best_avg_reward >= solved_threshold:
            print('\nEnvironment solved in {} episodes.'.format(i_episode), end="")
            break
        if i_episode == num_episodes:
            print('\n')
    return avg_rewards, best_avg_reward

In [3]:
class Agent:
    """Tabular temporal-difference agent with epsilon-greedy action selection."""

    def __init__(self, nA=6, gamma=1.0, alpha=0.1):
        """Create an agent with an empty action-value table.

        Parameters
        ----------
        nA : int
            Number of available actions (length of each row in ``Q``).
        gamma : float
            Discount factor applied to the bootstrapped next-state value.
        alpha : float
            Step size of the value update.
        """
        self.nA = nA
        self.gamma = gamma
        self.alpha = alpha
        # States are added lazily: the first lookup of an unseen state creates
        # a zero-filled row of length nA.
        self.Q = defaultdict(lambda: np.zeros(self.nA))

    def select_action(self, state, epsilon):
        """Pick an action epsilon-greedily with respect to ``Q[state]``.

        With probability ``1 - epsilon`` the action with the largest value is
        returned; otherwise an action is drawn uniformly from ``range(nA)``.
        """
        if random.random() > epsilon:  # greedy branch, taken with probability 1 - epsilon
            return np.argmax(self.Q[state])
        return np.random.choice(self.nA)

    def step(self, state, action, reward, next_state, done, epsilon):
        """Update ``Q[state][action]`` from one observed transition.

        The target is ``reward`` for a terminal transition, otherwise
        ``reward + gamma * Q[next_state][a']`` where ``a'`` is sampled
        epsilon-greedily at ``next_state``. The sampled ``a'`` is used only for
        the target; it is neither stored nor returned.
        """
        old_q = self.Q[state][action]
        if done:
            # No bootstrapping past the end of the episode.
            target = reward
        else:
            next_action = self.select_action(next_state, epsilon)
            # Discount the bootstrapped value, not the immediate reward.
            target = reward + self.gamma * self.Q[next_state][next_action]
        self.Q[state][action] += self.alpha * (target - old_q)

In [4]:
# Build the environment and a fresh agent, then train with interact()'s defaults.
# NOTE(review): newer gym releases appear to register this environment as
# 'Taxi-v3' -- confirm the id against the installed gym version.
env = gym.make('Taxi-v2')
# Size the Q-table rows from the environment's action space rather than relying
# on Agent's default nA=6 happening to match.
agent = Agent(nA=env.action_space.n)
avg_rewards, best_avg_reward = interact(env, agent)

Episode 20000/20000 || Best average reward 9.426



In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Plot the exponentiated moving-average rewards collected during training.
# NOTE(review): np.exp squashes low averages toward zero and stretches the
# highest ones; confirm this transform is intended. Plotting avg_rewards
# directly would show the raw learning curve.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(np.exp(avg_rewards))
ax.set_xlabel("Episodes since averaging began")
ax.set_ylabel("exp(moving-average reward)")
ax.set_title("Training progress")
plt.show()