### TODO
* Explain: tabular
* Explain: Q-learning
* Explain: SARSA
* Explain: n-step methods
* Explain: cliff env.


### DONE
* Code: run and compare n-step SARSA and n-step Q-learning (on cliff)


### NOTES
* Example 6.6: Cliff Walking (Sutton & Barto, *Reinforcement Learning: An Introduction*, 2nd ed.)


# Tabular Methods for Reinforcement Learning


In [None]:
%matplotlib inline

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from IPython.display import clear_output
import numpy as np
import matplotlib.pyplot as plt

import utils
from cliff import Cliff
from agents import TabularNStepSARSA, TabularNStepQLearning

In [None]:
def _plot_environment(env, episode, step):
    """Redraw the environment's current rendering in the cell output."""
    with utils.RunningPlot():
        plt.figure(1, figsize=(4, 4))
        clear_output(True)
        plt.imshow(env.render())
        plt.title('Episode:{}, step: {}'.format(episode, step))


def _plot_training_summary(env, agent, title):
    """Draw the agent's best action values next to ``env.heat_map``."""
    with utils.RunningPlot():
        clear_output(wait=True)
        plt.figure(1, figsize=(8, 4))
        plt.suptitle(title, x=0.1, y=1, fontsize=20, horizontalalignment='left')

        plt.subplot(121)
        plt.title('Highest action value')
        # Max over the last axis of the Q-table: the best value per state.
        img1 = plt.imshow(np.max(agent.Qtable, -1))
        # `pyplot.axis` takes no `frameon` keyword; newer Matplotlib releases
        # reject it with a TypeError, so only the aspect mode is passed.
        plt.axis('equal')
        utils.colorbar(img1)

        plt.subplot(122)
        plt.title('Movement Heatmap')
        img2 = plt.imshow(env.heat_map)
        plt.axis('equal')
        utils.colorbar(img2)


def run_loop(env, agent, title, max_e=None):
    """Train ``agent`` on ``env`` and record per-episode statistics.

    Each step takes the action selected on the previous step, selects the
    action for the resulting state, and then calls ``agent.update`` with the
    full ``(s, a, r, s_, a_, d)`` transition.

    Args:
        env: Environment whose ``reset()`` and ``step(action)`` return a
            4-tuple unpacked here as ``(state, reward, done, _)``. It must
            also provide ``render()`` and ``heat_map`` for plotting.
        agent: Object with ``action(state)``, ``update(s, a, r, s_, a_, d)``
            and a ``Qtable`` array (only read for plotting).
        title: Figure title for the periodic summary plot.
        max_e: Number of episodes to run. If falsy (``None`` or ``0``) the
            loop never terminates on its own.

    Returns:
        Tuple ``(ep_lens, rewards)``: two lists with one entry per finished
        episode, holding the step count and the summed reward.
    """
    i = 0  # steps taken in the current episode
    e = 0  # number of finished episodes
    s, r, d, _ = env.reset()
    a_ = agent.action(s)
    ep_lens = []
    rewards = []
    r_sum = 0
    since_last_plot = 0

    while True:
        i += 1
        since_last_plot += 1
        a = a_
        s_, r, d, _ = env.step(a)
        a_ = agent.action(s_)

        agent.update(s=s, a=a, r=r, s_=s_, a_=a_, d=d)
        r_sum += r
        s = np.copy(s_)

        # Every 5000th episode is rendered step by step.
        if (e + 1) % 5000 == 0:
            _plot_environment(env, e, i)

        # An episode ends when the environment reports done, or is cut off
        # once it exceeds 1e6 steps.
        if d or i > 1e6:
            # Refresh the summary plot at most once per 1e4 steps.
            if since_last_plot > 1e4:
                since_last_plot = 0
                _plot_training_summary(env, agent, title)

            ep_lens.append(i)
            rewards.append(r_sum)
            r_sum = 0
            e += 1
            i = 0
            s, r, d, _ = env.reset()
            # Select the first action for the new start state. Without this
            # the next episode would open with the action that was chosen
            # for the previous episode's final state.
            a_ = agent.action(s)

        if max_e and e >= max_e:
            break

    return ep_lens, rewards

In [None]:
# Experiment configuration shared by the training cells below.
num_runs = 10  # independent training runs per algorithm (fresh agent each run)
eps_per_run = 500  # episodes per run; passed to run_loop as max_e
n = 2  # passed as n= to both n-step agents (presumably the n-step return length)

In [None]:
# n-step Q-learning experiment: train `num_runs` fresh agents, all on the same
# Cliff instance, and keep the per-episode reward sums of every run.
env = Cliff()
qlearning_title = 'TabularNstepQLearning, n=' + str(n)
TN_QLearning_rewards = []
for i in range(num_runs):
    TN_QLearning = TabularNStepQLearning(env.state_shape, env.num_actions, n=n)
    # run_loop returns (episode lengths, episode rewards); only rewards are kept.
    rewards = run_loop(env, TN_QLearning, qlearning_title, max_e=eps_per_run)[1]
    TN_QLearning_rewards.append(rewards)

# One row per run.
TN_QLearning_rewards = np.array(TN_QLearning_rewards)

In [None]:
# n-step SARSA experiment: train `num_runs` fresh agents, all on the same
# Cliff instance, and keep the per-episode reward sums of every run.
env = Cliff()
sarsa_title = 'TabularNstepSARSA, n=' + str(n)
TN_SARSA_rewards = []
for i in range(num_runs):
    TN_SARSA = TabularNStepSARSA(env.state_shape, env.num_actions, n=n)
    # run_loop returns (episode lengths, episode rewards); only rewards are kept.
    rewards = run_loop(env, TN_SARSA, sarsa_title, max_e=eps_per_run)[1]
    TN_SARSA_rewards.append(rewards)

# One row per run.
TN_SARSA_rewards = np.array(TN_SARSA_rewards)

In [None]:
# Draw the reward curves of both agents on one figure for comparison.
plt.figure()
for curve_args in (
        (TN_SARSA_rewards, 'TabularNstepSARSA', 'b'),
        (TN_QLearning_rewards, 'TabularNstepQLearning', 'r')):
    utils.reward_plotter(*curve_args)

# Restrict the y-axis to the [-100, 0] reward range.
axes = plt.gca()
axes.set_ylim([-100, 0])

plt.show()