In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from collections import defaultdict

import sys
if "../" not in sys.path:
  sys.path.append("../")

import gym
import plotting
from lib.envs.cliff_walking import CliffWalkingEnv
import itertools
matplotlib.style.use('ggplot')

In [2]:
# Instantiate the cliff-walking environment imported from lib.envs.cliff_walking.
env = CliffWalkingEnv()

In [12]:
# Epsilon-greedy action probabilities computed from the sum of two Q-tables (Q1 + Q2)

def random_epsilon_greedy_policy(Q1, Q2, epsilon, state, nA):
    """Return epsilon-greedy action probabilities for ``state``.

    The greedy action is the argmax of ``Q1[state] + Q2[state]``; when several
    actions tie, ``np.argmax`` selects the lowest index.

    Args:
        Q1: Mapping from state to an array of action values.
        Q2: Second mapping from state to an array of action values.
        epsilon: Probability mass spread uniformly over all ``nA`` actions.
        state: Key used to index both ``Q1`` and ``Q2``.
        nA: Number of actions.

    Returns:
        A float array of length ``nA`` in which every action has probability
        ``epsilon / nA`` and the greedy action has an extra ``1 - epsilon``.
    """
    # Start from the uniform exploration share for every action.
    uniform_base = np.ones(nA, dtype=float)
    action_probs = uniform_base * epsilon / nA

    # The greedy choice is made on the combined estimate of both tables.
    combined_values = Q1[state] + Q2[state]
    greedy_index = np.argmax(combined_values)

    action_probs[greedy_index] = action_probs[greedy_index] + (1.0 - epsilon)
    return action_probs

In [23]:
def double_q_learning(env, num_episodes, discount=1.0, alpha=0.5, epsilon=0.1, debug=False):
    """Tabular Double Q-learning with an epsilon-greedy behaviour policy.

    Two action-value tables are maintained. Actions are chosen epsilon-greedily
    with respect to their sum (via ``random_epsilon_greedy_policy``). After
    every step, ``np.random.choice([0, 1])`` picks which table is updated: the
    updated table chooses the greedy next action and the other table supplies
    that action's value for the target.

    Args:
        env: Environment exposing ``action_space.n``, ``reset()`` returning a
            state, and ``step(action)`` returning a 4-tuple
            ``(next_state, reward, end, info)``.
        num_episodes: Number of episodes to run.
        discount: Discount factor applied to the bootstrapped next-state value.
        alpha: Step size of the temporal-difference update.
        epsilon: Exploration rate passed to the epsilon-greedy policy.
        debug: When true, print a progress line every 100 episodes.

    Returns:
        A tuple ``(Q, episode_lengths, episode_rewards)``:
        ``Q`` maps state -> array of action values, holding the average of the
        two tables for every (state, action) pair that was updated;
        ``episode_lengths`` maps the 1-based episode number -> number of steps
        taken in that episode; ``episode_rewards`` maps the 1-based episode
        number -> sum of rewards received in that episode.
    """
    Q1 = defaultdict(lambda: np.zeros(env.action_space.n, dtype=float))
    Q2 = defaultdict(lambda: np.zeros(env.action_space.n, dtype=float))
    Q = defaultdict(lambda: np.zeros(env.action_space.n, dtype=float))

    episode_lengths = defaultdict(float)
    episode_rewards = defaultdict(float)

    for i_episode in range(1, num_episodes+1):

        if debug and i_episode % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes))

        state = env.reset()
        for t in itertools.count():
            action_probs = random_epsilon_greedy_policy(Q1, Q2, epsilon, state, env.action_space.n)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)

            next_state, reward, end, _ = env.step(action)

            # Randomly pick the table to update; the other one evaluates the
            # greedy next action, which decouples selection from evaluation.
            if np.random.choice([0,1]) == 0:
                learner, evaluator = Q1, Q2
            else:
                learner, evaluator = Q2, Q1

            greedy_next_action = np.argmax(learner[next_state])
            td_target = reward + discount*evaluator[next_state][greedy_next_action]
            learner[state][action] += alpha * (td_target - learner[state][action])

            episode_rewards[i_episode] += reward
            # itertools.count() starts at 0, so after the step with index t the
            # episode has taken t + 1 steps.
            episode_lengths[i_episode] = t + 1

            # Keep the returned table in sync with the mean of both estimates.
            Q[state][action] = (Q1[state][action] + Q2[state][action])/2
            if end:
                break
            state = next_state

    return Q, episode_lengths, episode_rewards

In [24]:
# Run Double Q-learning for 500 episodes; debug=True prints a progress line every 100 episodes.
Q, episode_lengths, episode_rewards = double_q_learning(env, 500, debug=True)

Episode 100/500.
Episode 200/500.
Episode 300/500.
Episode 400/500.
Episode 500/500.


In [25]:
# Display the learned action-value table (state -> array of per-action values).
Q

defaultdict(<function __main__.<lambda>>,
            {0: array([-86.25282999, -23.44744607, -67.90837437, -85.79483728]),
             1: array([-82.84831388, -13.01172846, -83.37924278, -83.83748805]),
             2: array([-22.707259  , -12.        , -27.93397581, -18.3378195 ]),
             3: array([-14.16415677, -11.        , -38.10387981, -26.53612156]),
             4: array([-22.10994281, -10.        , -25.54909169, -17.74410247]),
             5: array([-20.01943094, -58.29430647,  -9.        , -43.71043444]),
             6: array([-53.84787044, -24.23836684, -65.35721588, -66.69837795]),
             7: array([-61.03635374, -58.99763827,  -7.10116709, -60.82662748]),
             8: array([-53.6463428 , -54.50774581, -54.3417167 , -58.12313906]),
             9: array([-47.41744092, -47.96204128, -42.73591552, -47.78883778]),
             10: array([-40.58736277, -38.45627719, -38.64294958, -42.33035229]),
             11: array([-32.491102  , -34.57036495, -32.72227465, 