In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from collections import defaultdict

import sys
if "../" not in sys.path:
  sys.path.append("../")

import gym
import plotting
from lib.envs.gridworld import GridworldEnv
import itertools
matplotlib.style.use('ggplot')

In [2]:
env = GridworldEnv()

In [3]:
#Create an initial epsilon soft policy
def epsilon_greedy_policy(Q, epsilon, state, nA):
    A = np.ones(nA, dtype=float) * epsilon / nA
    best_action = np.argmax(Q[state])
    A[best_action] += (1.0 - epsilon)
    return A

In [4]:
def online_q_learning_lambda(env, num_episodes, discount=1.0, epsilon=0.1, alpha=0.5, lbda=0.9, debug=False):
    
    Q = defaultdict(lambda: np.zeros(env.action_space.n, dtype=float))
    
    for i_episode in range(1, num_episodes+1):
        
        if debug:
            if i_episode % 1000 == 0:
                print("\rEpisode {}/{}.".format(i_episode, num_episodes))
                
        E = {key:np.zeros(4, dtype=int) for key in np.arange(16)}
        state = env.reset()
        action_probs = epsilon_greedy_policy(Q, epsilon, state, env.nA)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        for t in itertools.count():
            next_state, reward, end, _ = env.step(action)
            
            next_action_probs = epsilon_greedy_policy(Q, epsilon, next_state, env.nA)
            next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)
            
            best_greedy_action = np.argmax(Q[next_state][:])
            delta = reward + discount*Q[next_state][best_greedy_action] - Q[state][action]
            E[state][action] += 1
            
            for s in E.keys():
                for a in E[s]:
                    Q[s][a] += alpha*delta*E[s][a]
                    
                    if next_action == best_greedy_action:
                        E[s][a] = discount*lbda*E[s][a]
                    else:
                        E[s][a] = 0
                    
            if end:
                break
                
            state = next_state
            action = next_action
            
    return Q    

In [5]:
Q = online_q_learning_lambda(env, num_episodes=10000, debug=True)

Episode 1000/10000.
Episode 2000/10000.
Episode 3000/10000.
Episode 4000/10000.
Episode 5000/10000.
Episode 6000/10000.
Episode 7000/10000.
Episode 8000/10000.
Episode 9000/10000.
Episode 10000/10000.


In [6]:
Q

defaultdict(<function __main__.<lambda>>,
            {0: array([ 0.,  0.,  0.,  0.]),
             1: array([-1.   , -2.5  , -1.875,  0.   ]),
             2: array([-2.49999952, -2.99983215, -2.        , -1.5       ]),
             3: array([-2.99978638, -2.99218732, -2.        , -2.99999999]),
             4: array([-1. , -2. , -1. , -1.5]),
             5: array([-1.        , -2.99999988, -3.        , -2.9765625 ]),
             6: array([-2.484375, -2.      , -2.625   , -2.8125  ]),
             7: array([-3.        , -2.        , -1.        , -2.99981689]),
             8: array([-2.        , -3.        , -4.        , -2.57666016]),
             9: array([-2.       , -2.       , -2.3671875, -2.625    ]),
             10: array([-2.99993892, -1.        , -1.        , -2.953125  ]),
             11: array([-2., -1.,  0.,  0.]),
             12: array([-3.        , -3.        , -3.99451293, -3.        ]),
             13: array([-2.99964142, -2.        , -2.55859375, -2.953125  ]),
