In [1]:
import gym
from gym.envs.registration import register, spec

# Register a non-slippery 4x4 FrozenLake variant under a custom id, unless an
# environment with that id is already registered (e.g. when this cell is re-run
# in the same kernel).
MY_ENV_NAME = 'FrozenLakeNonSlippery-v0'
try:
    spec(MY_ENV_NAME)
except gym.error.Error:
    # Only gym's own lookup errors mean "not registered yet"; anything else
    # (KeyboardInterrupt, typos, import problems) should surface instead of
    # being silently turned into a registration attempt.
    register(
        id=MY_ENV_NAME,
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name': '4x4', 'is_slippery': False},
    )
FLenv = gym.make(MY_ENV_NAME)

INFO:gym.envs.registration:Making new env: FrozenLakeNonSlippery-v0
[2016-09-01 09:55:38,771] Making new env: FrozenLakeNonSlippery-v0


### Given a policy, determine its action-value function

In [2]:
from collections import defaultdict
import numpy as np
from copy import deepcopy 
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Uniform-random policy: a state's entry is created on first lookup and gives
# every action of FLenv the same probability.
policy = defaultdict(
    lambda: np.full(FLenv.action_space.n, 1.0 / FLenv.action_space.n, dtype=float)
)

In [4]:
def compute_action_value(policy = policy, env = FLenv, gamma = 1, 
                         max_episodes = 10, epsilon = 0.01):
    """Estimate the action values of ``policy`` by every-visit Monte Carlo.

    Runs ``max_episodes`` episodes in ``env``, sampling each action from
    ``policy[state]``, and records the discounted return that followed every
    visited (state, action) pair.

    Parameters:
        policy: mapping from state to a vector of action probabilities.
        env: environment exposing ``reset()`` and ``step(action)`` returning
            ``(next_state, reward, done, info)``.
        gamma: discount factor applied to future rewards.
        max_episodes: number of episodes to sample.
        epsilon: unused; accepted only so existing callers keep working.

    Returns:
        dict mapping ``(state, action)`` to the mean of all returns observed
        after that pair. Pairs that were never visited are absent.
    """
    observed_returns = defaultdict(list)

    for _ in range(max_episodes):
        done = False
        state = env.reset()
        states = []
        actions = []
        rewards = []

        while not done:
            states.append(state)

            # Draw one action index from the policy's distribution for this state.
            action = np.random.multinomial(1, policy[state]).argmax()
            state, reward, done, _ = env.step(action)

            actions.append(action)
            rewards.append(reward)

        # Accumulate the discounted return backwards from the end of the
        # episode, then flip it so returns[t] lines up with states[t].
        return_so_far = 0
        returns = []
        for reward in reversed(rewards):
            return_so_far = reward + gamma * return_so_far
            returns.append(return_so_far)
        returns.reverse()

        # Every visit counts: a pair seen several times in one episode
        # contributes one return per occurrence.
        for state, action, return_so_far in zip(states, actions, returns):
            observed_returns[(state, action)].append(return_so_far)

    return {key: np.mean(value) for key, value in observed_returns.items()}

#### How can we check that the implementation of `compute_action_value` is correct?

In [66]:
def compute_action_value_theoretically(policy = policy, behavior_policy = policy, env = FLenv, gamma = 1, 
                                       max_episodes = 10, epsilon = 0.01):
    """Estimate the action values of ``policy`` with one-step expected backups.

    Episodes are generated by sampling actions from ``behavior_policy``. After
    every transition the entry for the visited pair is overwritten with
    ``reward + gamma * sum_a policy[next_state][a] * q[next_state, a]``.
    There is no step size: each backup fully replaces the old value.
    NOTE(review): a full overwrite is only a sound estimate when transitions
    and rewards are deterministic -- confirm for the environment passed in.

    Parameters:
        policy: target policy, mapping state to a vector of action probabilities.
        behavior_policy: policy used to pick actions while exploring.
        env: environment exposing ``reset()``, ``step(action)`` and
            ``action_space.n``.
        gamma: discount factor.
        max_episodes: number of episodes to sample.
        epsilon: unused; accepted only so existing callers keep working.

    Returns:
        ``defaultdict(int)`` keyed by ``(state, action)``. Looking up the
        successor values inserts zero entries, so successor states (including
        terminal ones) appear in the result with value 0 until backed up.
    """
    q = defaultdict(int)
    action_n = env.action_space.n

    for _ in range(max_episodes):
        done = False
        state = env.reset()

        while not done:
            action = np.random.multinomial(1, behavior_policy[state]).argmax()
            next_state, reward, done, _ = env.step(action)

            # Expected value of the successor state under the target policy.
            next_action_values = [q[next_state, a] for a in range(action_n)]
            q[state, action] = reward + gamma * policy[next_state].dot(next_action_values)

            state = next_state

    return q

In [7]:
# Monte Carlo estimate for the default (uniform-random) policy on FLenv, from 2000 sampled episodes.
q_grid_world = compute_action_value(max_episodes=2000)

In [8]:
# Reference estimate for the same default policy from one-step expected backups, using 20000 episodes.
q_grid_world_theoretical = compute_action_value_theoretically(max_episodes=20000)

### Computing the difference between the theoretical and the every-visit Monte Carlo estimates

In [9]:
# Absolute gap per (state, action) pair of the theoretical table; a pair the
# Monte Carlo run never visited is compared against 0.
diff_array = []
for key, value in q_grid_world_theoretical.items():
    diff_array.append(abs(q_grid_world.get(key, 0) - value))

diff_theo_prac_mean = np.mean(diff_array)
diff_theo_prac_max = np.max(diff_array)

In [10]:
# The parenthesized single-argument form behaves the same as a Python 2 print
# statement and as the Python 3 print function.
print("The mean difference between the theoretical and practical estimate is {:0.4f}".format(diff_theo_prac_mean))
print("The max difference between the theoretical and practical estimate is {:0.4f}".format(diff_theo_prac_max))

The mean difference between the theoretical and practical estimate is 0.0128
The max difference between the theoretical and practical estimate is 0.1488


### Implementation of Retrace($\lambda$)

In [11]:
def q_differences(q_1, q_2):
    """Print the mean and max absolute difference between two action-value tables.

    The comparison runs over the keys of ``q_2``; a key missing from ``q_1``
    is treated as having value 0. Nothing is returned.

    Parameters:
        q_1: mapping from key to value, looked up with ``.get(key, 0)``.
        q_2: mapping from key to value whose items drive the comparison.
    """
    diff_array = [abs(q_1.get(key, 0) - value) for key, value in q_2.items()]
    diff_mean, diff_max = np.mean(diff_array), np.max(diff_array)
    # The parenthesized single-argument form works under both Python 2 and Python 3.
    print("The mean difference between the theoretical and practical estimate is {:0.4f}".format(diff_mean))
    print("The max difference between the theoretical and practical estimate is {:0.4f}".format(diff_max))

In [12]:
def compute_traces(states, actions, target_policy, behavior_policy, lambda_):
    """Return the per-step trace coefficients ``lambda_ * min(ratio, 1)``.

    For each (state, action) pair, ``ratio`` is the probability of the action
    under ``target_policy`` divided by its probability under
    ``behavior_policy``; the ratio is capped at 1 before scaling by ``lambda_``.

    Parameters:
        states, actions: equally long sequences describing one episode.
        target_policy, behavior_policy: indexable as ``policy[state][action]``.
        lambda_: scaling factor applied to every truncated ratio.

    Returns:
        list with one coefficient per (state, action) pair, in episode order.
    """
    def truncated_ratio(state, action):
        ratio = target_policy[state][action]/behavior_policy[state][action]
        return lambda_*min(ratio, 1)

    return [truncated_ratio(state, action) for state, action in zip(states, actions)]

In [13]:
def compute_targets(q, states, actions, rewards, gamma, target_policy, action_n):
    """Return the one-step expected TD error for every step of an episode.

    For every step but the last the error is
    ``reward + gamma * sum_a target_policy[next_state][a] * q[next_state, a]
    - q[state, action]``. The last step has no successor in ``states``, so
    its error is simply ``rewards[-1] - q[states[-1], actions[-1]]``.

    Parameters:
        q: mapping indexed as ``q[state, action]``. If it is a ``defaultdict``
            the look-ups performed here insert missing entries.
        states, actions, rewards: equally long sequences for one episode.
        gamma: discount factor.
        target_policy: mapping from state to an array of action probabilities
            (must support ``.dot``).
        action_n: number of actions; actions are ``0 .. action_n - 1``.

    Returns:
        list of TD errors, one per step; ``[]`` for an empty episode.
    """
    if len(rewards) == 0:
        # Nothing happened in the episode, so there is nothing to correct.
        return []

    deltas = []
    for state, next_state, action, reward in zip(states[:-1], states[1:], actions[:-1], rewards[:-1]):
        future_reward = target_policy[next_state].dot([q[next_state, a] for a in range(action_n)])
        deltas.append(reward + gamma * future_reward - q[state, action])

    # Final transition: no bootstrap term.
    deltas.append(rewards[-1] - q[states[-1], actions[-1]])
    return deltas

In [14]:
def compute_one_n_step(gamma, targets, traces):
    """Fold a window of per-step terms into one discounted, trace-weighted sum.

    The k-th entry of ``targets`` is weighted by ``gamma ** k`` times the
    product of the first k entries of ``traces``; the first entry therefore
    always has weight 1.

    Parameters:
        gamma: discount factor.
        targets: sequence of per-step terms to combine.
        traces: list of trace coefficients. It is padded with one trailing 1,
            so it may be one element shorter than ``targets``.

    Returns:
        the weighted sum (0 when ``targets`` is empty).
    """
    padded_traces = traces + [1]
    total = 0
    discount = 1
    trace_product = 1
    for step, term in enumerate(targets):
        weight = discount*trace_product
        total += weight*term
        # Prepare the weight of the next term.
        discount *= gamma
        trace_product *= padded_traces[step]
    return total

In [15]:
def compute_n_step_target(q, states, actions, rewards, gamma, lambda_,
                          target_policy, behavior_policy, action_n, step_n):
    """Return one n-step correction per step of an episode.

    The correction for step ``i`` combines the TD errors of steps
    ``i .. i + step_n - 1`` (fewer near the end of the episode) via
    ``compute_one_n_step``, using the trace coefficients of the steps that
    follow ``i``.

    Parameters:
        q: action-value table indexed as ``q[state, action]``.
        states, actions, rewards: equally long sequences for one episode.
        gamma: discount factor.
        lambda_: trace scaling factor passed on to ``compute_traces``.
        target_policy, behavior_policy: indexable as ``policy[state][action]``.
        action_n: number of actions.
        step_n: maximum number of TD errors combined per correction.

    Returns:
        list of corrections, aligned with ``states`` / ``actions``.
    """
    traces = compute_traces(states, actions, target_policy, behavior_policy, lambda_)
    targets = compute_targets(q, states, actions, rewards, gamma, target_policy, action_n)
    # The first TD error of each window always has weight 1, so its own trace
    # coefficient (index i) is skipped and the slice starts at i + 1.
    return [compute_one_n_step(gamma, targets[i:i+step_n], traces[i+1:i+step_n])
            for i in range(len(targets))]

In [16]:
def retrace(target_policy, behavior_policy, env, lambda_, step_n, alpha, gamma, max_episodes):
    """Evaluate ``target_policy`` from episodes generated by ``behavior_policy``.

    After each complete episode, every visited (state, action) entry is moved
    by ``alpha`` times its n-step correction from ``compute_n_step_target``.
    ``alpha`` stays constant for the whole run.

    Parameters:
        target_policy: policy being evaluated; indexable as ``policy[state][action]``.
        behavior_policy: policy used to sample actions; ``policy[state]`` must
            be a probability vector.
        env: environment exposing ``reset()``, ``step(action)`` and
            ``action_space.n``.
        lambda_: trace scaling factor.
        step_n: maximum number of TD errors combined per correction.
        alpha: step size.
        gamma: discount factor.
        max_episodes: number of episodes to sample.

    Returns:
        ``defaultdict(int)`` keyed by ``(state, action)``.
    """
    q = defaultdict(int)
    action_n = env.action_space.n

    for _episode in range(max_episodes):
        done = False
        state = env.reset()
        states = []
        actions = []
        rewards = []

        while not done:
            states.append(state)

            action = np.random.multinomial(1, behavior_policy[state]).argmax()
            state, reward, done, _ = env.step(action)

            actions.append(action)
            rewards.append(reward)

        # Work on a copy: the look-ups made while computing the corrections
        # would otherwise insert zero entries into q for unvisited pairs.
        n_step_corrections = compute_n_step_target(deepcopy(q), states, actions, rewards, gamma, lambda_,
                                                   target_policy, behavior_policy, action_n, step_n)

        # All corrections were computed from the pre-episode table; apply them together.
        for state, action, correction in zip(states, actions, n_step_corrections):
            q[(state, action)] += alpha*correction

    return q
        

### Assertions

In [20]:
# Hand-checkable fixtures: two states, two actions.
t_policy = {
    0: np.array([0.4, 0.6]),
    1: np.array([0.6, 0.4]),
}
b_policy = {
    0: np.array([0.6, 0.4]),
    1: np.array([0.4, 0.6]),
}
q = {
    (0, 0): 1,
    (0, 1): 2,
    (1, 0): 3,
    (1, 1): 4,
}

# compute_traces: identical policies give ratio 1; otherwise the ratio is capped at 1.
assert compute_traces([0, 0], [0, 0], t_policy, t_policy, 1) == [1, 1]
assert np.allclose(compute_traces([0, 0], [0, 0], t_policy, b_policy, 1), [2/3., 2/3.])
assert np.allclose(compute_traces([0, 0], [0, 1], t_policy, b_policy, 1), [2/3., 1.])
assert np.allclose(compute_traces([1, 0], [0, 1], t_policy, b_policy, 1), [1., 1.])

# compute_targets: expected one-step TD errors, no bootstrap on the last step.
assert np.allclose(compute_targets(q, [0, 0], [0, 0], [0, 0], 1, t_policy, 2), [0.6, -1])
assert np.allclose(compute_targets(q, [0, 1], [0, 0], [0, 0], 1, t_policy, 2), [2.4, -3])

# compute_one_n_step: discounted, trace-weighted sum.
assert np.allclose(compute_one_n_step(1, [1, 2, 3], [1, 2]), 9)
assert np.allclose(compute_one_n_step(0.1, [1, 2, 3], [1, 2]), 1.26)

# compute_n_step_target: with step_n = 1 the corrections are the plain TD errors.
assert np.allclose(
    compute_n_step_target(q, [0, 0], [0, 0], [0, 0], 1, 1, t_policy, b_policy, 2, 1),
    compute_targets(q, [0, 0], [0, 0], [0, 0], 1, t_policy, 2),
)
# With step_n = 2 the first correction also picks up the trace-weighted second TD error.
tmp_traces = compute_traces([0, 0], [0, 0], t_policy, b_policy, 1)
tmp_targets = compute_targets(q, [0, 0], [0, 0], [0, 0], 1, t_policy, 2)
assert np.allclose(
    compute_n_step_target(q, [0, 0], [0, 0], [0, 0], 1, 1, t_policy, b_policy, 2, 2),
    [tmp_targets[0] + tmp_traces[1]*tmp_targets[1], tmp_targets[1]],
)

### Checking whether Retrace($\lambda$) works in the online setting (target policy = behavior policy)

In [21]:
# Target and behavior policy are the same uniform-random policy, so every probability ratio is 1
# and, with lambda_ = 1, every trace coefficient is 1.
retrace_q = retrace(policy, policy, FLenv, lambda_ = 1, step_n = 100, alpha = 0.01, gamma = 1, max_episodes = 40000)

In [22]:
# Compared over the keys of retrace_q; keys missing from the theoretical table count as 0.
q_differences(q_grid_world_theoretical, retrace_q) 

The mean difference between the theoretical and practical estimate is 0.0001
The max difference between the theoretical and practical estimate is 0.0029


### Checking Retrace($\lambda$) in the offline setting (target policy $\neq$ behavior policy)

In [23]:
# State-independent policy over four actions: probability 0.4 on action index 3 and 0.2 on each of the others.
a_fixed_policy = defaultdict(lambda: np.array([0.2, 0.2, 0.2, 0.4]))

In [24]:
# Evaluate a_fixed_policy (target) from episodes sampled with the uniform-random policy (behavior).
retrace_q_fixed_policy = retrace(a_fixed_policy, policy, FLenv, lambda_=1, step_n=100, alpha=0.01, gamma=1, 
                                     max_episodes=40000)

In [25]:
# Two independent estimates for a_fixed_policy:
# - one-step expected backups, exploring with the default (uniform-random) behavior policy;
# - Monte Carlo returns from episodes that follow a_fixed_policy itself.
theoretical_q_fixed_policy = compute_action_value_theoretically(policy=a_fixed_policy, max_episodes=2000)
every_visit_mc_q_fixed_policy = compute_action_value(policy=a_fixed_policy, max_episodes=10000)

In [26]:
# Backup-based estimate vs. Monte Carlo estimate, compared over the Monte Carlo table's keys.
q_differences(theoretical_q_fixed_policy, every_visit_mc_q_fixed_policy) 

The mean difference between the theoretical and practical estimate is 0.0059
The max difference between the theoretical and practical estimate is 0.0762


In [28]:
# Retrace estimate vs. backup-based estimate, compared over the backup-based table's keys.
q_differences(retrace_q_fixed_policy, theoretical_q_fixed_policy) 

The mean difference between the theoretical and practical estimate is 0.0001
The max difference between the theoretical and practical estimate is 0.0041


In [29]:
# Retrace estimate vs. Monte Carlo estimate, compared over the Monte Carlo table's keys.
q_differences(retrace_q_fixed_policy, every_visit_mc_q_fixed_policy) 

The mean difference between the theoretical and practical estimate is 0.0060
The max difference between the theoretical and practical estimate is 0.0763


### Checking Retrace($\lambda$) with the optimal policy

#### Finding the optimal policy

In [30]:
from q_learning import TabularQAgent

In [39]:
# Train a tabular Q-learning agent on FLenv and report how it performs.
# TabularQAgent comes from the local q_learning module, which is not shown here;
# NOTE(review): eps presumably is the exploration rate and init_std the spread of the
# initial Q values -- confirm against q_learning.py.
q_agent = TabularQAgent(FLenv.observation_space, FLenv.action_space, init_std=0, discount=0.9, 
                        n_episodes=2000, eps=0.4)
q_agent.learn(FLenv)
q_agent.accuracy(FLenv, 100)

The algorithm reached to goal 442.0 times in 2000 number of episodes during learning phase.
The average reward in 100 episodes is 100.0


In [54]:
# Start from a uniform-random fallback: any state without an explicit entry
# gets equal probability for each of FLenv's actions on first lookup.
optimal_policy = defaultdict(
    lambda: np.full(FLenv.action_space.n, 1.0 / FLenv.action_space.n, dtype=float)
)
def feature_encoder(obs, state_space_n):
    """Return a 1-D float vector of length ``state_space_n`` that is 1.0 at index ``obs`` and 0.0 elsewhere."""
    one_hot = np.zeros(state_space_n, dtype = float)
    one_hot[obs] = 1.0
    return one_hot
# For every state in the agent's Q table, put all probability on the action with the largest Q value
# (one-hot vector as long as that state's Q-value array).
optimal_policy.update({key: feature_encoder(int(np.argmax(value)), value.shape[0]) 
                       for key, value in q_agent.q.items()})

#### Value of the optimal policy

In [67]:
# Backup-based estimate for optimal_policy, exploring with the uniform-random policy.
theoretical_q_optimal_policy = compute_action_value_theoretically(policy=optimal_policy, behavior_policy=policy,
                                                                  max_episodes=2000)

In [59]:
# Monte Carlo roll-outs follow optimal_policy itself, so only the (state, action) pairs it visits get an estimate.
every_visit_mc_q_optimal_policy = compute_action_value(policy=optimal_policy, max_episodes=2000)

In [61]:
# Display the Monte Carlo estimate.
every_visit_mc_q_optimal_policy

{(0, 1): 1.0,
 (4, 1): 1.0,
 (8, 2): 1.0,
 (9, 2): 1.0,
 (10, 1): 1.0,
 (14, 2): 1.0}

In [62]:
# Evaluate optimal_policy (target) from episodes sampled with the uniform-random policy (behavior).
retrace_q_optimal_policy = retrace(optimal_policy, policy, FLenv, lambda_=1, step_n=100, alpha=0.01, gamma=1, 
                                     max_episodes=40000)

In [64]:
# Show the Retrace estimate for exactly the pairs that the Monte Carlo roll-outs visited.
dict((key, retrace_q_optimal_policy[key]) for key in every_visit_mc_q_optimal_policy)

{(0, 1): 0.98937862863198855,
 (4, 1): 0.98942303985976499,
 (8, 2): 0.98950747505041425,
 (9, 2): 0.9900442026851487,
 (10, 1): 0.99270861749730843,
 (14, 2): 0.9958199540519876}

In [70]:
# Backup-based estimate vs. Retrace estimate for optimal_policy, compared over the Retrace table's keys.
q_differences(theoretical_q_optimal_policy, retrace_q_optimal_policy)

The mean difference between the theoretical and practical estimate is 0.0142
The max difference between the theoretical and practical estimate is 0.0842
