# Development of TD($\lambda$) learner algorithm

In [1]:
from platform import python_version
python_version()

'3.7.4'

In [2]:
import time
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

### Based on Chapter 12: Eligibility Traces from Sutton & Barto

### TD($\lambda$)

<IMG SRC="images/tdlambda.png">

### Example 7.1 Random walk

In [3]:
from randomwalk import RandomWalkGame

In [4]:
game = RandomWalkGame(size=19, terminal_rewards={'T1': -1.0, 'T2': 1.0})

In [5]:
game.show_state()

T1 _ _ _ _ _ _ _ _ _ J _ _ _ _ _ _ _ _ _ T2


In [6]:
game.make_move((1, 'r'))
game.show_state()

T1 _ _ _ _ _ _ _ _ _ _ K _ _ _ _ _ _ _ _ T2


In [7]:
game.get_rewards()

{1: 0.0}

## Objective

For verification purposes, we want to replicate the results below which should match Figure 12.8 from the book which looks like this:

<IMG SRC="images/fig_12_8.png">    

## N-Step TD Update

In [30]:
def discounted_sequence_generator(sequence, gamma):
    """Returns a generator that yields each item of sequence
    discounted by gamma at each time-step.
    
    Example:
    >>> list(discounted_sequence_generator([10.0]*5, 0.75))
    [10.0, 7.5, 5.625, 4.21875, 3.1640625]
    """
    
    x = 1.0
    for item in sequence:
        yield x*item
        x = x*gamma


def td_n_step_update(value_function, prev_states, prev_rewards, tau, gamma,
                     t_max=np.inf, show=False):
    """Updates the value in value_function for the past state
    that occurred in timestep tau using the n-step TD
    algorithm.
    
    Note:
    tau = t - n + 1

    Where:
        t is the current timestep
        n is the lambda value  

    If tau < 0 no update is possible so none is made.

    Args:
        value_function (dict): Dictionary of state values.
        prev_states (list): List of previous states.
        prev_rewards (list): List of previous rewards.
        tau (int): The timestep of the state that will have
            its value updated.
        gamma (float): Discount rate.
        t_max (int or np.inf): The maximum timestep to include.  
            Set to np.inf or a high number if you want to
            include all timesteps from tau to tau + n.  Set
            to t if the game has eneded.
    """

    assert len(prev_states) == len(prev_rewards)

    if tau >= 0:

        assert tau < len(prev_states), "Not enough past states."

        discounted_rewards = discounted_sequence_generator(
            prev_rewards[tau+1:min(tau + n, t_max) + 1], 
            gamma
        )
        g = sum(list(discounted_rewards))

        if tau + n < t_max:
            g += gamma**n * value_function[prev_states[tau + n]]

        # Update value of state at timestep tau
        state_key = prev_states[tau]
        state_value = value_function[state_key]

        if show:
            print(f"prev_states[{tau}]: {state_value}")

        value_function[state_key] = state_value + \
            learning_rate * (g - state_value)

        if show:
            print(f"value_function[{state_key.__repr__()}]: {value_function[state_key]}")


## TD($\lambda$)

In [97]:
class ValueApproxFunction():
    
    def __init__(self, game):

        self.states = [s for s in game.states if not s in game.terminal_states]
        self.terminal_states = game.terminal_states  # TODO: Should save state keys here
        self.n_states = len(self.states)
        self.weights = np.zeros(2)  # Linear function approximator
        self.z = np.full_like(self.weights, 0)  # Eligibility trace vector

    def input_value(self, state_key):
        """Convert discrete state into a linear feature
        value
        
        returns:
            x (float): (0 <= x <= 1).
        """
        return self.states.index(state_key) / (self.n_states - 1)
    
    def value(self, state_key):
        if state_key in self.terminal_states:
            return 0.0
        x = self.input_value(state_key)
        return self.weights[0] + self.weights[1] * x

    def dw(self, state_key):
        """Partial derivatives of value function w.r.t.
        weights at given state."""
        if state_key in self.terminal_states:
            return np.array([1.0, 0.0])        
        x = self.input_value(state_key)
        return np.array([1.0, x])

value_function = ValueApproxFunction(game)
pd.Series({s: value_function.input_value(s) for s in value_function.states})

A    0.000000
B    0.055556
C    0.111111
D    0.166667
E    0.222222
F    0.277778
G    0.333333
H    0.388889
I    0.444444
J    0.500000
K    0.555556
L    0.611111
M    0.666667
N    0.722222
O    0.777778
P    0.833333
Q    0.888889
R    0.944444
S    1.000000
dtype: float64

In [98]:
value_function.weights

array([0., 0.])

In [99]:
value_function.value('J')

0.0

In [100]:
value_function.dw('J')

array([1. , 0.5])

In [109]:
def td_lambda_update(value_function, prev_state_key, state_key, reward, t, lam, gamma, show=False):
    """Updates the value in value_function for the past state
    using the TD-Lambda algorithm.
    
    Args:
        value_function (ValueApproxFunction): Value function approximator.
        state (str): State key.
        reward (float): Reward value.
        z (array): Eligibility trace vector.
        t (int): The timestep of the state that will have
            its value updated.
        gamma (float): Discount factor.
    """

    value_function.z = gamma * lam * value_function.z + value_function.dw(state_key)
    current_state_value = value_function.value(state_key)
    prev_state_value = value_function.value(prev_state_key)
    td_error = reward + gamma * prev_state_value - current_state_value
    value_function.weights += learning_rate * td_error * value_function.z

    if show:
        print(f"value_function.value({state_key.__repr__()}): {current_state_value}")

# Test 1 - Walk Right

## (a) N-Step TD Updates

In [110]:
# General TD parameters
gamma = 1.0
learning_rate = 0.1
initial_value = 0.0

# N-step TD parameters
n = 10
value_function = {state_key: initial_value for state_key in game.states}

# Initialize agent
state_key = game.generate_state_key(game.state, role)
prev_states = [state_key]
prev_rewards = [None]

# Environment setup
game = RandomWalkGame(size=19, terminal_rewards={'T1': -1.0, 'T2': 1.0})
actions = ['r']*10
role = 1

t = 0
while not game.game_over:

    # Behaviour policy
    move = (role, actions[t])
    
    # Save chosen after-state for learning updates later
    next_state = game.next_state(game.state, move)
    next_state_key = game.generate_state_key(next_state, role)

    game.make_move(move)
    assert game.state == next_state
    
    # Get rewards
    if not game.game_over:
        reward = game.get_rewards()[role]
    else:
        reward = game.get_terminal_rewards()[role]

    state_key = game.generate_state_key(game.state, role)
    prev_states.append(state_key)
    prev_rewards.append(reward)

    if not game.game_over:
        # Update the value for state in timestep tau
        tau = t - n + 1
        td_n_step_update(value_function, prev_states, prev_rewards, tau, gamma)

    else:
        # Complete final state-value updates for timesteps tau
        # to current (terminal) timestep
        T = t + 1
        for tau in range(t - n + 1, T):
            td_n_step_update(value_function, prev_states, prev_rewards, tau, gamma,
                             t_max=T)

    # Update timestep
    t += 1


assert game.game_over

pd.Series(value_function)

T1    0.0
A     0.0
B     0.0
C     0.0
D     0.0
E     0.0
F     0.0
G     0.0
H     0.0
I     0.0
J     0.0
K     0.1
L     0.1
M     0.1
N     0.1
O     0.1
P     0.1
Q     0.1
R     0.1
S     0.1
T2    0.1
dtype: float64

## (b) TD($\lambda$)

In [111]:
# General TD parameters
gamma = 1.0
learning_rate = 0.1
initial_value = 0.0

# TD-Lambda parameters
value_function = ValueApproxFunction(game)
lam = 0.5

# Initialize agent
state_key = game.generate_state_key(game.state, role)
prev_states = [state_key]
prev_rewards = [None]

In [113]:
# Environment setup
game = RandomWalkGame(size=19, terminal_rewards={'T1': -1.0, 'T2': 1.0})
actions = ['r']*10
role = 1

t = 0
while not game.game_over:

    # Behaviour policy
    move = (role, actions[t])
    game.make_move(move)

    # Get rewards
    #reward = game.get_rewards()[1]
    if not game.game_over:
        reward = game.get_rewards()[role]
    else:
        reward = game.get_terminal_rewards()[role]

    state = game.generate_state_key(game.state, role)
    prev_state = prev_states[-1]
    prev_states.append(state_key)
    prev_rewards.append(reward)

    if not game.game_over:
        # Update the value for state in timestep tau
        td_lambda_update(value_function, prev_state, state, reward, t, lam, gamma)

    else:
        # Complete state-value update for final timestep (to terminal state)
        T = t + 1
        td_lambda_update(value_function, prev_state, state, reward, T, lam, gamma)

    # Update timestep
    t += 1


assert game.game_over

values = {state: value_function.value(state) for state in game.states}
pd.Series(values)

T1    0.000000
A     0.223091
B     0.227498
C     0.231905
D     0.236312
E     0.240719
F     0.245126
G     0.249533
H     0.253940
I     0.258346
J     0.262753
K     0.267160
L     0.271567
M     0.275974
N     0.280381
O     0.284788
P     0.289195
Q     0.293602
R     0.298008
S     0.302415
T2    0.000000
dtype: float64