# In [1]:
import numpy as np

class TD:
    """
    Tabular TD(0) estimator of a state-value function.

    Values are kept in the dictionary ``V`` (state -> estimated value) and
    are moved towards ``reward + gamma * V[next_state]`` by a step of size
    ``alpha`` on every call to :meth:`update`.
    """

    def __init__(self, states, alpha=0.1, gamma=0.9):
        """
        Initialize the TD(0) algorithm.
        :param states: Iterable of the states known up front (must be hashable).
        :param alpha: Learning rate.
        :param gamma: Discount factor.
        """
        self.states = states
        self.alpha = alpha
        self.gamma = gamma
        # Every known state starts with a value estimate of zero.
        self.V = {state: 0.0 for state in states}

    def update(self, state, reward, next_state):
        """
        Perform a single TD(0) update.

        States that were not passed to the constructor are treated as having
        value 0, matching :meth:`get_value`, instead of raising ``KeyError``.
        An unseen ``state`` is added to ``V`` by the update; an unseen
        ``next_state`` is only read, not stored.

        :param state: Current state.
        :param reward: Reward received after transitioning from the current state.
        :param next_state: Next state.
        :return: The TD error used for this update.
        """
        current_value = self.V.get(state, 0.0)
        # Bootstrap from the current estimate of the successor state.
        td_target = reward + self.gamma * self.V.get(next_state, 0.0)
        td_error = td_target - current_value
        self.V[state] = current_value + self.alpha * td_error
        return td_error

    def get_value(self, state):
        """
        Get the value of a state.
        :param state: The state for which the value is requested.
        :return: Value of the state, or 0 if the state has never been stored.
        """
        return self.V.get(state, 0.0)

# Example Usage
if __name__ == "__main__":
    # Demo: learn state values from two short hand-written episodes.

    # Each episode is a sequence of (state, reward, next_state) transitions.
    episodes = [
        [('A', 1, 'B'), ('B', 1, 'C'), ('C', 1, 'Terminal')],
        [('A', 1, 'B'), ('B', 1, 'Terminal')],
    ]

    # 'D' never appears in any transition, so its estimate stays at zero.
    states = ['A', 'B', 'C', 'D', 'Terminal']
    td_agent = TD(states)

    # Replay every transition in order, one TD(0) update per transition.
    for episode in episodes:
        for transition in episode:
            td_agent.update(*transition)

    # Report the learned estimate for every state, two decimals each.
    for state in states:
        print(f"Value of {state}: {td_agent.get_value(state):.2f}")

# Output:
# Value of A: 0.20
# Value of B: 0.19
# Value of C: 0.10
# Value of D: 0.00
# Value of Terminal: 0.00
