In [24]:
import numpy as np

In [25]:
# Grid-world dimensions and TD(0) hyper-parameters
grid_size = 3  # Side length of the square grid
gamma = 0.9  # Discount factor
alpha = 0.1  # Learning rate

# Seed NumPy's global random generator so that every np.random call made
# after this cell yields the same sequence on each Restart & Run All.
seed = 0
np.random.seed(seed)

In [26]:
# Initialize state values: one estimate per grid cell, all starting at 0
V = np.zeros((grid_size, grid_size))

# Randomly select a goal state (not the start state)
start_state = (2, 0)
goal_state = start_state
while goal_state == start_state:
    # Redraw until the goal differs from the start. int() turns NumPy's
    # integer scalars into plain Python ints so the tuple prints as e.g.
    # (1, 2) on every NumPy version (NumPy 2 would otherwise show
    # (np.int64(1), np.int64(2))).
    goal_state = (
        int(np.random.randint(0, grid_size)),
        int(np.random.randint(0, grid_size)),
    )

# Reward setup: 1 at the goal cell, 0 everywhere else
rewards = np.zeros((grid_size, grid_size))
rewards[goal_state] = 1

# Actions: up, down, left, right (offsets added to a state's two indices)
actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]

In [27]:
def is_valid_state(state, size=None):
    """Return True when ``state`` lies inside the square grid.

    Parameters:
        state: indexable whose first two entries are the two grid indices;
            only ``state[0]`` and ``state[1]`` are examined.
        size: side length of the grid. When omitted, the module-level
            ``grid_size`` is used, so existing one-argument calls behave
            exactly as before.

    Returns:
        True if both indices are in ``[0, size)``, otherwise False.
    """
    if size is None:
        size = grid_size
    return 0 <= state[0] < size and 0 <= state[1] < size

In [28]:
def take_action(state, action):
    """Apply ``action`` to ``state`` and return the resulting state.

    The two entries of ``action`` are added to the first two entries of
    ``state``. If ``is_valid_state`` rejects the result, the move is
    cancelled and the original ``state`` is returned unchanged.
    """
    candidate = (state[0] + action[0], state[1] + action[1])
    return candidate if is_valid_state(candidate) else state

In [29]:
# Training loop: TD(0) value estimation while acting uniformly at random
n_episodes = 100

# Start from a fresh value table so that re-running this cell repeats the
# experiment from scratch instead of continuing to train the array left
# behind by a previous run of the cell.
V = np.zeros((grid_size, grid_size))

for episode in range(n_episodes):
    state = start_state  # Every episode restarts from start_state
    while state != goal_state:  # The episode ends on reaching the goal
        # Choose an action randomly
        action = actions[np.random.choice(len(actions))]
        next_state = take_action(state, action)

        # TD(0) update: V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s)),
        # where r is the reward stored for the cell being entered.
        # V[goal_state] is never updated because the loop exits there,
        # so it keeps its initial value of 0.
        reward = rewards[next_state]
        V[state] += alpha * (reward + gamma * V[next_state] - V[state])

        # Move to the next state
        state = next_state

In [30]:
# Report the goal cell and the learned value table, rounded to 2 decimals
value_table = V.round(2)
print("Goal State:", goal_state)
print("State Value Estimates:", value_table, sep="\n")

Goal State: (1, 2)
State Value Estimates:
[[0.3  0.38 0.72]
 [0.29 0.49 0.  ]
 [0.26 0.41 0.67]]
