In [2]:
import numpy as np

class GridWorld:
    """4x4 grid world trained with on-policy, every-visit Monte Carlo control.

    The agent starts each episode in the top-left cell ``(0, 0)`` and the
    episode ends when it reaches ``goal``. States are ``(row, col)`` tuples
    and actions are the strings ``'U'``, ``'D'``, ``'L'`` and ``'R'``.
    Entering a cell listed in ``rewards`` yields that reward; every other
    move yields 0.

    Attributes:
        size: Number of rows and of columns in the grid.
        actions: The available action labels.
        goal: Terminal state; episodes stop as soon as it is reached.
        rewards: Maps a state to the reward received on entering it.
        Q: ``Q[state][action]`` is the current action-value estimate.
    """

    # Row/column offset applied by each action (row 0 is the top row).
    _MOVES = {'U': (-1, 0), 'D': (1, 0), 'L': (0, -1), 'R': (0, 1)}

    def __init__(self):
        self.size = 4
        self.actions = ['U', 'D', 'L', 'R']
        self.goal = (3, 3)
        self.rewards = {(3, 3): 5, (1, 3): -2, (2, 1): -2, (3, 1): -2}
        # Floats from the start so estimates never mix int and float values.
        self.Q = {}
        for i in range(self.size):
            for j in range(self.size):
                self.Q[(i, j)] = {a: 0.0 for a in self.actions}

    def choose_action(self, state, epsilon):
        """Pick an action for ``state`` with an epsilon-greedy rule.

        Args:
            state: ``(row, col)`` key into ``self.Q``.
            epsilon: Probability of picking a uniformly random action.

        Returns:
            The chosen action label as a plain ``str``.
        """
        if np.random.uniform(0, 1) < epsilon:
            return str(np.random.choice(self.actions))

        q_values = self.Q[state]
        best = max(q_values.values())
        candidates = [a for a, value in q_values.items() if value == best]
        if len(candidates) == 1:
            return candidates[0]
        # Break ties at random. Always taking the first maximal key would
        # make 'U' the greedy action in every state whose estimates are
        # still equal, so the agent would push against the top wall until an
        # exploratory move happens (and forever when epsilon is 0).
        return str(np.random.choice(candidates))

    def generate_episode(self, epsilon, max_steps=None):
        """Roll out one episode from ``(0, 0)`` under the current policy.

        Args:
            epsilon: Exploration probability passed to ``choose_action``.
            max_steps: Optional cap on the number of steps. ``None`` (the
                default) keeps going until the goal is reached.

        Returns:
            A list of ``(state, action, reward)`` tuples, one per step. The
            goal state itself never appears as the ``state`` of a step.
        """
        episode = []
        state = (0, 0)
        while state != self.goal:
            if max_steps is not None and len(episode) >= max_steps:
                break  # truncated: stop without having reached the goal
            action = self.choose_action(state, epsilon)
            next_state, reward = self.take_action(state, action)
            episode.append((state, action, reward))
            state = next_state
        return episode

    def take_action(self, state, action):
        """Apply ``action`` in ``state`` and return the outcome.

        Moves that would leave the grid are clipped, so the agent stays in
        place when it walks into a wall.

        Args:
            state: Current ``(row, col)`` position.
            action: One of ``'U'``, ``'D'``, ``'L'``, ``'R'``.

        Returns:
            ``(next_state, reward)`` where ``reward`` is looked up in
            ``self.rewards`` for ``next_state`` and defaults to 0.

        Raises:
            ValueError: If ``action`` is not a known action label.
        """
        move = self._MOVES.get(action)
        if move is None:
            raise ValueError(
                f"unknown action {action!r}; expected one of {self.actions}"
            )
        last = self.size - 1
        row = min(max(state[0] + move[0], 0), last)
        col = min(max(state[1] + move[1], 0), last)
        next_state = (row, col)
        return next_state, self.rewards.get(next_state, 0)

    def update_Q(self, episode, gamma, alpha=0.1):
        """Update ``self.Q`` in place from one finished episode.

        Walks the episode backwards, accumulating the discounted return, and
        moves the estimate of every visited ``(state, action)`` pair a
        fraction ``alpha`` of the way towards that return.

        Args:
            episode: List of ``(state, action, reward)`` tuples.
            gamma: Discount factor applied to future rewards.
            alpha: Step size of the incremental update.
        """
        G = 0
        for state, action, reward in reversed(episode):
            G = reward + gamma * G
            self.Q[state][action] += alpha * (G - self.Q[state][action])

    def train(self, num_episodes=1000, epsilon=0.1, gamma=0.9, alpha=0.1,
              max_steps=None):
        """Run ``num_episodes`` generate-then-update iterations.

        Args:
            num_episodes: Number of episodes to sample.
            epsilon: Exploration probability used while sampling.
            gamma: Discount factor used when computing returns.
            alpha: Step size forwarded to ``update_Q``.
            max_steps: Optional per-episode step cap forwarded to
                ``generate_episode``.
        """
        for _ in range(num_episodes):
            episode = self.generate_episode(epsilon, max_steps)
            self.update_Q(episode, gamma, alpha)

    def print_policy(self):
        """Print the greedy action of every cell as a ``size x size`` array.

        Ties are reported as the first maximal action in ``self.actions``
        order. The goal cell is never the ``state`` of an episode step, so
        its estimates are never updated and its entry carries no meaning.
        """
        # Index by (row, col) explicitly instead of relying on the insertion
        # order of self.Q to match the grid layout.
        policy = np.array([
            [max(self.Q[(i, j)], key=self.Q[(i, j)].get)
             for j in range(self.size)]
            for i in range(self.size)
        ])
        print(policy)


# Script entry point: build a fresh agent, train it, and show what it learned.
if __name__ == "__main__":
    gridworld = GridWorld()
    # Called without arguments, so train() runs with its default
    # hyperparameters (num_episodes=1000, epsilon=0.1, gamma=0.9).
    gridworld.train()
    print("Learned Policy:")
    # Prints one action label per grid cell, laid out as a size x size array.
    gridworld.print_policy()


Learned Policy:
[['D' 'D' 'L' 'L']
 ['R' 'U' 'D' 'L']
 ['U' 'R' 'R' 'D']
 ['U' 'L' 'R' 'U']]
