In [129]:
import random
import numpy as np

In [130]:
class Environment:
    def __init__(self, size, goal_state, rewards):
        self.size = size
        self.goal_state = goal_state
        self.rewards = rewards
        self.actions = ['up', 'down', 'left', 'right']
        self.action_space = len(self.actions)
        self.current_state = 0

    def reset(self):
        self.current_state = 0
        return self.current_state

    def get_next_state(self, state, action):
        row, col = divmod(state, self.size[1])
        if action == 'up':
            row = max(row - 1, 0)
        elif action == 'down':
            row = min(row + 1, self.size[0] - 1)
        elif action == 'left':
            col = max(col - 1, 0)
        elif action == 'right':
            col = min(col + 1, self.size[1] - 1)
        return row * self.size[1] + col

    def get_reward(self, state):
        return self.rewards.get(state, -1)
    
    def step(self, action):
        next_state = self.get_next_state(self.current_state, action)
        reward = self.get_reward(next_state)
        done = self.is_done(next_state)
        self.current_state = next_state
        return next_state, reward, done, {}
    
    def is_done(self, state):
        return state == self.goal_state

In [131]:
class Parameters:
    def __init__(self, size, goal_state, rewards, gamma):
        self.size = size
        self.goal_state = goal_state
        self.rewards = rewards
        self.gamma = gamma

In [132]:
class MonteCarloESAgent:
    def __init__(self, environment, gamma=0.9, epsilon=0.1, alpha=0.1):
        self.environment = environment
        self.gamma = gamma
        self.epsilon = epsilon
        self.alpha = alpha
        self.policy = np.random.choice(environment.action_space, size=np.prod(environment.size))
        self.state_action_values = np.zeros((np.prod(environment.size), environment.action_space))
        self.returns = {(state, action): [] for state in range(np.prod(environment.size)) for action in range(environment.action_space)}
        self.initialize_policy()

    def initialize_policy(self):
        for state in range(np.prod(self.environment.size)):
            self.policy[state] = np.random.choice(self.environment.action_space)

    def select_action(self, state):
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.environment.action_space)
        else:
            return np.argmax(self.state_action_values[state])

    def generate_episode(self, start_state=None, start_action=None, max_steps=100):
        episode = []
        state = start_state if start_state is not None else self.environment.reset()
        action = start_action if start_action is not None else self.select_action(state)
        
        for _ in range(max_steps):
            next_state, reward, done, _ = self.environment.step(self.environment.actions[action])
            episode.append((state, action, reward))
            if done:
                break
            state = next_state
            action = self.select_action(state)
        
        return episode

    def monte_carlo_es(self, episodes=20):
        for _ in range(episodes):
            start_state = np.random.choice(np.prod(self.environment.size))
            start_action = np.random.choice(self.environment.action_space)
            episode = self.generate_episode(start_state, start_action)
            G = 0
            for t in reversed(range(len(episode))):
                state, action, reward = episode[t]
                G = self.gamma * G + reward
                if not (state, action) in [(x[0], x[1]) for x in episode[:t]]:
                    self.returns[(state, action)].append(G)
                    self.state_action_values[state][action] = np.mean(self.returns[(state, action)])
                    self.policy[state] = np.argmax(self.state_action_values[state])

    def find_best_path_for_goal(self, start_state):
        state = start_state
        path = [state]
        action_map = {0: 'up', 1: 'down', 2: 'left', 3: 'right'}
        while True:
            action_idx = self.policy[state]
            action = action_map[action_idx]
            next_state, reward, done, _ = self.environment.step(action)
            path.append(next_state)
            if done:
                break
            state = next_state
        return path

In [133]:
param = Parameters((3, 3), 8, {8: 10, 3: -5}, 0.9)
environment = Environment(param.size, param.goal_state, param.rewards)
agent = MonteCarloESAgent(environment, param.gamma)

agent.monte_carlo_es()

print("State Values:")
print(agent.state_action_values.max(axis=1).reshape(param.size))
print("\nPolicy:")
for row in range(param.size[0]):
    for col in range(param.size[1]):
        state = row * param.size[1] + col
        if state == param.goal_state:
            print(" G ", end=" ")
        else:
            action_idx = agent.policy[state]
            print(environment.actions[action_idx], end=" ")
    print()

start_state = 0
best_path = agent.find_best_path_for_goal(start_state)
print("\nBest Path from state 0 to goal:")
print(best_path)

State Values:
[[-5.81165954 -0.93034295 -2.42521805]
 [-1.08098489 -0.96465175 10.        ]
 [-3.55891088 10.          0.        ]]

Policy:
up up right 
up right down 
down right  G  

Best Path from state 0 to goal:
[0, 5, 8]
