In [6]:
import numpy as np

In [7]:
class Environment:
    def __init__(self, size, goal_state, rewards):
        self.size = size
        self.goal_state = goal_state
        self.rewards = rewards
        self.actions = ['up', 'down', 'left', 'right']
        self.action_space = len(self.actions)
        self.current_state = 0

    def reset(self):
        self.current_state = 0
        return self.current_state

    def get_next_state(self, state, action):
        row, col = divmod(state, self.size[1])
        if action == 'up':
            row = max(row - 1, 0)
        elif action == 'down':
            row = min(row + 1, self.size[0] - 1)
        elif action == 'left':
            col = max(col - 1, 0)
        elif action == 'right':
            col = min(col + 1, self.size[1] - 1)
        return row * self.size[1] + col

    def get_reward(self, state):
        return self.rewards.get(state, -1)
    
    def step(self, action):
        next_state = self.get_next_state(self.current_state, action)
        reward = self.get_reward(next_state)
        done = self.is_done(next_state)
        self.current_state = next_state
        return next_state, reward, done, {}
    
    def is_done(self, state):
        return state == self.goal_state

In [8]:
class Parameters:
    def __init__(self, size, goal_state, rewards, gamma):
        self.size = size
        self.goal_state = goal_state
        self.rewards = rewards
        self.gamma = gamma

In [9]:
class MonteCarloOnPolicyAgent:
    def __init__(self, environment, gamma=0.9, epsilon=0.1):
        self.environment = environment
        self.gamma = gamma
        self.epsilon = epsilon
        self.policy = np.ones((np.prod(environment.size), environment.action_space)) * epsilon / environment.action_space
        self.state_action_values = np.zeros((np.prod(environment.size), environment.action_space))
        self.returns = {(state, action): [] for state in range(np.prod(environment.size)) for action in range(environment.action_space)}
        self.initialize_policy()

    def initialize_policy(self):
        for state in range(np.prod(self.environment.size)):
            best_action = np.random.choice(self.environment.action_space)
            self.policy[state][best_action] += 1 - self.epsilon

    def select_action(self, state):
        return np.random.choice(self.environment.action_space, p=self.policy[state])

    def generate_episode(self, max_steps=100):
        episode = []
        state = self.environment.reset()
        for _ in range(max_steps):
            action = self.select_action(state)
            next_state, reward, done, _ = self.environment.step(self.environment.actions[action])
            episode.append((state, action, reward))
            if done:
                break
            state = next_state
        return episode

    def monte_carlo_on_policy(self, episodes=20):
        for _ in range(episodes):
            episode = self.generate_episode()
            G = 0
            visited_state_action_pairs = set()
            for t in reversed(range(len(episode))):
                state, action, reward = episode[t]
                G = self.gamma * G + reward
                if (state, action) not in visited_state_action_pairs:
                    visited_state_action_pairs.add((state, action))
                    self.returns[(state, action)].append(G)
                    self.state_action_values[state][action] = np.mean(self.returns[(state, action)])
                    best_action = np.argmax(self.state_action_values[state])
                    for a in range(self.environment.action_space):
                        if a == best_action:
                            self.policy[state][a] = 1 - self.epsilon + self.epsilon / self.environment.action_space
                        else:
                            self.policy[state][a] = self.epsilon / self.environment.action_space

    def find_best_path_for_goal(self, start_state):
        state = start_state
        path = [state]
        action_map = {0: 'up', 1: 'down', 2: 'left', 3: 'right'}
        while True:
            action_idx = np.argmax(self.policy[state])
            action = action_map[action_idx]
            next_state, reward, done, _ = self.environment.step(action)
            path.append(next_state)
            if done:
                break
            state = next_state
        return path

In [10]:
param = Parameters((3, 3), 8, {8: 10, 3: -5}, 0.9)
environment = Environment(param.size, param.goal_state, param.rewards)
agent = MonteCarloOnPolicyAgent(environment, param.gamma)

agent.monte_carlo_on_policy()

print("State Values:")
print(agent.state_action_values.max(axis=1).reshape(param.size))
print("\nPolicy:")
for row in range(param.size[0]):
    for col in range(param.size[1]):
        state = row * param.size[1] + col
        if state == param.goal_state:
            print(" G ", end=" ")
        else:
            action_idx = np.random.choice(range(len(environment.actions)), p=agent.policy[state])
            print(environment.actions[action_idx], end=" ")
    print()

start_state = 0
best_path = agent.find_best_path_for_goal(start_state)
print("\nBest Path from state 0 to goal:")
print(best_path)

State Values:
[[-1.30579731 -0.478       8.        ]
 [ 5.70547368  0.58       10.        ]
 [ 7.90526316 10.          0.        ]]

Policy:
down down down 
down left down 
left right  G  

Best Path from state 0 to goal:
[0, 8]
