In [9]:
import numpy as np

In [10]:
class Environment:
    def __init__(self, size, goal_state, rewards):
        self.size = size
        self.goal_state = goal_state
        self.rewards = rewards
        self.actions = ['up', 'down', 'left', 'right']
        self.action_space = len(self.actions)
        self.current_state = 0

    def reset(self):
        self.current_state = 0
        return self.current_state

    def get_next_state(self, state, action):
        row, col = divmod(state, self.size[1])
        if action == 'up':
            row = max(row - 1, 0)
        elif action == 'down':
            row = min(row + 1, self.size[0] - 1)
        elif action == 'left':
            col = max(col - 1, 0)
        elif action == 'right':
            col = min(col + 1, self.size[1] - 1)
        return row * self.size[1] + col

    def get_reward(self, state):
        return self.rewards.get(state, -1)
    
    def step(self, action):
        next_state = self.get_next_state(self.current_state, action)
        reward = self.get_reward(next_state)
        done = self.is_done(next_state)
        self.current_state = next_state
        return next_state, reward, done, {}
    
    def is_done(self, state):
        return state == self.goal_state

In [11]:
class Parameters:
    def __init__(self, size, goal_state, rewards, gamma):
        self.size = size
        self.goal_state = goal_state
        self.rewards = rewards
        self.gamma = gamma

In [12]:
class MonteCarloOffPolicyAgent:
    def __init__(self, environment, gamma=0.9, epsilon=0.1):
        self.environment = environment
        self.gamma = gamma
        self.epsilon = epsilon
        self.state_action_values = np.zeros((np.prod(environment.size), environment.action_space))
        self.C = np.zeros((np.prod(environment.size), environment.action_space))
        self.policy = np.zeros((np.prod(environment.size), environment.action_space))
        self.initialize_policy()

    def initialize_policy(self):
        for state in range(np.prod(self.environment.size)):
            best_action = np.random.choice(self.environment.action_space)
            self.policy[state][best_action] = 1.0

    def select_action(self, state, policy):
        return np.random.choice(self.environment.action_space, p=policy[state])

    def generate_episode(self, policy, max_steps=100):
        episode = []
        state = self.environment.reset()
        for _ in range(max_steps):
            action = self.select_action(state, policy)
            next_state, reward, done, _ = self.environment.step(self.environment.actions[action])
            episode.append((state, action, reward))
            if done:
                break
            state = next_state
        return episode

    def monte_carlo_off_policy(self, episodes=20):
        b_policy = np.ones((np.prod(self.environment.size), self.environment.action_space)) * self.epsilon / self.environment.action_space
        for state in range(np.prod(self.environment.size)):
            best_action = np.random.choice(self.environment.action_space)
            b_policy[state][best_action] += 1 - self.epsilon
    
        for _ in range(episodes):
            episode = self.generate_episode(b_policy)
            G = 0
            W = 1
            for t in reversed(range(len(episode))):
                state, action, reward = episode[t]
                G = self.gamma * G + reward
                self.C[state][action] += W
                self.state_action_values[state][action] += (W / self.C[state][action]) * (G - self.state_action_values[state][action])
                best_action = np.argmax(self.state_action_values[state])
                self.policy[state] = np.zeros(self.environment.action_space)
                self.policy[state][best_action] = 1.0
                if action != best_action:
                    break
                W = W / (b_policy[state][action] + 1e-10)

    def find_best_path_for_goal(self, start_state):
        state = start_state
        path = [state]
        action_map = {0: 'up', 1: 'down', 2: 'left', 3: 'right'}
        while True:
            action_idx = np.argmax(self.policy[state])
            action = action_map[action_idx]
            next_state, reward, done, _ = self.environment.step(action)
            path.append(next_state)
            if done:
                break
            state = next_state
        return path

In [13]:
param = Parameters((3, 3), 8, {8: 10, 3: -5}, 0.9)
environment = Environment(param.size, param.goal_state, param.rewards)
agent = MonteCarloOffPolicyAgent(environment, param.gamma)

agent.monte_carlo_off_policy()

print("State Values:")
print(agent.state_action_values.max(axis=1).reshape(param.size))
print("\nPolicy:")
for row in range(param.size[0]):
    for col in range(param.size[1]):
        state = row * param.size[1] + col
        if state == param.goal_state:
            print(" G ", end=" ")
        else:
            action_idx = np.random.choice(range(len(environment.actions)), p=agent.policy[state])
            print(environment.actions[action_idx], end=" ")
    print()

start_state = 0
best_path = agent.find_best_path_for_goal(start_state)
print("\nBest Path from state 0 to goal:")
print(best_path)

State Values:
[[ 4.58  6.2   0.  ]
 [ 0.    8.    6.2 ]
 [ 8.   10.    0.  ]]

Policy:
right down left 
up down left 
right right  G  

Best Path from state 0 to goal:
[0, 8]
