In [None]:
import numpy as np

class OffPolicyMonteCarloControl:
    """Off-policy Monte Carlo control using weighted importance sampling.

    The agent keeps a single action-value estimate per action: ``Q`` has
    shape ``(num_actions,)`` and is not conditioned on the state.  The target
    policy is greedy with respect to ``Q``.  The behaviour policy that
    generates episodes is assumed to be epsilon-greedy with respect to ``Q``
    using this agent's ``epsilon``; that assumption is what the importance
    weights in :meth:`update_Q` are computed from.

    Attributes:
        num_actions: Number of discrete actions.
        epsilon: Exploration probability of the epsilon-greedy policy.
        gamma: Discount factor applied to future rewards.
        Q: Action-value estimates, one entry per action.
        C: Cumulative importance weights, one entry per action.
    """

    def __init__(self, num_actions, epsilon=0.1, gamma=1.0):
        """Create an agent with all value estimates initialised to zero."""
        self.num_actions = num_actions
        self.epsilon = epsilon
        self.gamma = gamma
        self.Q = np.zeros((num_actions,))
        # Running sum of importance weights per action; the denominator of
        # the weighted-importance-sampling average.
        self.C = np.zeros((num_actions,))

    def epsilon_greedy_policy(self, state):
        """Return a random action with probability epsilon, else the greedy one.

        ``state`` is accepted for interface compatibility but is not used,
        because ``Q`` holds one value per action only.
        """
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.num_actions)
        return np.argmax(self.Q)

    def _behavior_probability(self, action, greedy_action):
        """Return the epsilon-greedy probability of picking ``action``."""
        explore = self.epsilon / self.num_actions
        if action == greedy_action:
            return 1.0 - self.epsilon + explore
        return explore

    def update_Q(self, episode):
        """Update ``Q`` in place from one episode.

        Args:
            episode: Sequence of ``(state, action, reward)`` tuples in the
                order they were experienced.  The state entry is ignored.

        The episode is processed backwards.  For every step the discounted
        return ``G`` is accumulated and ``Q[action]`` is moved towards ``G``
        by the weighted-importance-sampling step ``W / C[action]``.
        Processing stops as soon as the taken action is not the greedy one,
        because the greedy target policy would never have taken it and all
        earlier steps therefore carry zero weight.
        """
        G, W = 0.0, 1.0
        # Q is constant while an episode is generated, so the greedy action
        # the behaviour policy saw is the one at the start of this update.
        behavior_greedy = np.argmax(self.Q)
        for _state, action, reward in reversed(episode):
            G = self.gamma * G + reward
            self.C[action] += W
            self.Q[action] += (W / self.C[action]) * (G - self.Q[action])
            if action != np.argmax(self.Q):
                break
            prob = self._behavior_probability(action, behavior_greedy)
            if prob <= 0:
                # The assumed behaviour policy could not have produced this
                # action (epsilon == 0); no finite weight exists.
                break
            W /= prob

    def learn(self, episodes, behavior_policy):
        """Generate ``episodes`` episodes and update ``Q`` after each one.

        Args:
            episodes: Number of episodes to simulate.
            behavior_policy: Callable mapping a state to an action; used to
                pick every action in the simulated episodes.

        The built-in environment starts in state 0, moves to a uniformly
        random state in ``range(num_actions)`` regardless of the chosen
        action, and ends (with reward 1) when state ``num_actions - 1`` is
        reached; every other transition yields reward 0.
        """
        terminal_state = self.num_actions - 1
        for _ in range(episodes):
            episode = []
            state = 0
            while True:
                action = behavior_policy(state)
                next_state = np.random.choice(self.num_actions)
                reached_goal = next_state == terminal_state
                episode.append((state, action, 1 if reached_goal else 0))
                if reached_goal:
                    break
                state = next_state
            self.update_Q(episode)

# Example usage: settings for the demonstration run below.
num_actions, num_episodes = 4, 1000  # action count, training episodes
epsilon, gamma = 0.1, 1.0  # exploration rate, discount factor

def behavior_policy(state):
    """Pick an action epsilon-greedily from the module-level ``agent``'s Q.

    With probability ``epsilon`` a uniformly random action in
    ``range(num_actions)`` is returned; otherwise the action with the
    largest value in ``agent.Q``.  ``state`` is accepted but not used.

    ``agent`` is looked up when the function is called, so it only has to
    exist by the time the first action is requested.
    """
    if np.random.rand() < epsilon:
        return np.random.randint(num_actions)
    # The original read an undefined global ``Q``; the value estimates live
    # on the agent.
    return np.argmax(agent.Q)

# Build the agent, train it with the behaviour policy defined above, and
# report the action with the highest learned value.
agent = OffPolicyMonteCarloControl(
    num_actions=num_actions,
    epsilon=epsilon,
    gamma=gamma,
)
agent.learn(episodes=num_episodes, behavior_policy=behavior_policy)
optimal_policy = agent.Q.argmax()
print("Optimal Policy:", optimal_policy)






Optimal Policy: 3
