## GLIE MC Control Algorithm

- GLIE $\; \rightarrow \;$ Greedy in the Limit with Infinite Exploration
- MC $\; \rightarrow \;$ Monte Carlo

Resources used while writing this notebook:
- [Nimish Sanghi's book on Deep RL](https://www.amazon.com/Deep-Reinforcement-Learning-Python-TensorFlow/dp/1484268083)
- Tawsif Kamal's videos on Blackjack Monte Carlo Reinforcement Learning [part 1](https://youtu.be/NeusGkowXR4?si=9a1aE_bInK4vSAHw) and [part 2](https://youtu.be/wn8hlPNwL74?si=PV_h3WQCXmZKwRzW).
- [Gymnasium docs for Blackjack](https://gymnasium.farama.org/environments/toy_text/blackjack/).

In [None]:
import numpy as np
from collections import defaultdict
from tqdm import tqdm

In [None]:
def epsilon_soft_policy(env, Q, state, epsilon):
    """
    Sample one action for `state` from the epsilon-soft policy derived from Q.

    The greedy action (argmax of Q[state]) gets probability
    1 - epsilon + epsilon/n and every other action gets epsilon/n,
    where n = env.action_space.n. Works for any number of discrete actions.

    param env: environment exposing `action_space.n` (number of discrete actions)
    param Q: mapping from state to an array of n action values
    param state: key used to look up the action values in Q
    param epsilon: (float) exploration probability, expected in [0, 1]
    return: index of the sampled action
    """
    n_actions = int(env.action_space.n)
    greedy_action = np.argmax(Q[state])
    # every action starts with the exploration share epsilon/n ...
    probs = np.full(n_actions, epsilon / n_actions)
    # ... and the greedy action additionally receives the remaining 1 - epsilon
    probs[greedy_action] = 1 - epsilon + epsilon / n_actions
    return np.random.choice(n_actions, p=probs)


In [None]:
def generate_episode(env, Q, epsilon):
    """
    Roll out one episode in `env`, choosing actions with the epsilon-soft policy.

    param env: environment providing `reset()` and `step(action)`; `reset()` is
               unpacked as (state, info) and `step()` as
               (next_state, reward, done, trunc, info)
    param Q: mapping from state to an array of action values
    param epsilon: (float) exploration probability passed to epsilon_soft_policy
    return: list of (state, action, reward) tuples, one per step, in time order
    """
    state, _ = env.reset()
    episode = []
    while True:
        # epsilon_soft_policy needs env to know how many actions exist
        action = epsilon_soft_policy(env, Q, state, epsilon)
        next_state, reward, done, trunc, _ = env.step(action)
        episode.append((state, action, reward))
        if done or trunc:
            return episode
        state = next_state


In [None]:
class GLIE_MC_Control:
    """
    Tabular GLIE (Greedy in the Limit with Infinite Exploration) Monte Carlo control.

    Keeps an action-value table Q, per-(state, action) visit counts, the greedy
    policy derived from Q, and the undiscounted return of every training episode.
    """

    def __init__(self, env, num_episodes, epsilon, epsilon_min=0.05, decay_rate=0.9999, gamma=1):
        """
        param env: (gymnasium environment) the environment to run the algorithm on
        param num_episodes: (int) number of episodes to run the algorithm
        param epsilon: (float) initial exploration probability (used for episode 1)
        param epsilon_min: (float) stored on the instance; not used by every_visit,
                           which follows the epsilon/k schedule
        param decay_rate: (float) stored on the instance; not used by every_visit,
                          which follows the epsilon/k schedule
        param gamma: (float) discount factor
        """
        self.env = env
        self.num_episodes = num_episodes
        self.epsilon = epsilon
        # remembered so the schedule epsilon_k = initial_epsilon / k does not compound
        self.initial_epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.decay_rate = decay_rate
        self.gamma = gamma
        self.Q = defaultdict(lambda: np.zeros(self.env.action_space.n))
        self.visit_count = defaultdict(lambda: np.zeros(self.env.action_space.n))
        self.policy = defaultdict(lambda: 0)
        self.rewards_all_episodes = []

    def every_visit(self):
        """
        This is the every-visit GLIE Monte Carlo Control algorithm.
        Check pg. 88 of Nimish Sanghi's book for the pseudocode.

        For each episode k = 1..num_episodes:
          - set epsilon to initial_epsilon / k,
          - generate an episode with the epsilon-soft policy on the current Q,
          - walk the episode backwards accumulating the discounted return G and
            move Q[s][a] towards G with step size 1 / visit_count[s][a],
          - refresh the greedy policy for each updated state.
        The sum of rewards of every episode is appended to rewards_all_episodes.
        """
        for episode in tqdm(range(1, self.num_episodes+1)):
            # GLIE schedule: epsilon_k = epsilon_1 / k. Computed from the stored
            # initial value; dividing the running value would give epsilon_1 / k!.
            self.epsilon = self.initial_epsilon / episode
            experience = generate_episode(self.env, self.Q, self.epsilon)
            self.rewards_all_episodes.append(sum(r for _, _, r in experience))
            G = 0
            # iterate from the last step to the first so G is the return from t onward
            for s, a, r in reversed(experience):
                G = self.gamma*G + r
                self.visit_count[s][a] += 1
                # incremental mean of the returns observed for (s, a)
                self.Q[s][a] = self.Q[s][a] + (1 / self.visit_count[s][a]) * (G - self.Q[s][a])
                # only Q[s] changed, so only the greedy action of s can change
                self.policy[s] = np.argmax(self.Q[s])

    def first_visit(self):
        """
        This is the first-visit GLIE Monte Carlo Control algorithm.

        Placeholder: not implemented yet, calling it does nothing.
        """
        pass