# 43008: Reinforcement Learning

## Week 6: Monte Carlo Methods - Control:
* MC Algorithm for finding optimal policy - Model free approach

### What will you learn?
* GLIE-MC Algorithm for finding optimal policy for Gym based environments

### Import Libraries

In [None]:
!pip install gymnasium

In [None]:
import gymnasium as gym
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import sys

### Helper function for visualisation

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def plot_blackjack_values(Q):
    """
    Plot the Blackjack state-value function derived from a Q-table as two
    3D surfaces (with and without a usable ace), aligning with the
    representation in Sutton and Barto's book.

    The state value is a weighted sum of the state's two action values under
    a fixed stochastic policy: 80% stick / 20% hit when the player's sum is
    greater than 18, and 20% stick / 80% hit otherwise.

    Args:
    - Q (dict): The Q-table. Keys are (player_sum, dealer_card, usable_ace)
      tuples; each value is either a sequence of action values ordered
      (stick, hit) or a dict holding them in that order. Q is not modified.
    """

    V = {}  # State-value function: state -> value under the fixed policy

    for state, action_values in Q.items():
        if isinstance(action_values, dict):
            values = list(action_values.values())
        else:
            # Work on a copy so that the padding below never alters the
            # caller's Q-table (and so non-list sequences are accepted too)
            values = list(action_values)

        # A state with a single recorded action value is padded with zero
        if len(values) == 1:
            values.append(0.0)

        # Policy weights are ordered (stick, hit)
        weights = [0.8, 0.2] if state[0] > 18 else [0.2, 0.8]
        V[state] = np.dot(weights, values)

    def get_Z(x, y, usable_ace):
        # States that were never visited are drawn with a value of 0
        return V.get((x, y, usable_ace), 0)

    def get_figure(usable_ace, ax):
        # Define the range for player's sum and dealer's card
        x_range = np.arange(12, 22)  # Player's sum: 12 through 21
        y_range = np.arange(10, 0, -1)  # Dealer's card: 10 through Ace (1)

        # Create a meshgrid for plotting
        X, Y = np.meshgrid(x_range, y_range)

        # Look up the state value at every grid point
        Z = np.array(
            [get_Z(x, y, usable_ace) for x, y in zip(np.ravel(X), np.ravel(Y))]
        ).reshape(X.shape)

        # Plot the surface; the colour scale is pinned to the reward range [-1, 1]
        ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=plt.cm.coolwarm, vmin=-1.0, vmax=1.0)

        # Label the axes
        ax.set_xlabel("Player's Current Sum")
        ax.set_ylabel("Dealer's Showing Card")
        # The surface is drawn at the real card values, so the tick at y == 1
        # is the Ace and the tick at y == 10 is the ten; labels must follow
        # the tick positions in ascending order.
        ax.set_yticks(np.arange(1, 11))
        ax.set_yticklabels(['A', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
        ax.set_zlabel('State Value')

        # Set the viewing angle to give the desired orientation
        ax.view_init(ax.elev, -120)

    # Create the 3D plot
    fig = plt.figure(figsize=(20, 20))

    # Plot for the case with a usable ace
    ax = fig.add_subplot(211, projection='3d')
    ax.set_title('Usable Ace')
    get_figure(True, ax)

    # Plot for the case without a usable ace
    ax = fig.add_subplot(212, projection='3d')
    ax.set_title('No Usable Ace')
    get_figure(False, ax)

    # Display the plots
    plt.show()


In [None]:
def plot_blackjack_policy_from_Q(Q):
    """
    Plot the greedy Blackjack policy derived from Q values as two heat maps
    (with and without a usable ace).

    Only states inside the plotted grid (player sum 12-21, dealer card 1-10)
    are drawn. States outside it, such as player sums below 12, are skipped:
    their negative array indices would otherwise wrap around and silently
    overwrite the cells of other states.

    Args:
    - Q (dict): The Q-table, mapping (player_sum, dealer_card, usable_ace)
      states to action values. Grid cells for states missing from Q keep
      the value 0.
    """

    # Define the range for dealer's card and player's sum
    x_range = np.arange(1, 11)  # Dealer's card: Ace through 10
    y_range = np.arange(12, 22)  # Player's sum: 12 through 21
    n_rows, n_cols = len(y_range), len(x_range)

    # One matrix per usable-ace scenario; rows are player sums, columns dealer cards
    policy_data_usable = np.zeros((n_rows, n_cols))
    policy_data_non_usable = np.zeros((n_rows, n_cols))

    # Derive policy from Q values
    for state, actions in Q.items():
        player_sum, dealer_card, usable_ace = state
        row = player_sum - y_range[0]
        col = dealer_card - x_range[0]
        # Skip states that do not belong to the plotted grid
        if not (0 <= row < n_rows and 0 <= col < n_cols):
            continue
        # Get the action with the maximum Q value
        best_action = np.argmax(actions)
        target = policy_data_usable if usable_ace else policy_data_non_usable
        target[row, col] = best_action

    fig, axes = plt.subplots(nrows=2, figsize=(10, 20))

    panels = (
        (axes[0], policy_data_usable, 'Usable Ace'),
        (axes[1], policy_data_non_usable, 'No Usable Ace'),
    )
    for ax, policy_data, title in panels:
        # The extent maps matrix cells onto card / sum coordinates
        ax.matshow(policy_data, cmap=plt.cm.coolwarm, extent=[0.5, 10.5, 21.5, 11.5])
        ax.set_title(title)
        ax.invert_yaxis()  # Invert the y-axis
        ax.set_xticks(np.arange(1, 11))
        ax.set_yticks(np.arange(12, 22))
        ax.set_yticklabels(list(range(12, 22)))
        ax.set_xlabel("Dealer's Showing Card")
        ax.set_ylabel("Player's Current Sum")

    plt.tight_layout()
    plt.show()


## GLIE MC-Control Algorithm


<img src='https://drive.google.com/uc?id=1kY0AdRSrCyZHRyWEo0wnjmLoAj9qCU9W' height=400>


## **Steps**:

1.   Write a function for Implementing Epsilon-Greedy Policy
2.   Write a function for creating Episodes
3.   Write a function for Updating Q-Values
4.   Combine all the functions to complete the GLIE Control Algorithm



### 1. Implement Epsilon Greedy Policy

In [None]:
def epsilon_greedy_policy(state, Q, epsilon):
    """Select an action using epsilon-greedy policy.

    Exercise stub: the branches marked below are to be completed.

    Args:
    - state: The current state; used as a key into Q.
    - Q (dict): The Q-table. If `state` is not present yet, it is added with
      the action values [0, 0] as a side effect of this call.
    - epsilon: Probability of choosing a random action (exploration).

    Returns:
    - The selected action.
    """
    # Unseen states get a zero-initialised pair of action values
    if state not in Q:
        Q[state] = [0, 0]

    # Randomly choose an action with probability epsilon (Exploration)
    if ## WRITE YOUR CODE HERE ##
        return ## WRITE YOUR CODE HERE ##
    # Choose the action with the highest Q-value for the current state (Exploitation)
    else:
        return ## WRITE YOUR CODE HERE ##

### 2. Implement Generate Episode function

In [None]:
# Generate Episode function
def generate_episode(env, Q, epsilon):
    """Generate an episode following the epsilon-greedy policy."""

    # Intialize the Episode
    episode = ## WRITE YOUR CODE HERE ##
    # Reset environment to starting state
    state = ## WRITE YOUR CODE HERE ##
    done = ## WRITE YOUR CODE HERE ##

    # Continue until the episode is done
    while not done:
        # Select an action based on epsilon-greedy policy
        action = ## WRITE YOUR CODE HERE ##

        # Take the action and observe the next state and reward
        next_state, reward, done, _ = ## WRITE YOUR CODE HERE ##

        # Append the state, action, and reward to the episode list
        episode.append ## WRITE YOUR CODE HERE ##
        state = ## WRITE YOUR CODE HERE ##

    return episode

### 3. Implement Q-Value Update

In [None]:
def update_Q_values(episode, Q, N, gamma):
    """Update Q-values using the generated episode.

    Exercise stub: the expressions marked below are to be completed.

    Args:
    - episode: Sequence of (state, action, reward) triples; it is walked
      from the last step back to the first.
    - Q (dict): The Q-table, updated in place. States not present yet are
      added with the action values [0, 0].
    - N (dict): Visit counts per state-action pair, updated in place. States
      not present yet are added with the counts [0, 0].
    - gamma: Discount factor for the return.

    Returns:
    - None; Q and N are modified in place.
    """
    # Initialize the return (G_t) to zero
    G_t = ## WRITE YOUR CODE HERE ##
    # Iterate over the episode in reverse order
    for state, action, reward in reversed(episode):
        # Ensure the state is initialized in Q and N
        if state not in Q:
            Q[state] = [0, 0]
        if state not in N:
            N[state] = [0, 0]

        # Calculate the cumulative discounted reward
        G_t = ## WRITE YOUR CODE HERE ##
        # Increment the count for the state-action pair
        N[state][action] += ## WRITE YOUR CODE HERE ##

        # Calculate the step-size (alpha = 1/N(St,At))
        # NOTE: the count must already be non-zero here, otherwise this divides by zero
        alpha = 1 / N[state][action]

        # Incrementally update the Q-value using the return (G_t)
        Q[state][action] += ## WRITE YOUR CODE HERE ##

### 4. Implement GLIE MC Control

In [None]:
def glie_mc_control(env, episodes, gamma, epsilon, decay):
    """Train the agent using GLIE MC Control.

    Exercise stub: the expressions marked below are to be completed.

    Args:
    - env: The environment to generate episodes from.
    - episodes (int): Number of training episodes to run.
    - gamma: Discount factor.
    - epsilon: Initial exploration probability.
    - decay: Factor used to shrink epsilon after each episode.

    Returns:
    - (Q, policy): the Q-table and a dict with one policy entry per state in Q.
    """
    # Initialize Q-values with zeros for each state-action pair
    Q = ## WRITE YOUR CODE HERE ##
    # Initialize N (counts) with zeros for each state-action pair
    N = ## WRITE YOUR CODE HERE ##
    # Iterate over the specified number of episodes
    for k in range(1, episodes + 1):
        # Generate an episode
        episode = ## WRITE YOUR CODE HERE ##
        # Progress line, printed and flushed once per episode
        print(f"Episode {k}/{episodes}, Current epsilon: {epsilon}")
        sys.stdout.flush()
        # Update Q-values using the episode
        ## WRITE YOUR CODE HERE ##
        # Decay epsilon to reduce exploration over time
        epsilon *= ## WRITE YOUR CODE HERE ##

    # After all episodes are processed, derive the policy from the Q-values.
    policy = {}
    for state, actions in Q.items():
        policy[state] = ## WRITE YOUR CODE HERE ##
    print(f"Derived policy for {len(policy)} states.")  # Debug: Print the number of states for which policy is derived
    return Q, policy

### 5. Solve the BlackJack environment using the GLIE MC Control Algorithm

In [None]:
# Create the Blackjack environment
env = ## WRITE YOUR CODE HERE ##

# Set hyperparameters
gamma = 0.9  # Discount factor
epsilon = 1.0  # Initial exploration factor
decay = 0.9995  # Decay rate for epsilon
episodes = 50000  # Number of training episodes

# Train the agent
# Q and policy are reused by the result and plotting cells further down
Q, policy = ## WRITE YOUR CODE HERE ##

# Close the environment
env.close()


#### Check the results: Optimal Policy and Q-Values

In [None]:
# Dump the learned policy and the full Q-table as plain text
print("\nOptimal Policy:", policy)
print("\nOptimal Q-values:\n", Q)

#### Plot the state values derived from the learned Q-values

In [None]:
# Draws state values, not raw Q-values: plot_blackjack_values weights each
# state's action values with its fixed 80/20 stick/hit policy
plot_blackjack_values(Q)

#### Plot the graph of optimal Policy

In [None]:
# Draws, for every state in Q, the action with the highest Q-value
plot_blackjack_policy_from_Q(Q)