# Implementation of On-Policy Monte Carlo Control

In [52]:
# import necessary libraries
import gymnasium as gym
import pandas as pd
import random

# Seed passed to env.reset() so the environment's random number generator is reproducible
SEED = 106

## Define Blackjack Environment

<a href="https://gymnasium.farama.org/environments/toy_text/blackjack/">Blackjack Environment</a>

In [53]:
# Initialize the environment
env = gym.make("Blackjack-v1", natural=False, sab=False)

# Seed every source of randomness used by the rest of the notebook so that
# runs are reproducible: the environment's own RNG (card draws), the
# action-space sampler (used for exploration), and Python's `random` module
# (used for the epsilon draw).
env.reset(seed=SEED)
env.action_space.seed(SEED)
random.seed(SEED)

## Define Epsilon Greedy Policy

In [54]:
def epsilon_greedy_policy(env, state, q_values, epsilon=0.5):
    """
    Select an action for ``state`` using an epsilon-greedy policy.

    With probability ``epsilon`` a random action is sampled from the
    environment's action space (exploration); otherwise the action with the
    highest Q-value for ``state`` is returned (exploitation). Ties between
    equally valued actions are broken in favour of the lowest action index.

    :param env: The environment object, which has the action space information.
    :param state: The current state of the agent.
    :param q_values: A dictionary mapping (state, action) pairs to Q-values.
    :param epsilon: Probability of taking a random action; defaults to 0.5.
    :return : The action selected by the epsilon-greedy policy.
    """
    # Explore: random.random() is uniform on [0, 1), so epsilon=0 never
    # explores and epsilon=1 always does
    if random.random() < epsilon:
        return env.action_space.sample()

    # Exploit: max() returns the first maximal element, i.e. the lowest
    # action index among ties
    return max(
        range(env.action_space.n),
        key=lambda action: q_values[(state, action)],
    )

## Generate Episodes

In [60]:
# Maximum number of timesteps allowed in a single episode
num_of_timesteps = 100


def generate_episode(env, q_values):
    """
    Generate one episode using the epsilon-greedy policy.

    The environment is reset, then actions are chosen with
    ``epsilon_greedy_policy`` until the episode ends (terminated or
    truncated) or ``num_of_timesteps`` steps have been taken.

    :param env: The environment object.
    :param q_values: A dictionary mapping (state, action) pairs to Q-values.
    :return episode: A list of tuples (state, action, reward) representing the episode.
    """
    # Initialize the list for storing the episode
    episode = []

    # Gymnasium's reset() returns (observation, info); keep only the
    # observation so the state is hashable and usable as a Q-table key
    state, _ = env.reset()

    for _ in range(num_of_timesteps):
        # Select the action using the epsilon-greedy policy
        action = epsilon_greedy_policy(env, state, q_values)

        # Gymnasium's step() returns
        # (observation, reward, terminated, truncated, info)
        next_state, reward, terminated, truncated, _ = env.step(action)

        # Append the state, action, and reward to the episode list
        episode.append((state, action, reward))

        # Stop when the episode ended naturally or was cut off
        if terminated or truncated:
            break

        # Update the current state to the next state
        state = next_state

    return episode

In [61]:
from collections import defaultdict

# Define the number of episodes
num_of_episode = 1000

# Initialize Q-values; unseen (state, action) pairs default to 0.0
q_values = defaultdict(float)

# Initialize the running sum of returns for each (state, action) pair
total_return = defaultdict(float)

# Initialize the dictionary for storing the count of state-action visits
N = defaultdict(int)

# First-visit Monte Carlo control: iterate over each episode
for _ in range(num_of_episode):
    # Generate an episode with the current Q-values
    episode = generate_episode(env, q_values)

    # Track the state-action pairs already updated in this episode
    visited_state_action_pairs = set()

    # Get all rewards from the episode
    rewards = [reward for (_state, _action, reward) in episode]

    # Iterate over each step in the episode
    for t, (state, action, _reward) in enumerate(episode):
        # Only update on the first visit of the pair within this episode
        if (state, action) in visited_state_action_pairs:
            continue
        visited_state_action_pairs.add((state, action))

        # Compute the return (sum of rewards from time step t to the end of the episode)
        R = sum(rewards[t:])

        # Accumulate the return and the visit count for this pair
        total_return[(state, action)] += R
        N[(state, action)] += 1

        # The Q-value is the average return observed for this pair
        q_values[(state, action)] = total_return[(state, action)] / N[(state, action)]

In [59]:
# Tabulate the Q-value dictionary: one row per (state, action) key and its value
df = pd.DataFrame(
    list(q_values.items()),
    columns=['state-action', 'q value'],
)