# Playing BlackJack with First Visit MC

 First, let us import all the necessary libraries

In [8]:
import gym
import numpy as np
from matplotlib import pyplot
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from collections import defaultdict
from functools import partial
%matplotlib inline
plt.style.use('ggplot')

 Simulate the Blackjack environment

In [9]:
#env = gym.make('Blackjack-v1', render_mode='human')
env = gym.make('Blackjack-v1')

Then we define the policy function which takes the current state and check if the score is
greater than or equal to 20, if yes we return 0 else we return 1. i.e If the score is greater
than or equal to 20 we stand (0) else we hit (1)

In [10]:
def sample_policy(observation):
    score, dealer_score, usable_ace = observation
    return 0 if score >= 20 else 1

We define a function called generate_episode for generating epsiodes

In [11]:
def generate_episode(policy, env):
    
    # we initialize the list for storing states, actions, and rewards
    states, actions, rewards = [], [], []
    
    # Initialize the gym environment
    observation = env.reset()[0]
    
    while True:
        
        # append the states to the states list
        states.append(observation)
        
        # now, we select an action using our sample_policy function and append the action to actions list
         
        action = sample_policy(observation)
        actions.append(action)
        
        # We perform the action in the environment according to our sample_policy, move to the next state 
        # and receive reward
        observation, reward, terminated, truncated, info = env.step(action)
        rewards.append(reward)
        
        # Break if the state is a terminal state
        if (terminated or truncated ):
             break
                
    return states, actions, rewards

 Now that we learned how to generate an episode, we will see how to perform First Vist MC Prediction

In [12]:
def every_visit_mc_prediction(policy, env, n_episodes):
    
    # First, we initialize the empty value table as a dictionary for storing the values of each state
    value_table = defaultdict(float)
    N = defaultdict(int)

    returns = 0
    for _ in range(n_episodes):
        
        # Next, we generate the epsiode and store the states and rewards
        states, _, rewards = generate_episode(policy, env)
        
        
        # Then for each step, we store the rewards to a variable R and states to S, and we calculate
        # returns as a sum of rewards
        
        for t in range(len(states) - 1, -1, -1):
            R = rewards[t]
            S = states[t]
            
            returns += R
            
            # Now to perform first visit MC, we check if the episode is visited for the first time, if yes,
            # we simply take the average of returns and assign the value of the state as an average of returns
            
            if S in states[:t]:
                N[S] += 1
                value_table[S] += (returns - value_table[S]) / N[S]
    
    return value_table

In [13]:
value = every_visit_mc_prediction(sample_policy, env, n_episodes=500000)

  if not isinstance(terminated, (bool, np.bool8)):


Let us see first few elements in the value table

In [14]:
for i in range(10):
  print(value.popitem())

KeyError: 'popitem(): dictionary is empty'

 We define the function plot_blackjack for plotting the value function and we can see how our value function is attaining the convergence. 

In [None]:
def plot_blackjack(V, ax1, ax2):
    player_sum = np.arange(12, 21 + 1)
    dealer_show = np.arange(1, 10 + 1)
    usable_ace = np.array([False, True])
    state_values = np.zeros((len(player_sum), len(dealer_show), len(usable_ace)))

    for i, player in enumerate(player_sum):
        for j, dealer in enumerate(dealer_show):
            for k, ace in enumerate(usable_ace):
                state_values[i, j, k] = V[player, dealer, ace]
    
    X, Y = np.meshgrid(player_sum, dealer_show)
 
    ax1.plot_wireframe(X, Y, state_values[:, :, 0])
    ax2.plot_wireframe(X, Y, state_values[:, :, 1])
 
    for ax in ax1, ax2:
        ax.set_zlim(-1, 1)
        ax.set_ylabel('player sum')
        ax.set_xlabel('dealer showing')
        ax.set_zlabel('state-value')
        

In [None]:
fig, axes = pyplot.subplots(nrows=2, figsize=(5, 8),
subplot_kw={'projection': '3d'})
axes[0].set_title('value function without usable ace')
axes[1].set_title('value function with usable ace')
plot_blackjack(value, axes[0], axes[1])

In [37]:
import gym
import numpy as np

env = gym.make('Blackjack-v1')

def generate_episode(env):
    episode = []
    state = env.reset()
    while True:
        action = env.action_space.sample()  # Random policy
        next_state, reward, terminated, truncated, _ = env.step(action)
        episode.append((state, action, reward))
        state = next_state
        if (terminated or truncated):
            break
    return episode

def mc_prediction(env, num_episodes):
    returns_sum = {}
    returns_count = {}
    V = np.zeros((32, 11, 2))

    for _ in range(num_episodes):
        episode = generate_episode(env)
        states, _, _ = zip(*episode)
        discounts = np.array([0.9**i for i in range(len(episode)+1)])
        for i, state in enumerate(states):
            sum_val, dealer_card = state
            if 0 <= sum_val[0] <= 31 and 1 <= dealer_card[0] <= 10:
                state_idx = (sum_val[0], dealer_card[0], 0)  # Assuming usable_ace is always 0
                if state_idx not in returns_sum:
                    returns_sum[state_idx] = 0
                    returns_count[state_idx] = 0
                returns_sum[state_idx] += sum([r * discounts[j] for j, (_, _, r) in enumerate(episode[i:])])
                returns_count[state_idx] += 1
                V[state_idx] = returns_sum[state_idx] / returns_count[state_idx]

    return V

V = mc_prediction(env, 500000)
print(V)


KeyError: 0