# Playing BlackJack with epsilon-greedy policy MC

 First, let us import all the necessary libraries

In [29]:
import gym
import numpy as np
import random
from matplotlib import pyplot
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from collections import defaultdict
from functools import partial
import pandas as pd

%matplotlib inline
plt.style.use('ggplot')

 Create the Blackjack environment:

In [30]:
# Training below runs hundreds of thousands of episodes, so the environment is
# created without rendering. Pass render_mode='human' only when you want to
# watch a handful of games being played.
env = gym.make('Blackjack-v1')

Initialize the dictionary for storing the Q values:

In [31]:
# Action-value table: maps (state, action) -> estimated value; unseen pairs read as 0.0.
Q = defaultdict(float)

Initialize the dictionary for storing the total return of the state-action pair:

In [32]:
# Running sum of the returns observed for each (state, action) pair; starts at 0.0.
total_return = defaultdict(float)

Initialize the dictionary for storing the count of the number of times a state-action pair is visited:

In [33]:
# Number of returns accumulated for each (state, action) pair; starts at 0.
N = defaultdict(int)

Define the epsilon-greedy policy: with probability epsilon it picks a random action, otherwise it picks the action with the highest Q value for the current state:

In [34]:
def epsilon_greedy_policy(state, Q, epsilon=0.5):
    """Choose an action for ``state`` using an epsilon-greedy rule.

    With probability ``epsilon`` a random action is drawn from
    ``env.action_space``; otherwise the action with the largest
    ``Q[(state, action)]`` is returned.

    Args:
        state: Current environment state. It is used as part of the ``Q`` key,
            so it must be hashable.
        Q: Table indexed by ``(state, action)`` holding the estimated action
            values.
        epsilon: Exploration probability. Defaults to 0.5, the value that was
            previously hard-coded, so existing two-argument calls behave as
            before.

    Returns:
        The selected action.
    """
    if random.uniform(0, 1) < epsilon:
        return env.action_space.sample()
    # max() keeps the first maximal element, so ties go to the lowest action index.
    return max(range(env.action_space.n), key=lambda action: Q[(state, action)])

### Generating an episode

Let's generate an episode using the epsilon-greedy policy. We define a function called `generate_episode`, which takes the Q table as input and returns the episode as a list of (state, action, reward) tuples.


First, let's set the number of time steps:

In [35]:
# Upper bound on the number of steps generate_episode takes in a single episode.
num_timesteps = 100

In [36]:
def generate_episode(Q):
    """Play one episode with the epsilon-greedy policy and record it.

    The environment is reset, then actions chosen by
    ``epsilon_greedy_policy`` are applied for at most ``num_timesteps`` steps,
    stopping early as soon as the environment reports termination or
    truncation.

    Args:
        Q: Action-value table handed to ``epsilon_greedy_policy``.

    Returns:
        A list of ``(state, action, reward)`` tuples in time order, where
        ``state`` is the state the action was taken from.
    """
    trajectory = []
    # reset() returns a sequence whose first item is the initial observation.
    current_state = env.reset()[0]
    for _ in range(num_timesteps):
        chosen_action = epsilon_greedy_policy(current_state, Q)
        following_state, reward, terminated, truncated, _info = env.step(chosen_action)
        trajectory.append((current_state, chosen_action, reward))
        episode_over = terminated or truncated
        if episode_over:
            break
        current_state = following_state
    return trajectory


### Computing the optimal policy

 Let's learn how to compute the optimal policy. First, let's set the number of iterations, that is, the number of episodes, we want to generate:

In [37]:
# Number of episodes to generate and learn from.
num_iterations = 500000
for i in range(num_iterations):
    episode = generate_episode(Q)
    # Rewards in time order; the return from step t is the plain
    # (undiscounted) sum of rewards[t:].
    rewards = [r for (_, _, r) in episode]
    # Only the first occurrence of a (state, action) pair in an episode
    # contributes a return. A set makes that check O(1) instead of rescanning
    # the episode prefix on every step.
    visited = set()
    for t, (state, action, _) in enumerate(episode):
        pair = (state, action)
        if pair in visited:
            continue
        visited.add(pair)
        R = sum(rewards[t:])
        total_return[pair] += R
        N[pair] += 1
        # Q is the average of all first-visit returns seen so far for this pair.
        Q[pair] = total_return[pair] / N[pair]






  if not isinstance(terminated, (bool, np.bool8)):


In [None]:
# Tabulate the learned action values, one row per (state, action) pair,
# then preview the first 11 rows.
df = pd.DataFrame(data=Q.items(), columns=['state_action pair', 'value'])
df.head(n=11)


Unnamed: 0,state_action pair,value
0,"((13, 9, False), 1)",-0.488686
1,"((9, 9, False), 1)",-0.223282
2,"((13, 9, False), 0)",-0.584177
3,"((17, 6, False), 0)",-0.024023
4,"((17, 6, False), 1)",-0.585973
5,"((17, 1, False), 1)",-0.760643
6,"((18, 1, False), 0)",-0.357638
7,"((14, 10, False), 0)",-0.568738
8,"((14, 10, False), 1)",-0.610974
9,"((6, 10, False), 1)",-0.472445
