In [2]:
import gym
import numpy as np
np.bool8 = np.bool_

In [3]:
env = gym.make("Blackjack-v1")

In [4]:
obs_space = env.observation_space
action_space = env.action_space
print(f"The observation space: {obs_space}")
print(f"The action space: {action_space}")

The observation space: Tuple(Discrete(32), Discrete(11), Discrete(2))
The action space: Discrete(2)


In [5]:
import matplotlib.pyplot as plt

obs = env.reset()
print(f"The initial observation is {obs}")

random_action = env.action_space.sample()

new_obs, reward, done, _, _ = env.step(random_action)
print(f"The new observation is {new_obs}")

The initial observation is ((19, 6, False), {})
The new observation is (19, 6, False)


In [6]:
minimum = [99, 99, 99]
maximum = [0, 0, 0]

# Get observation space
for i in range(50_000):
  obs = env.reset()
  done = False
  while not done:
    random_action = env.action_space.sample()
    new_obs, reward, done, _, _ = env.step(random_action)
    for i in range(len(minimum)):
      minimum[i] = min(minimum[i], int(new_obs[i]))
      maximum[i] = max(maximum[i], int(new_obs[i]))

print(f"Observation space is from {minimum} to {maximum}")

Observation space is from [4, 1, 0] to [31, 10, 1]


In [7]:
n_actions = 2
n_states = [maximum[i] + 1 for i in range(len(minimum))]

Q_table = np.zeros((*n_states, n_actions))
Q_table.shape

(32, 11, 2, 2)

In [8]:
alpha = 0.1
gamma = 0.9
n_episodes = 1_000_000

Q_table.fill(0) # reset Q_table

for i in range(n_episodes):
  epsilon = 1 - (i / n_episodes)
  obs, _ = env.reset()
  state = tuple(map(int, obs))

  done = False
  while not done:
    if np.random.random() < epsilon:
      action = env.action_space.sample()
    else:
      action = np.argmax(Q_table[*state])
    
    obs, reward, done, _, _ = env.step(action)
    new_state = tuple(map(int, obs))

    old = Q_table[*state][action]
    target = reward + gamma * np.max(Q_table[*new_state])
    Q_table[*state][action] = (1-alpha)*old + alpha*target
    
    state = new_state

In [9]:
Q_table[7][8][1] # checking the value of a random state

array([0., 0.])

In [42]:
from time import sleep
from matplotlib.pyplot import imshow

total_reward = 0
obs, _ = env.reset()
state = tuple(map(int, obs))

done = False

total_reward = 0

while not done:
  action = np.argmax(Q_table[*state])
  print(f"Player's sum: {state[0]}")
  print(f"Dealer's showing card: {state[1]}")
  print(f"Usable ace: {state[2]}")
  print(f"Action: {"Hit" if action == 1 else "Stay"}\n")
  obs, reward, done, _, _ = env.step(action)
  total_reward += reward
  state = tuple(map(int, obs))

print(f"Total reward: {total_reward}")

Player's sum: 12
Dealer's showing card: 10
Usable ace: 0
Action: Hit

Player's sum: 21
Dealer's showing card: 10
Usable ace: 0
Action: Stay

Total reward: 1.0
