In [19]:
import gym
import numpy as np
import random
from collections import defaultdict

In [24]:
# https://github.com/openai/gym/blob/master/gym/envs/toy_text/blackjack.py
env = gym.make('Blackjack-v0')
print(env.observation_space)
print(env.action_space)

Tuple(Discrete(32), Discrete(11), Discrete(2))
Discrete(2)


In [17]:
qtable = np.zeros((32, 11, 2, 2))
# print(qtable)
qtable_dict = defaultdict(lambda: np.zeros(2))  # array of actions

def qtable_func(state):
    # TODO: magic! This is a non-functional example
    return np.array([0.5, 0])

In [44]:
total_episodes = 100000       # Total episodes
learning_rate = 0.6           # Learning rate
max_steps = 99                # Max steps per episode
gamma = 0.99                  # Discounting rate

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability 
decay_rate = 0.01             # Exponential decay rate for exploration prob

In [45]:
# List of rewards
rewards = []

# Learn through the episodes
for episode in range(total_episodes):
    # Reset the environment
    state = env.reset()
    done = False
    total_rewards = 0
    
    for step in range(max_steps):
        player_total = state[0]
        dealer_showing = state[1]
        has_ace = 1 if state[2] else 0
        
        # Action selection - decide if we explore or exploit
        if random.uniform(0, 1) < epsilon:
            # Time to explore!
            action = env.action_space.sample()
        else:
            # Exploit based on best available rewards
            action = np.argmax(qtable_dict[state])
#             action = np.argmax(qtable[player_total,
#                                       dealer_showing,
#                                       has_ace, :])
        
        # Take the action, observe the outcome and reward
        new_state, reward, done, info = env.step(action)
        
        
        # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
        # qtable[new_state,:] : all the actions we can take from new state
        qtable[player_total,
               dealer_showing,
               has_ace,
               action] = (qtable[player_total,
                                 dealer_showing,
                                 has_ace,
                                 action] +
                          learning_rate *
                          (reward + gamma *
                           np.max(qtable[new_state[0], new_state[1], 1 if new_state[2] else 0, :]) -
                           qtable[player_total, dealer_showing, has_ace, action]))
        
        
        # Update the qtable with new expected rewards
        # qtable_dict[state][action] += (learning_rate *
        #                                (reward + gamma *
        #                                 (np.max(qtable_dict[new_state]) -
        #                                  np.max(qtable_dict[state]))))
        
        
        total_rewards += reward
        state = new_state
        if done:
            break
    
    # Reduce epsilon (explore less)
    epsilon = (min_epsilon +
               (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode))
    rewards.append(total_rewards)

print('Score over time:', sum(rewards) / total_episodes)

Score over time: -0.16283


In [30]:
# Blackjack is hard (even optimum loses money)
# How do we compare to random?

# List of rewards
rewards = []

# Learn through the episodes
for episode in range(total_episodes):
    # Reset the environment
    state = env.reset()
    done = False
    total_rewards = 0
    
    for step in range(max_steps):
        # Action selection - always random
        action = env.action_space.sample()
        
        # Take the action, observe the outcome and reward
        new_state, reward, done, info = env.step(action)
              
        total_rewards += reward
        state = new_state
        if done:
            break
    
    rewards.append(total_rewards)

print('Score over time:', sum(rewards) / total_episodes)

Score over time: -0.39345


In [46]:
# So we're about twice as good (or half as bad) as random
# Let's play Blackjack!
def print_state(state):
    print('---')
    print('Player sum:', state[0])
    print('Dealer showing:', state[1])
    print('Player has usable ace:', state[2])

rewards = 0
hands = 5

for hand in range(hands):
    state = env.reset()
    print('****************************************************')
    print('HAND', hand)
    
    for step in range(max_steps):
        print_state(state)
        # Take the action with max expected future reward
        action = np.argmax(qtable_dict[state])
        
        print('Hit me!' if action else 'Stay.')
        
        state, reward, done, info = env.step(action)
        
        if done:
            rewards += reward
            print('HAND DONE')
            if reward == 1.0:
                print('You win!')
            elif reward == 0.0:
                print('Draw.')
            else:
                print('You lose!')
            break

print('****************************************************')
print('ALL HANDS COMPLETE')
print('Total score:', rewards)
print('Average per hand:', rewards / hands)
env.close()

****************************************************
HAND 0
---
Player sum: 13
Dealer showing: 2
Player has usable ace: False
Hit me!
HAND DONE
You lose!
****************************************************
HAND 1
---
Player sum: 12
Dealer showing: 3
Player has usable ace: False
Hit me!
---
Player sum: 21
Dealer showing: 3
Player has usable ace: False
Stay.
HAND DONE
You win!
****************************************************
HAND 2
---
Player sum: 18
Dealer showing: 10
Player has usable ace: False
Hit me!
HAND DONE
You lose!
****************************************************
HAND 3
---
Player sum: 17
Dealer showing: 10
Player has usable ace: False
Hit me!
HAND DONE
You lose!
****************************************************
HAND 4
---
Player sum: 21
Dealer showing: 2
Player has usable ace: True
Stay.
HAND DONE
You win!
****************************************************
ALL HANDS COMPLETE
Total score: -1.0
Average per hand: -0.2


In [36]:
qtable_dict

defaultdict(<function __main__.<lambda>()>,
            {(7, 9, False): array([-2.4       , 22.01037774]),
             (9, 9, False): array([ -0.8       , 207.90267968]),
             (19, 9, False): array([259.2      , -50.4566784]),
             (21, 3, True): array([287.2  ,  -8.512]),
             (20, 10, False): array([  2176.  , -26449.44]),
             (29, 10, False): array([0., 0.]),
             (12, 7, False): array([ -1.6       , 468.77281159]),
             (15, 10, False): array([-4.8       , 67.99439883]),
             (12, 10, False): array([-13.6       ,   0.16904587]),
             (8, 8, False): array([ -0.8       , 103.19940133]),
             (15, 2, False): array([-5.6       , 42.65210215]),
             (8, 10, False): array([ -3.2       , 196.42314785]),
             (13, 3, False): array([-1.6       , 89.67811281]),
             (22, 3, False): array([0., 0.]),
             (18, 10, False): array([-10.4       ,  20.63490199]),
             (21, 9, True): arr

In [49]:
qtable[18, 10, 0, :]

array([-0.20743301,  0.81431046])