In [1]:
# Importing dependencies
import gym
import random
import numpy as np

In [2]:
# Creating Blackjack environment
env = gym.make('Blackjack-v0')

In [3]:
# Displaying the space of all possible states and actions
# 32 * 11 * 2 different states (32 different card sums for player, 11 different face up dealer cards, 2 possibilities for number of aces in player possesios)
print(env.observation_space)
print(env.action_space)

Tuple(Discrete(32), Discrete(11), Discrete(2))
Discrete(2)


In [4]:
# Random agent randomly "hits" or "sticks" on each state

class RandomAgent:
    def __init__(self, env):
        self.num_states = 32 * 11 * 2
        self.num_actions = env.action_space.n
    
    def getAction(self, state):
        return random.choice(range(self.num_actions))

In [8]:
# Model-free Q-learning Agent learns from playing episodes

class QAgent:
    def __init__(self, env):
        self.num_states = 32 * 11 * 2
        self.num_actions = env.action_space.n
        self.qtable = np.zeros([32 * 11 * 2, 2])
    
    # Hash table from state to integer
    def index(self, state):
        return state[0] + state[1] * 32 + state[2] * 32 * 11
    
    # Train function. Parameters such as num_episodes, learning_rate, discount_rate, min_exp (min exploration rate) can be tweeked
    
    def train(self, num_episodes = 500000, max_steps = 8, learning_rate = 0.15, discount_rate = 1, min_exp = 0.001):
        # Initialize default exploration rate. This represents probability that the agent will "explore" instead of "exploit"
        exploration_rate = 1
        # Learn state-action values from ~500,000 games
        for episode in range(num_episodes):
            state = env.reset()
            done = None

            for step in range(max_steps): 
                # Choose an action randomly with probability (exploration_rate); choose action with highest Q-value otherwise.
                sample = random.uniform(0, 1)
                if sample > exploration_rate:
                    action = np.argmax(self.qtable[self.index(state), :]) 
                else:
                    action = env.action_space.sample()
            
                # Take action
                next_s, reward, done, info = env.step(action)
        
                # Update Q-value according to Bellmann Equation
                self.qtable[self.index(state), action] = (1 - learning_rate) * self.qtable[self.index(state), action] + learning_rate * (reward + discount_rate * np.max(self.qtable[self.index(next_s), :]))
        
                # Transition to next state
                state = next_s
        
                if done == True: 
                    break
            
            # Update exploration rate (exponential decay with a minimum exploration rate)
            exploration_rate = min_exp + (1 - min_exp) * np.exp(-episode * 0.001)    
            
    def getAction(self, state):
        return np.argmax(self.qtable[self.index(state), :])

In [9]:
# TRAINING. With default parameters, make take up to a couple of minutes. Use commented line instead of 4th line for faster (but less optimal) training

state = env.reset()
q_agent = QAgent(env)
# q_agent.train(num_episodes = 100000)
q_agent.train()

In [19]:
# TESTING

# Compare with a random agent
r_agent = RandomAgent(env)

# Test on 10,000 games
testing_episodes = 10000
max_steps = 6

num_wins = 0
num_losses = 0
num_ties = 0

payout = 0

for episode in range(testing_episodes):
    state = env.reset()
    cur_reward = 0
    
    for step in range(max_steps):
        # action = r_agent.getAction(state)
        action = q_agent.getAction(state) # Get action from Q agent
        
        # Apply action
        state, reward, done, info = env.step(action)
        
        cur_reward += reward
        
        if done:
            break
    
    # Update wins/losses/ties according to reward
    if (cur_reward == 1):
        num_wins += 1
    elif (cur_reward == 0):
        num_ties += 1
    else:
        num_losses += 1
        
    payout += cur_reward
        
# Print diagnostics
print("Win rate:", num_wins / testing_episodes)
print("Loss rate:", num_losses / testing_episodes)
print("Tie rate:", num_ties / testing_episodes)
print("Payout over 10,000 games:", payout)

Win rate: 0.4005
Loss rate: 0.514
Tie rate: 0.0855
Payout over 10,000 games: -1135.0


In [20]:
# Run this cell to see the agent play!

state = env.reset()
reward = -10

print("State is described as (sum of player cards, face up dealer card, whether or not player has an ace).")
print("Start State:", state)

names = ["Sticks", "Hits"]
participles = ["Sticking", "Hitting"]

for i in range(6):
    action = q_agent.getAction(state)
    
    print("Agent", names[action])
    
    state, reward, done, info = env.step(action)
    
    print("State after", participles[action])
    print(state)
    
    if done:
        break
        
print("\n Final Reward:", reward)

State is described as (sum of player cards, face up dealer card, whether or not player has an ace).
Start State: (11, 6, False)
Agent Hits
State after Hitting
(21, 6, False)
Agent Sticks
State after Sticking
(21, 6, False)

 Final Reward: 1.0
