In [None]:
from blackjack_game import Blackjack
from blackjack_strategy import Strategy
from dqn import DQN
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
# Hide deprecation warnings
import warnings
warnings.filterwarnings("ignore")

## Strategy

In [None]:
# Load the hard- and soft-hand tables of basic strategy from CSV.
# Both files share one path prefix and differ only in the 'hard.csv' / 'soft.csv' suffix,
# so the prefix is defined once and reused for both sources.
file_path = './strategy/strategy_'
basic_strategy = Strategy(src_hard=file_path + 'hard.csv', src_soft=file_path + 'soft.csv')

In [None]:
def get_strategy(rl_agent):
    """Build hard and soft strategy tables from the agent's greedy decisions.

    For every dealer card (1-10) the agent is queried with epsilon 0 for each hard
    player sum (4-20) and each soft player sum (12-20). The chosen action is stored
    in ``strategy.hard`` / ``strategy.soft`` (row = sum offset, column = dealer card - 1).

    Table encoding: the agent's action is stored as is, except that a first choice of
    2 (double) or 3 (surrender) is stored doubled (4 or 6) when the agent's
    alternative action, with 2 and 3 forbidden, is 1 (hit).

    :param rl_agent: agent offering ``play_one_step(observation, epsilon, invalid_actions)``
    :return: the filled ``Strategy`` object
    """
    strategy = Strategy()
    # The environment is only used to generate observations for given hands
    game = Blackjack()

    def table_entry(total, upcard, soft):
        # Primary (greedy) decision for the requested hand
        observation, invalid_actions = game.reset(total, upcard, soft)
        choice = rl_agent.play_one_step(observation, 0, invalid_actions)
        if choice != 2 and choice != 3:
            return choice
        # Double/surrender chosen: ask again with both forbidden to find the fallback.
        # The last observation entry is cleared first (presumably the first-round flag
        # -- confirm against Blackjack's observation layout).
        observation[-1] = 0
        invalid_actions += [2, 3]
        fallback = rl_agent.play_one_step(observation, 0, invalid_actions)
        # A hit fallback is encoded as twice the primary action
        return choice * 2 if fallback == 1 else choice

    for upcard in range(1, 11):
        column = upcard - 1
        # Hard hands: player's sum 4-20
        for total in range(4, 21):
            strategy.hard[total - 4, column] = table_entry(total, upcard, False)
        # Soft hands: player's sum 12-20
        for total in range(12, 21):
            strategy.soft[total - 12, column] = table_entry(total, upcard, True)
    return strategy


def test_strategy(strategy, n_games, seed=None, show=False):
    # Create new environment and play n games (action values are read from strategy tables) to collect game data
    game = Blackjack(seed)
    rewards = []
    for n in range(n_games):
        game.reset()
        if show:
            print('Game {}/{}:\n'.format(n + 1, n_games))
            game.show()
        while not game.done:
            # Look up action in correct table (soft or hard)
            action = strategy.action(game.players_sum, game.dealer[0], game.usable_ace)
            # Case: action is DS or DH
            if action == 2 or action == 4:
                if not game.first_round:
                    action = int(action / 2 == 2)
                else:
                    action = 2
            # Case: action is RS or RH
            if action == 3 or action == 6:
                if not game.first_round:
                    action = int(action / 2 == 3)
                else:
                    action = 3
            game.step(action)
            if show:
                game.show()
        rewards.append(game.reward)
        if show:
            print()
    # Get relative frequency of wins (positive values), losses (negative values), and ties (0s)
    s_rewards = np.sign(rewards)
    rel_freq = lambda r: np.sum(s_rewards == r) / n_games
    data = {'Wins': [rel_freq(1)],
            'Losses': [rel_freq(-1)],
            'Draws': [rel_freq(0)],
            'Mean score': [np.mean(rewards)]}
    # Print data frame and return mean score
    if not show:
        print(pd.DataFrame(data).to_string(index=False))
    return data['Mean score'][0]

## Deep Q-Learning

In [None]:
# Agent configuration (all values are handed to the DQN constructor below)
n_obs = 30
n_actions = 4
hidden_layers = [(64, 'elu'), (64, 'elu')]
discount_factor = 0.99
buffer_size = 120000
# Last constructor argument; None here (presumably a path of a saved model to load -- confirm in dqn.DQN)
file = None

# Learning rate falls from 1e-3 to 1e-5 over 72000 optimizer steps
# Best 64, 64 hidden, lr_decay=(1e-3, 72000, 1e-5), optimal_lr=2.5e-4, df=0.99
lr_decay = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1e-3,
    decay_steps=72000,
    end_learning_rate=1e-5,
)
optimizer = keras.optimizers.Adam(learning_rate=lr_decay)

agent = DQN(n_obs, hidden_layers, n_actions, optimizer, discount_factor, buffer_size, file)

In [None]:
# Session parameters
n_episodes = 80000              # training episodes
n_pretrain = 20000              # episodes played with epsilon=1 before training starts
validation_interval = 500       # validate every this many training episodes
validation_games = 10000        # games played per validation
validation_seed = random.randint(0, 999)
update_target_interval = 4000   # training episodes between target model updates
batch_size = 256
# Linear epsilon schedule: (start value, final value, episodes until final value is reached)
epsilon_decay = 0.99, 0.01, 70000

# Pretraining episodes come first, followed by the training episodes
env = Blackjack()
mean_scores, matches = [], []
best = None
for e in range(n_episodes + n_pretrain):
    # Training episodes count from 0; pretraining episodes get negative numbers
    episode = e - n_pretrain
    if episode >= 0:
        # Interpolate linearly between start and final epsilon, clamped at the final value
        epsilon = epsilon_decay[0] - min(episode / epsilon_decay[2], 1) * (epsilon_decay[0] - epsilon_decay[1])
    else:
        # Pretraining: act completely at random
        epsilon = 1

    # Play one full game and store every transition in the replay buffer
    state, invalid = env.reset()
    done = False
    while not done:
        # Agent picks an action (a random one with probability epsilon)
        agents_action = agent.play_one_step(state, epsilon, invalid)
        next_state, reward, done, invalid = env.step(agents_action)
        agent.add_experience(state, agents_action, reward, next_state, done, invalid)
        state = next_state

    # No learning during pretraining
    if episode < 0:
        continue

    # One training step per episode; target model is refreshed periodically
    agent.training_step(batch_size)
    if episode > 0 and episode % update_target_interval == 0:
        agent.update_target_model()

    # Validation only happens at the end of every validation interval
    if (episode + 1) % validation_interval != 0:
        continue

    # Compare agent's strategy tables with basic strategy and report the match
    agents_strategy = get_strategy(agent)
    match = agents_strategy.match(basic_strategy)
    matches.append(match)
    output_str = 'Episode {}/{} '.format(episode + 1, n_episodes) \
        + '- match (basic strategy) = {}%'.format(round(match * 100, 1))
    print(output_str)

    # Score the strategy on the seeded validation games, then reseed the global RNG
    # (presumably because the seeded environment fixes it -- confirm in Blackjack)
    score = test_strategy(agents_strategy, validation_games, validation_seed)
    mean_scores.append(score)
    random.seed()

    # Keep the model with the best (or equally good) validation score
    if best is None or score >= best:
        agent.save('Blackjack_dqn')
        best = score
    print()

# Plot validation results against the training episode at which each validation ran.
# Validations happen after every `validation_interval` training episodes, so the i-th
# entry (0-based) belongs to episode (i + 1) * validation_interval. Separate x values are
# built per series so the plot still works if training was interrupted between the two appends.
score_episodes = np.arange(1, len(mean_scores) + 1) * validation_interval
match_episodes = np.arange(1, len(matches) + 1) * validation_interval
f, (ax1, ax2) = plt.subplots(2, sharex=True)
ax1.plot(score_episodes, mean_scores)
ax2.plot(match_episodes, matches)
ax1.set_ylabel('Mean score')
ax2.set_ylabel('Match')
ax2.set_xlabel('Episode')
plt.show()

## Testing

In [None]:
# Derive the strategy tables from the agent currently in memory, i.e. in its state
# after the last training episode.
# NOTE(review): the best-scoring model was saved as 'Blackjack_dqn' during training and is
# not reloaded here -- confirm that evaluating the final weights is intended.
agents_strategy = get_strategy(agent)

In [None]:
# Print a heading followed by the agent's strategy tables
# (output() is expected to print the hard and soft tables -- see blackjack_strategy.Strategy)
print('Strategy tables:\n')
agents_strategy.output()

In [None]:
# Share of table entries on which the agent agrees with basic strategy, as a percentage
match_percent = round(agents_strategy.match(basic_strategy) * 100, 2)
print('Vs. basic strategy: {}%'.format(match_percent))

In [None]:
# Evaluate the strategy on a large, unseeded sample of games.
# With show left at False, test_strategy prints the win/loss/draw frequencies and the
# mean score; its return value (the mean score) is the value of this cell.
n_of_games = 1000000
print('Result ({} games):\n'.format(n_of_games))
test_strategy(agents_strategy, n_of_games)

In [None]:
# Play a handful of games with full output for visual inspection.
# With show=True every game prints its state after each step, so the number of games is
# kept small to avoid flooding the notebook output; raise n_shown_games to see more.
n_shown_games = 10
test_strategy(agents_strategy, n_shown_games, show=True)