In [2]:
import numpy as np
import random

# Define constants
HIT = 0
STAND = 1
ACTIONS = [HIT, STAND]

# Initialize the policy, Q-values, and returns
Q = {}
policy = {}
returns_sum = {}
returns_count = {}

# Fill the module-level tables for every state and action
def initialize_state_action_space():
    """Reset Q, the return statistics and the policy for every state.

    A state is the tuple ``(player_sum, dealer_card, usable_ace)`` with
    ``player_sum`` in 12..21, ``dealer_card`` in 1..10 and ``usable_ace``
    either True or False.  For each state and each action in ``ACTIONS`` the
    entries of ``Q``, ``returns_sum`` and ``returns_count`` are set to 0, and
    the state's policy entry is set to HIT or STAND with equal probability.
    """
    for total in range(12, 22):
        for showing in range(1, 11):
            for ace_flag in (True, False):
                state = (total, showing, ace_flag)
                for action in ACTIONS:
                    pair = (state, action)
                    Q[pair] = 0
                    returns_sum[pair] = 0
                    returns_count[pair] = 0
                # One uniform draw per state decides the starting action.
                coin = random.random()
                policy[state] = STAND if coin >= 0.5 else HIT

# Draw a single card value
def _draw_card():
    """Return one card value drawn with replacement (ace = 1, faces = 10)."""
    # The 13 ranks are equally likely; 10, J, Q and K all count as 10, so a
    # ten-valued card comes up with probability 4/13 rather than 1/10.
    return min(random.randint(1, 13), 10)


# Play out the dealer's hand
def _play_dealer(showing_card):
    """Return the dealer's final total, starting from ``showing_card``.

    The dealer draws until the total is 17 or more.  One ace is counted as
    11 whenever doing so does not push the total over 21.  A returned value
    above 21 means the dealer went bust.
    """
    hard_total = showing_card
    has_ace = showing_card == 1
    while True:
        if has_ace and hard_total + 10 <= 21:
            total = hard_total + 10
        else:
            total = hard_total
        if total >= 17:
            return total
        card = _draw_card()
        hard_total += card
        has_ace = has_ace or card == 1


# Simulate a Blackjack game episode with exploring starts
def generate_episode():
    """Simulate one episode that begins from a random state and action.

    The start state ``(player_sum, dealer_card, usable_ace)`` and the first
    action are drawn uniformly at random (exploring starts).  Every later
    action is read from the module-level ``policy``; a state missing from
    ``policy`` gets a uniformly random action instead.

    Returns:
        A tuple ``(episode, reward)``.  ``episode`` is the list of
        ``(state, action)`` pairs in the order they were visited, and
        ``reward`` is 1 for a player win, -1 for a loss and 0 for a draw.
    """
    # Randomly pick a start state and action
    player_sum = random.randint(12, 21)
    dealer_card = random.randint(1, 10)
    usable_ace = random.choice([True, False])
    action = random.choice(ACTIONS)

    episode = [((player_sum, dealer_card, usable_ace), action)]

    # The player keeps drawing for as long as the chosen action is HIT.
    while action == HIT:
        # player_sum is always at least 12 here, so a drawn ace can only
        # ever count as 1; no "count it as 11" case is needed.
        player_sum += _draw_card()
        if player_sum > 21:
            if not usable_ace:
                return episode, -1
            # Demote the usable ace from 11 to 1 instead of going bust.
            player_sum -= 10
            usable_ace = False

        state = (player_sum, dealer_card, usable_ace)
        # Only draw a random fallback action when the state has no entry.
        action = policy[state] if state in policy else random.choice(ACTIONS)
        episode.append((state, action))

    # The player stands: the dealer plays and the totals are compared.
    dealer_sum = _play_dealer(dealer_card)
    if dealer_sum > 21 or player_sum > dealer_sum:
        reward = 1
    elif player_sum < dealer_sum:
        reward = -1
    else:
        reward = 0
    return episode, reward

# Monte Carlo Control with Exploring Starts
def monte_carlo_control(num_episodes=500000):
    """Learn a greedy policy by averaging sampled episode returns.

    The module-level tables are re-initialised first.  Then, for each of
    ``num_episodes`` simulated episodes, the episode's final reward is added
    to the return statistics of every (state, action) pair visited, Q is
    refreshed as the running average, and the policy for each visited state
    is set to the action with the largest Q value.  On a tie the action
    listed first in ``ACTIONS`` wins.

    Args:
        num_episodes: Number of episodes to simulate.

    Returns:
        The module-level ``policy`` dict mapping each state to an action.
    """
    initialize_state_action_space()

    for _ in range(num_episodes):
        visited_pairs, episode_return = generate_episode()

        for pair in visited_pairs:
            visited_state = pair[0]

            # Fold this episode's return into the running average.
            returns_sum[pair] += episode_return
            returns_count[pair] += 1
            Q[pair] = returns_sum[pair] / returns_count[pair]

            # Greedy improvement: max() keeps the first action on a tie.
            policy[visited_state] = max(
                ACTIONS, key=lambda a: Q[(visited_state, a)]
            )

    return policy

# Run the algorithm with its default episode count and keep the returned
# state -> action mapping.
optimal_policy = monte_carlo_control()

# Print one line per state. The action is shown as its integer code
# (0 = HIT, 1 = STAND, as defined by the constants at the top of the file).
for state, action in optimal_policy.items():
    print(f"State: {state}, Optimal Action: {action}")


State: (12, 1, True), Optimal Action: 0
State: (12, 1, False), Optimal Action: 0
State: (12, 2, True), Optimal Action: 0
State: (12, 2, False), Optimal Action: 0
State: (12, 3, True), Optimal Action: 0
State: (12, 3, False), Optimal Action: 0
State: (12, 4, True), Optimal Action: 0
State: (12, 4, False), Optimal Action: 0
State: (12, 5, True), Optimal Action: 0
State: (12, 5, False), Optimal Action: 0
State: (12, 6, True), Optimal Action: 0
State: (12, 6, False), Optimal Action: 0
State: (12, 7, True), Optimal Action: 0
State: (12, 7, False), Optimal Action: 0
State: (12, 8, True), Optimal Action: 0
State: (12, 8, False), Optimal Action: 0
State: (12, 9, True), Optimal Action: 0
State: (12, 9, False), Optimal Action: 0
State: (12, 10, True), Optimal Action: 0
State: (12, 10, False), Optimal Action: 0
State: (13, 1, True), Optimal Action: 0
State: (13, 1, False), Optimal Action: 0
State: (13, 2, True), Optimal Action: 0
State: (13, 2, False), Optimal Action: 0
State: (13, 3, True), Opti