In [1]:
import gym
import numpy as np
from algorithms import *
import matplotlib as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable


In [2]:
# Create the 'Blackjack-v1' environment through gym's registry; `env` is the
# environment object handed to the Monte Carlo routines in later cells.
env = gym.make('Blackjack-v1')

In [None]:
def plot_blackjack_values(V):
    """Draw the state values in ``V`` as two stacked 3D surface plots.

    The top subplot shows states with a usable ace, the bottom subplot shows
    states without one. Each surface spans player sums 11..21 and dealer
    cards 1..10.

    Parameters
    ----------
    V : mapping
        Looked up with keys ``(player_sum, dealer_card, usable_ace)``. States
        that are missing from the mapping are drawn with value 0.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # The notebook's import cell binds `plt` to the top-level `matplotlib`
    # package, which has no callable `figure` and no `show`. Import the pyplot
    # interface locally so `plt.figure(...)` / `plt.show()` actually work.
    import matplotlib.pyplot as plt

    def get_Z(x, y, usable_ace):
        # `.get` reads the value without inserting missing keys into V.
        return V.get((x, y, usable_ace), 0)

    def get_figure(usable_ace, ax):
        x_range = np.arange(11, 22)   # player sums 11..21
        y_range = np.arange(1, 11)    # dealer cards 1..10
        X, Y = np.meshgrid(x_range, y_range)

        # Evaluate V at every grid point, then restore the grid shape.
        Z = np.array(
            [get_Z(x, y, usable_ace) for x, y in zip(np.ravel(X), np.ravel(Y))]
        ).reshape(X.shape)

        # Colour scale is pinned to [-1, 1] so both subplots are comparable.
        ax.plot_surface(X, Y, Z, rstride=1, cstride=1,
                        cmap=plt.cm.coolwarm, vmin=-1.0, vmax=1.0)
        ax.set_xlabel('Player\'s Current Sum')
        ax.set_ylabel('Dealer\'s Showing Card')
        ax.set_zlabel('State Value')
        # Keep the default elevation, rotate the azimuth to -120 degrees.
        ax.view_init(ax.elev, -120)

    fig = plt.figure(figsize=(20, 20))
    ax = fig.add_subplot(211, projection='3d')
    ax.set_title('Usable Ace')
    get_figure(True, ax)
    ax = fig.add_subplot(212, projection='3d')
    ax.set_title('No Usable Ace')
    get_figure(False, ax)
    plt.show()

In [3]:
def sample_policy(observation):
    """Fixed threshold policy over a 3-element observation.

    Parameters
    ----------
    observation : sequence of length 3
        Only the first element (the score) is used; the other two are ignored.

    Returns
    -------
    int
        0 when the score is 20 or more, otherwise 1.
        (Presumably 0 = stick and 1 = hit in Gym's Blackjack action encoding;
        confirm against the environment docs.)
    """
    player_total, _dealer_card, _usable_ace = observation
    if player_total >= 20:
        return 0
    return 1

In [4]:
# Run first_visit_mc_prediction_state_value (presumably provided by the
# star-imported `algorithms` module) with sample_policy on env for 10,000
# episodes. The result V is what plot_blackjack_values is called with below.
V = first_visit_mc_prediction_state_value(sample_policy, env, num_episodes=10000)

100%|██████████| 10000/10000 [00:00<00:00, 10168.76it/s]


In [None]:
# Plot V as two surfaces: one for usable-ace states, one for no-usable-ace states.
plot_blackjack_values(V)

In [6]:
# Build a random initial policy: for every (player sum, dealer card, usable ace)
# combination, draw an action uniformly from {0, 1}.
# NOTE: defaultdict() without a factory behaves like a plain dict (missing keys
# still raise KeyError); the type is kept in case the algorithm relies on it.
es_policy = defaultdict()

# The loop variable is named `player_sum` rather than `sum` so the builtin
# `sum` is not shadowed in the notebook's global namespace for later cells.
for player_sum in range(1, 33):
    for dealer in range(1, 11):
        for usable in [False, True]:
            es_policy[(player_sum, dealer, usable)] = np.random.choice([0, 1])
            

In [7]:
# Run monte_carlo_exploring_starts with the random initial policy es_policy on
# env; the third argument (100000) is presumably the episode count, matching
# the progress bar total. The call returns a pair unpacked as Q and pi
# (presumably action values and the resulting policy; confirm in `algorithms`).
Q, pi = monte_carlo_exploring_starts(es_policy, env, 100000)

100%|██████████| 100000/100000 [00:09<00:00, 10556.35it/s]
