# Monte Carlo ES

On-policy method with uniform test of intial state

In [1]:
import numpy as np
from numpy.linalg import inv, norm
from numpy.random import choice

#Blackjack rules
#draw at random 1 - 10, 10 ,10, 10
#dealer shows one card.
#player takes two and holds on 20 or 21
#state is dealer-showing, player sum and useable ace.
#dealer sticks on 17 or greater

In [2]:
# Usable Ace
def uaFunc(pCards):
    if min(pCards) > 1: return 0
    if min(pCards) == 1:
        if sum(pCards) <= 11: 
            return 1
        else: return 0 

In [3]:
def stateIdx(dCard, pCards):
    i = dCard - 1
    k = uaFunc(pCards)
    j = sum(pCards) + 10*k - 12
    return [i, j, k] 

In [4]:
def pSum(pCards):
    return sum(pCards) + 10*uaFunc(pCards)

In [5]:
def initState():
    dCard = choice(cards)
    pCards = list(choice(cards, 2, True))
    #print(pCards)
    while pSum(pCards) < 12:
        pCards.append(choice(cards))
    return dCard, pCards

In [6]:
def pPolicy(dCard, pCards, epsilon):
    dsIdx, psIdx, uaIdx = stateIdx(dCard, pCards)
    
    action_values = actionValueMatrix[dsIdx][psIdx][uaIdx]
    if np.random.uniform(0.0, 1.0) < epsilon or (action_values[0] == action_values[1]):
        return choice([0, 1])
    else:
        return np.argmax(action_values) 
    

In [7]:
def pReward(dCard, pCards):
    dCards = [dCard]
    while pSum(dCards) < 17:
        dCards.append(choice(cards))
    if pSum(dCards) > 21:
        return 1
    elif pSum(dCards) >= pSum(pCards):
        return -1
    else:
        return 1

In [8]:
def printRewards():
    print
    print 'Average Reward by state'
    for k in [0,1]:
        print 'k =', k
        for i in range(10):
            print [actionValueMatrix[i, j, k] for j in range(10)]
        print

In [9]:
def printPolicy():
    print
    print 'Policy'
    for k in [0,1]:
        print 'k =', k
        for i in range(10):
            print [actionValueMatrix[i, j, k] for j in range(10)]
        print 

In [10]:
def updatePolicy(rm, stateRec):
    for dsIdx, psIdx, uaIdx, aIdx in stateRec:
        actionValueMatrix[ dsIdx, psIdx, uaIdx, aIdx] = rm[ dsIdx, psIdx, uaIdx, aIdx]

In [11]:
stateMatrix = np.zeros((10, 10, 2))
cards = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10]
actionValueMatrix = np.random.normal(size=(10, 10, 2, 2))

ua = [0,1] 
dVals = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
pVals = [12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

#seq of events
#draw dealer's card
#draw players cards to reach 12
#determine ua or nua
#take more player cards - keep list of sums (or indices)

numMC = 1000
epsilon = 0

for epoch in range(1000):
    rewardMatrix = np.zeros((10, 10, 2, 2))
    countMatrix = np.zeros((10, 10, 2, 2)) 
    
    if epoch % 40 == 0:
        epsilon /= 1.1
    
    for i in range(numMC):
        stateRec = []
        
        #set initial state
        dCard, pCards = initState()
        ua = uaFunc(pCards)

        #record initial state
        dsIdx, psIdx, uaIdx = stateIdx(dCard, pCards)
        action = pPolicy(dCard, pCards, epsilon)
        stateRec.append((dsIdx, psIdx, uaIdx, action))

        #exercise player policy
        while (pSum(pCards) < 21) and action:
            # draw a card
            pCards.append(choice(cards))
            #record state each step
            dsIdx, psIdx, uaIdx = stateIdx(dCard, pCards)
            if (pSum(pCards) < 21):
                action = pPolicy(dCard, pCards, epsilon)
                stateRec.append((dsIdx, psIdx, uaIdx, action))

        #if player busts: assign reward -1
        if pSum(pCards) > 21:
            for dsIdx, psIdx, uaIdx, aIdx in stateRec:
                rewardMatrix[dsIdx][psIdx][uaIdx][aIdx] = -1
        else:
            reward = pReward(dCard, pCards)
            for dsIdx, psIdx, uaIdx, aIdx in stateRec:
                rewardMatrix[dsIdx][psIdx][uaIdx][aIdx] += reward            
                countMatrix[dsIdx][psIdx][uaIdx][aIdx] += 1   


    rm =  rewardMatrix / (countMatrix + 1e-7)
    updatePolicy(rm,stateRec)

    
    
print epsilon
    

0.0


In [12]:
printPolicy()


Policy
k = 0
[array([-1.70595322, -0.66666666]), array([-0.63636363, -0.9999999 ]), array([-0.99999999, -0.24999999]), array([-0.85714285, -1.9999998 ]), array([-0.99999999, -0.33333332]), array([-1.39623591, -0.33333332]), array([-0.49999999, -0.9999999 ]), array([-0.49999999, -0.9999999 ]), array([-0.16666667, -0.31986319]), array([ 0.03011266,  1.24416578])]
[array([-0.5434932, -0.125    ]), array([ 0.       , -1.9999998]), array([-0.7142857 , -0.33333332]), array([-0.49999999, -0.87123258]), array([-0.76648548, -0.33333332]), array([-0.5      , -1.9999998]), array([-0.59999999, -0.49999998]), array([ 0.66666666, -0.49999998]), array([  5.99999994e-01,  -1.00000000e+07]), array([ 0.99999997, -1.00432363])]
[array([-0.7142857, -0.2      ]), array([        0., -10000000.]), array([ -1.66666665e-01,  -1.00000000e+07]), array([-1.08228868,  0.        ]), array([ -8.18181811e-01,  -1.00000000e+07]), array([-0.99999999, -0.9999999 ]), array([ 0.66666666, -0.9999999 ]), array([ -9.9999998