In [1]:
from easy21 import *
import numpy as np

- Apply Monte-Carlo control to Easy21. 
- [x] Initialise the value function to zero. 
- [x] Use a time-varying scalar step-size of αt = 1/N(st,at) 
- [x] and an ε-greedy exploration strategy with εt = N0/(N0 + N(st)), 
- [x] where N0 = 100 is a constant, 
- [x] N(s) is the number of times that state s has been visited, 
- [x] and N(s,a) is the number of times that action a has been selected from state s. 
- [x] Feel free to choose an alternative value for N0, if it helps producing better results. 
- [ ] Plot the optimal value function V ∗ (s) = maxa Q∗ (s, a) using similar axes to the following figure taken from Sutton and Barto’s Blackjack example.

In [2]:
class MC_Agent:
    def __init__(self, environment, n0):
        self.n0 = float(n0)
        self.env = environment
        
        # N(s) is the number of times that state s has been visited
        # N(s,a) is the number of times that action a has been selected from state s.
        self.N = np.zeros((self.env.dealer_values_count,
                           self.env.player_values_count, 
                           self.env.actions_count))
        
        self.Q = np.zeros((self.env.dealer_values_count,
                           self.env.player_values_count, 
                           self.env.actions_count))
        # self.E = np.zeros((self.env.dealer_values_count,
        #                    self.env.player_values_count, 
        #                    self.env.actions_count))

        # Initialise the value function to zero. 
        self.V = np.zeros((self.env.dealer_values_count, self.env.player_values_count))
        
        self.count_wins = 0
        self.iterations = 0

#     def get_action(self, s):
#         a = Actions.hit
#         return a
    
        # get optimal action, with epsilon exploration (epsilon dependent on number of visits to the state)
    # ε-greedy exploration strategy with εt = N0/(N0 + N(st)), 
    def get_action(self, state):
        dealer_idx = state.dealer-1
        player_idx = state.player-1
        n_visits = sum(self.N[dealer_idx, player_idx, :])

        # epsilon = N0/(N0 + N(st)
        curr_epsilon = self.n0 / (self.n0 + n_visits)

        # epsilon greedy policy
        if random.random() < curr_epsilon:
            return Actions.hit if random.random()<0.5 else Actions.stick
        else:
            return Actions.to_action(np.argmax(self.Q[dealer_idx, player_idx, :]))

    def train(self, iterations):        
        
        # Loop episodes
        for episode in xrange(iterations):
            episode_pairs = []
            
            # get initial state for current episode
            s = self.env.get_start_state()
            
            # Execute until game ends
            while not s.term:
                
                # get action with epsilon greedy policy
                a = self.get_action(s)
                
                # store action state pairs
                episode_pairs.append((s, a))
                
                # update visits
                # N(s) is the number of times that state s has been visited
                # N(s,a) is the number of times that action a has been selected from state s. 
                self.N[s.dealer-1, s.player-1, Actions.as_int(a)] += 1
                
                # execute action
                s,r = self.env.step(s, a)

            #if episode%10000==0: print "Episode: %d, Reward: %d" %(episode, my_state.rew)
            self.count_wins = self.count_wins+1 if r==1 else self.count_wins

            # Update Action value function accordingly
            for curr_s, curr_a in episode_pairs:
                # print s.dealer, s.player, s.r, a
                dealer_idx = curr_s.dealer-1
                player_idx = curr_s.player-1
                action_idx = Actions.as_int(curr_a)
                
                # Use a time-varying scalar step-size of αt = 1/N(st,at) 
                step = 1.0 / sum(self.N[dealer_idx, player_idx, :])
                error = r - self.Q[dealer_idx, player_idx, action_idx]
                self.Q[dealer_idx, player_idx, action_idx] += step * error

        self.iterations += iterations
        print float(self.count_wins)/self.iterations*100

        # Derive value function
        for d in xrange(self.env.dealer_values_count):
            for p in xrange(self.env.player_values_count):
                self.V[d,p] = max(self.Q[d, p, :])

# TODO 
#  add missing values
#  make train so it can be ran multiple sets of itteractions
#   (and figure out when to compute the value function and to make sure its done from start of file

In [3]:
# where N0 = 100 is a constant, 
N0 = 100

In [4]:
N0 = 100
agent = MC_Agent(Environment(), N0)
for i in xrange (10):
    agent.train(50000)

50.018
50.508
50.9506666667
51.1525
51.3256
51.4363333333
51.5508571429
51.62825
51.6582222222
51.7446


In [7]:
# Feel free to choose an alternative value for N0, if it helps producing better results.

In [5]:
N0 = 300
agent = MC_Agent(Environment(), N0)
for i in xrange (10):
    agent.train(50000)

48.242
49.48
49.9646666667
50.3435
50.5864
50.7013333333
50.8582857143
50.964
51.0493333333
51.1312


In [6]:
N0 = 30
agent = MC_Agent(Environment(), N0)
for i in xrange (10):
    agent.train(50000)

49.776
50.677
51.0226666667
51.1155
51.3028
51.3463333333
51.3622857143
51.4335
51.4482222222
51.4692


In [8]:
N0 = 110
agent = MC_Agent(Environment(), N0)
for i in xrange (10):
    agent.train(50000)

49.77
50.492
50.7446666667
50.9765
51.0676
51.185
51.2651428571
51.38325
51.4633333333
51.5084


In [9]:
N0 = 90
agent = MC_Agent(Environment(), N0)
for i in xrange (10):
    agent.train(50000)

49.658
50.451
50.978
51.1715
51.2976
51.4263333333
51.5165714286
51.61475
51.6937777778
51.7638
