In [2]:
from env import *
from utils import *

In [3]:
dealer_intervals = np.array([[1, 4], [4, 7], [7, 10]])
player_intervals = np.array([[1, 6], [4, 9], [7, 12], [10, 15], [13, 18], [16, 21]])
actions = np.array([0, 1])


def initialize_weights_or_features(value=0):
  return np.full((dealer_intervals.shape[0], player_intervals.shape[0], actions.shape[0]), value, dtype=float)


def find_intervals(sum_value, intervals):
    found_intervals = []

    for i in range(len(intervals)):
        if intervals[i][0] <= sum_value <= intervals[i][1]:
            found_intervals.append(i)

    return found_intervals

In [4]:
def approx_Q(state, action, weights, features):
  dealer_sum, player_sum = state
  found_dealer_intervals = find_intervals(dealer_sum, dealer_intervals)
  found_player_intervals = find_intervals(player_sum, player_intervals)

  value = 0
  for i in found_dealer_intervals:
    for j in found_player_intervals:
      value += weights[i][j][action] * features[i][j][action]

  return value

In [5]:
def epsilon_greedy(state, weights, features):
  r = random.random()
  if r < 0.05:
    return random.choice([action.value for action in Action])
  q0 = approx_Q(state, 0, weights, features)
  q1 = approx_Q(state, 1, weights, features)
  return 0 if q0 > q1 else 1  # could choose randomly here if they are equal

In [6]:
def phi(state, action):
  dealer_sum, player_sum = state
  found_dealer_intervals = find_intervals(dealer_sum, dealer_intervals)
  found_player_intervals = find_intervals(player_sum, player_intervals)

  features = np.zeros((3, 6, 2), dtype=int)

  for i in found_dealer_intervals:
    for j in found_player_intervals:
      features[i, j, action] = 1

  return features

In [10]:
def sarsa_lambda(env: Env, _lambda=0, gamma=1,num_episodes=1000, N_0=100, show_progress=True):
  features = initialize_weights_or_features()
  weights = initialize_weights_or_features()
  N_s = {}
  N_sa = {}
  mse = []
  wins = 0
  alpha = 0.01

  for i in range(num_episodes):
    e = initialize_weights_or_features()
    dealer_sum = NewCard(firstCard=True).get_value()
    player_sum = NewCard(firstCard=True).get_value()
    terminated = False
    state = (dealer_sum, player_sum)
    action = int(epsilon_greedy(state, weights, features))    
    current_Q = approx_Q(state, action, weights, features) 

    while not terminated:
      N_s[state] = N_s.get(state, 0) + 1
      N_sa[(state, action)] = N_sa.get((state, action), 0) + 1

      dealer_sum, player_sum, reward, terminated = env.step(
        dealer_sum, player_sum, action
      )
      wins += reward == 1
      
      new_state = (dealer_sum, player_sum)
      new_action = epsilon_greedy(state, weights, features)
      if terminated:
        delta = reward - gamma * current_Q
      else:
        next_Q = approx_Q(new_state, new_action, weights, features) 
        delta = reward + gamma * next_Q - current_Q
      
      e += phi(state, action)
      weights += alpha * delta * e
      weights *= _lambda * gamma
      state = new_state
      action = new_action

    if not show_progress:
      continue

    if i % (num_episodes / 10) == 0 and i > 0:
      print("Episode: %d, score: %f" % (i, (float(wins) / i * 100.0)))

In [11]:
env = Env()

In [16]:
sarsa_lambda(env, _lambda=0.5, num_episodes=10000)

Episode: 1000, score: 52.300000
Episode: 2000, score: 53.200000
Episode: 3000, score: 53.466667
Episode: 4000, score: 53.250000
Episode: 5000, score: 52.960000
Episode: 6000, score: 52.400000
Episode: 7000, score: 52.414286
Episode: 8000, score: 52.500000
Episode: 9000, score: 52.633333
