In [1]:
import numpy as np

# Value Loss

In [2]:
def r_gamma(rewards: np.array, gamma):
    r_gamma = 0
    for reward in rewards[:0:-1]:  # Slicing to reverse except the first element
        r_gamma = gamma * (r_gamma + reward)
        print(reward)
    r_gamma += rewards[0]
    return r_gamma

def value_function(state): #TODO: implement this
    return 0

def v_loss(r_gamma, state, deltas):
    return (np.clip(r_gamma, -deltas[1], deltas[2])-value_function(state))**2

# Policy Loss

In [49]:
def policy(state):
    if state == 'Preflop':
        # return np.array([0.5, 0.5, 0, 0, 0])
        return np.array([0, 1, 0, 0, 0])
    elif state == 'Flop':
        # return np.array([0, 0.5, 0.5, 0, 0])
        return np.array([0, 0, 1, 0, 0])
    elif state == 'Turn':
        # return np.array([0, 0, 1/3, 1/3, 1/3])
        return np.array([0, 0, 0, 1, 0])
    elif state == 'River':
        # return np.array([0, 1/2, 1/2, 0, 0])
        return np.array([0, 0, 1, 0, 0])
    return 0

def get_deltas(state): # I'm not sure wether the deltas should add the chips from previous states or just the current state
    delta1 = 3
    if state == 'Preflop':
        delta2 = 20
        delta3 = 10 # The opponent put in the big blind and the agent just betted. The opponnent hasnt put in any chips yet.
    elif state == 'Flop':
        delta2 = 40
        delta3 = 20
    elif state == 'Turn':
        delta2 = 120
        delta3 = 80
    elif state == 'River':
        delta2 = 120
        delta3 = 120
    return delta1, delta2, delta3

def ratio(old_policy, new_policy, action, state):
    return new_policy(state)[action] / old_policy(state)[action]

def a_gae(results, states, value_function, gamma, lambda_):
    """
    Generalized Advantage Estimator (GAE) where:
      - len(states) == len(results)
      - We do NOT assume an extra 'terminal state' beyond these states.
    
    results:       list/array of rewards at each timestep
    states:        list/array of states at each timestep
    value_function: function that takes a state and returns a scalar value
    gamma:         discount factor
    lambda_:       GAE parameter
    """
    N = len(results)
    if N == 0:
        return 0.0
    
    # For convenience, compute V(s0) once
    v0 = value_function(states[0])
    
    # --------------------------------------------------------
    # 1) Precompute partial sums of discounted rewards:
    #    S[k] = sum_{i=0..k-1} gamma^i * results[i], with S[0] = 0
    #
    #    Then the "raw" advantage term (before weighting by λ^(k-1)) is:
    #       a_k = - V(s0) + S[k] + gamma^k * V(sk),
    #    for k in 1..N-1 (because states[k] must be valid).
    # --------------------------------------------------------
    S = np.zeros(N+1, dtype=float)
    for i in range(N):
        S[i+1] = S[i] + (gamma ** i) * results[i]

    # --------------------------------------------------------
    # 2) Accumulate the GAE sum:
    #
    #    A = (1 - λ) * Σ (λ^(k-1) * a_k), for k = 1..N-1
    #
    #    We use k=1..N-1 so that states[k] is still in range.
    # --------------------------------------------------------
    gae_sum = 0.0
    for k in range(1, N):
        a_k = -v0 + S[k] + (gamma ** k) * value_function(states[k])
        gae_sum += (lambda_ ** (k - 1)) * a_k
    
    return (1 - lambda_) * gae_sum

# I wasn't sure how to treat the showdown state. The approach I am following is when the only states that are fed to the a_gae function are the river and the showdown, the resulting a_k() is 
# -V(river) + r(river) + V(showdown). I think the values for the river state and the showdown state will be different because the showdown value depends on the amount of chips that the agent has played
# in the river state.
    
def tc_loss_function(ratio, advantage, epsilon, deltas): #We compute this for every hand and then average it
    return np.clip(ratio, np.clip(ratio, 1 - epsilon, 1 + epsilon), deltas[0]) * advantage
    

In [4]:
def get_action(policy: callable, state):
    return np.random.choice(len(policy(state)), p=policy(state))

In [5]:
rewards = np.array([-20, -40, 0, -100, 320])
deltas = [3, 160, 160]
print(r_gamma(rewards, 0.999))
print(v_loss(r_gamma(rewards, 0.999), 0, deltas))

320
-100
0
-40
159.06161882032
320
-100
0
-40
25300.598581740778


In [33]:
def get_losses(states, rewards, policy, value_function):
    tc_loss = 0
    value_loss = 0
    states_without_showdown = states[:-1]
    for i, state in enumerate(states_without_showdown):
        deltas = get_deltas(state)
        rewards_from_now = rewards[i:]
        states_from_now = states[i:]
        advantage = a_gae(rewards_from_now, states_from_now, value_function, 0.999, 0.99) #I'm not sure if this is correct
        action = get_action(policy, state)
        old_policy = policy
        new_policy = policy
        r = ratio(old_policy, new_policy, action, state)
        tc_loss += tc_loss_function(r, advantage, 0.2, deltas)
        value_loss += v_loss(r_gamma(rewards_from_now, 0.999), 0, deltas)

    tc_loss /= len(states_without_showdown)
    value_loss /= len(states_without_showdown)
    return tc_loss, value_loss

In [45]:
states = ['Preflop', 'Flop', 'Turn', 'River', 'Showdown']
rewards = [-20, -20, -80, 0, 240] #There should be one more reward than states
tc_loss = 0
value_loss = 0



states_without_showdown = states[:-1]
for i, state in enumerate(states_without_showdown):
    deltas = get_deltas(state)
    print(states[i:])
    rewards_from_now = rewards[i:]
    states_from_now = states[i:]
    advantage = a_gae(rewards_from_now, states_from_now, value_function, 0.999, 0.99) #I'm not sure if this is correct
    print('Advantage: ', advantage)
    action = get_action(policy, state)
    old_policy = policy
    new_policy = policy
    r = ratio(old_policy, new_policy, action, state)
    print(deltas[1], deltas[2])
    tc_loss += tc_loss_function(r, advantage, 0.2, deltas)
    value_loss += v_loss(r_gamma(rewards_from_now, 0.999), 0, deltas)

tc_loss /= len(states_without_showdown)
value_loss /= len(states_without_showdown)
print('TC loss: ', tc_loss)
print('Value loss: ', value_loss)

['Preflop', 'Flop', 'Turn', 'River', 'Showdown']
1 [-20, -20, -80, 0, 240]
1 -20.0
2 [-20, -20, -80, 0, 240]
2 -39.980000000000004
3 [-20, -20, -80, 0, 240]
3 -119.82008
4 [-20, -20, -80, 0, 240]
4 -119.82008
Advantage:  -2.932771642119203
20 10
240
0
-80
-20
['Flop', 'Turn', 'River', 'Showdown']
1 [-20, -80, 0, 240]
1 -20.0
2 [-20, -80, 0, 240]
2 -99.92
3 [-20, -80, 0, 240]
3 -99.92
Advantage:  -2.168523920000002
40 20
240
0
-80
['Turn', 'River', 'Showdown']
1 [-80, 0, 240]
1 -80.0
2 [-80, 0, 240]
2 -80.0
Advantage:  -1.5920000000000012
120 80
240
0
['River', 'Showdown']
1 [0, 240]
1 0.0
Advantage:  0.0
120 120
240
TC loss:  -1.6733238905298016
Value loss:  5325.0


In [50]:
states = ['Preflop', 'Flop', 'Turn', 'River', 'Showdown']
rewards = [-20, -20, -80, 0, 240] #There should be one more reward than states
tc_loss = 0
value_loss = 0



states_without_showdown = states[:-1]
for i, state in enumerate(states_without_showdown):
    deltas = get_deltas(state)
    print(states[i:])
    rewards_from_now = rewards[i:]
    states_from_now = states[i:]
    advantage = a_gae(rewards_from_now, states_from_now, value_function, 0.999, 0.99) #I'm not sure if this is correct
    print('Advantage: ', advantage)
    action = get_action(policy, state)
    old_policy = policy
    new_policy = policy
    r = ratio(old_policy, new_policy, action, state)
    print(deltas[1], deltas[2])
    tc_loss += tc_loss_function(r, advantage, 0.2, deltas)
    value_loss += v_loss(r_gamma(rewards_from_now, 0.999), 0, deltas)

tc_loss /= len(states_without_showdown)
value_loss /= len(states_without_showdown)
print('TC loss: ', tc_loss)
print('Value loss: ', value_loss)

['Preflop', 'Flop', 'Turn', 'River', 'Showdown']
Advantage:  -2.932771642119203
20 10
240
0
-80
-20
['Flop', 'Turn', 'River', 'Showdown']
Advantage:  -2.168523920000002
40 20
240
0
-80
['Turn', 'River', 'Showdown']
Advantage:  -1.5920000000000012
120 80
240
0
['River', 'Showdown']
Advantage:  0.0
120 120
240
TC loss:  -1.6733238905298016
Value loss:  5325.0


In [41]:
print(get_losses(states, rewards, policy, value_function))

1 [-20, -20, -80, 0, 240]
2 [-20, -20, -80, 0, 240]
3 [-20, -20, -80, 0, 240]
4 [-20, -20, -80, 0, 240]
240
0
-80
-20
1 [-20, -80, 0, 240]
2 [-20, -80, 0, 240]
3 [-20, -80, 0, 240]
240
0
-80
1 [-80, 0, 240]
2 [-80, 0, 240]
240
0
1 [0, 240]
240
(np.float64(-0.9304432345098008), np.float64(5325.0))
