In [11]:
import numpy as np

# Value Loss

In [25]:
def r_gamma(rewards: np.ndarray, gamma):
    """Return the discounted sum of `rewards`: sum over t of gamma**t * rewards[t].

    Args:
        rewards: Sequence of per-step rewards, ordered from the first step on.
        gamma: Discount factor applied once per step.

    Returns:
        The discounted return; 0 when `rewards` is empty.
    """
    # Guard the empty case: indexing rewards[0] below would raise IndexError.
    if len(rewards) == 0:
        return 0

    discounted = 0
    # Walk backwards over every reward except the first, so that each earlier
    # step ends up multiplied by one more factor of gamma.
    for reward in rewards[:0:-1]:
        discounted = gamma * (discounted + reward)
    # The first reward is not discounted.
    return discounted + rewards[0]


def value_function(state):  # TODO: implement this
    """Placeholder value estimate: ignores `state` and always returns 160.

    NOTE: a later cell in this notebook defines `value_function` again; once
    that cell has run, it shadows this stub.
    """
    placeholder_value = 160
    return placeholder_value

def value_loss(r_gamma, state, deltas):
    """Squared error between the clipped return and the value estimate of `state`.

    Args:
        r_gamma: Discounted return to regress towards.
        state: Passed unchanged to `value_function`.
        deltas: Indexable of clip bounds; the return is clipped to
            [-deltas[1], deltas[2]].

    Returns:
        (clip(r_gamma, -deltas[1], deltas[2]) - value_function(state)) ** 2
    """
    lower_bound = -deltas[1]
    upper_bound = deltas[2]
    clipped_return = np.clip(r_gamma, lower_bound, upper_bound)
    residual = clipped_return - value_function(state)
    return residual ** 2

# Policy Loss

In [14]:
def ratio(old_policy, new_policy, action, state):
    """Return new_policy(action, state) divided by old_policy(action, state).

    Args:
        old_policy: Callable taking (action, state).
        new_policy: Callable taking (action, state).
        action: Forwarded to both policies.
        state: Forwarded to both policies.
    """
    # The new policy is evaluated first, then the old one.
    new_value = new_policy(action, state)
    old_value = old_policy(action, state)
    return new_value / old_value

def a_gae(results, states, value_function: callable, gamma, lambda_):
    """Lambda-weighted combination of k-step advantage estimates.

    For each k from 1 to len(results) - 2 the k-step estimate is

        -V(states[0]) + sum_{i<k} gamma**i * results[i] + gamma**k * V(states[k])

    where V is `value_function`. The function returns

        (1 - lambda_) * sum_k lambda_**(k - 1) * estimate_k

    Args:
        results: Indexable of per-step rewards.
        states: Indexable of states; states[0] is used as the baseline state.
        value_function: Callable mapping a state to its value estimate.
        gamma: Discount factor.
        lambda_: Weighting factor between the k-step estimates.

    NOTE(review): the largest k used is len(results) - 2; whether that upper
    limit is the intended horizon depends on how `states` lines up with
    `results` — confirm against the caller.
    """
    def k_step_estimate(horizon):
        # Baseline: value of the first state of the hand, subtracted.
        estimate = -value_function(states[0])
        for step in range(horizon):
            estimate += gamma ** step * results[step]
        # Bootstrap from the value of the state reached after `horizon` steps.
        estimate += gamma ** horizon * value_function(states[horizon])
        return estimate

    weighted_terms = []
    for exponent in range(len(results) - 2):
        weighted_terms.append(lambda_ ** exponent * k_step_estimate(exponent + 1))
    return (1 - lambda_) * np.sum(weighted_terms)
    
def tc_loss_function(ratio, advantage, epsilon, deltas): #We compute this for every hand and then average it
    """Clipped policy objective term: clipped probability ratio times advantage.

    The ratio is first clipped to [1 - epsilon, 1 + epsilon]; that clipped
    value then serves as the lower bound of a second clip whose upper bound
    is deltas[0]. For deltas[0] >= 1 + epsilon this bounds the ratio to
    [1 - epsilon, deltas[0]].

    Args:
        ratio: Probability ratio between the new and the old policy.
        advantage: Advantage estimate the clipped ratio is multiplied by.
        epsilon: Half-width of the inner clip interval around 1.
        deltas: Indexable of clip bounds. Entry 0 is the upper bound for the
            ratio; entries 1 and 2 are the return bounds read by `value_loss`.

    Returns:
        clip(ratio, clip(ratio, 1 - epsilon, 1 + epsilon), deltas[0]) * advantage
    """
    inner_clipped = np.clip(ratio, 1 - epsilon, 1 + epsilon)
    # Use deltas[0] here: deltas[1] is the return lower-bound magnitude used by
    # value_loss, not a bound on the probability ratio.
    return np.clip(ratio, inner_clipped, deltas[0]) * advantage
    

In [26]:
# Sanity check: print the discounted return of one sample reward sequence,
# then the value loss computed from that return.
GAMMA = 0.999
rewards = np.array([-20, 100, 0, -100, 320])
deltas = [1, 160, 160]
discounted_return = r_gamma(rewards, GAMMA)
print(discounted_return)
print("------------")
print(value_loss(discounted_return, 0, deltas))

298.92161882031996
------------
0.0


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ValueNetwork(nn.Module):
    """Two-branch convolutional network that outputs one scalar per sample.

    The card tensor and the action tensor each go through their own stack:
    two 3x3 convolutions (stride 1, padding 1) with ReLU, followed by global
    average pooling down to 64 features. The two feature vectors are
    concatenated and passed to a small fully connected head.

    Args:
        nb: Accepted by the constructor but not referenced when the layers
            are built.
        card_channels: Number of input channels of the card branch.
        action_channels: Number of input channels of the action branch.
    """

    def __init__(self, nb=9, card_channels=6, action_channels=24):
        super().__init__()

        # One convolutional branch per input tensor, built in this order.
        self.card_conv = self._conv_branch(card_channels)
        self.action_conv = self._conv_branch(action_channels)

        # Each branch contributes 64 pooled features to the head's input.
        self.fc = nn.Sequential(
            nn.Linear(64 + 64, 128),
            nn.ReLU(),
            nn.Linear(128, 1),  # single scalar value
        )

    @staticmethod
    def _conv_branch(in_channels):
        """Build one conv -> ReLU -> conv -> ReLU -> global-average-pool stack."""
        return nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1)),  # reduce to a single feature vector
        )

    def forward(self, card_tensor, action_tensor):
        """Return one value per batch element.

        Args:
            card_tensor: Input of the card branch; its channel dimension must
                equal `card_channels` (the original notes give
                [batch, 6, 4, 13] — TODO confirm against the caller).
            action_tensor: Input of the action branch; its channel dimension
                must equal `action_channels` (the original notes give
                [batch, 24, 4, nb] — TODO confirm against the caller).

        Returns:
            Tensor with the trailing size-1 dimension of the head's output
            squeezed away, i.e. shape [batch].
        """
        # Each branch yields [batch, 64, 1, 1]; flatten to [batch, 64].
        branch_features = [
            torch.flatten(self.card_conv(card_tensor), start_dim=1),
            torch.flatten(self.action_conv(action_tensor), start_dim=1),
        ]
        # Concatenate along the feature dimension and run the head.
        head_output = self.fc(torch.cat(branch_features, dim=1))
        return head_output.squeeze(-1)

# Module-level network instance; `value_function` below reads it as a global.
value_net = ValueNetwork(
    nb=9,
    card_channels=6,
    action_channels=24,
)

# In your code, you'd replace this stub:
def value_function(state):
    """Evaluate the module-level `value_net` on one state and return a scalar.

    Args:
        state: Mapping with the keys 'card_tensor' and 'action_tensor'.
            Presumably torch float tensors of shape [1, 6, 4, 13] and
            [1, 24, 4, nb] respectively — TODO confirm against the caller.

    Returns:
        The network output converted with `.item()`, so the output must hold
        exactly one element.
    """
    cards, actions = state['card_tensor'], state['action_tensor']

    # Inference only: run the forward pass without tracking gradients.
    with torch.no_grad():
        estimate = value_net(cards, actions)
    return estimate.item()
