The cell below adds the parent directory to `sys.path` so that the `yamb` package can be imported properly from this notebook.

In [1]:
import sys
import os

# Absolute path of the directory one level above the current working directory.
parent_dir = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir)
)

# Put it on the module search path, but only once, so that re-running this
# cell does not pile up duplicate entries.
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)

# All about that space?

I think any observation space can be used, but the action space cannot be a `Dict` or a `Tuple`. A `Box` space might be the better choice, as there seem to be more algorithms that support it, though `MultiDiscrete` seems easier to describe.

In [2]:
import numpy as np
from gymnasium import spaces

In [3]:
# Fields of an action and the roll number / action type each one applies to:
#   keep: np.array             - action type 1 and 2, array of length 6 saying which dice we keep
#   announce: bool = False     - roll_number / action type 1
#   announce_row: ROW = ROW.YAMB   - roll_number / action type 1
#   row_to_fill: ROW = ROW.YAMB    - roll_number / action type 3
#   col_to_fill: COL = COL.DOLJE   - roll_number / action type 3
#
# The action space can't be a tuple or dictionary, which is tricky, so every
# field is flattened into one integer vector with per-component bounds.

# One component per die face (ones..sixes), each bounded to 0-5 inclusive.
number_of_ones_to_keep_range = dict(low=0, high=5)
number_of_twos_to_keep_range = dict(low=0, high=5)
number_of_threes_to_keep_range = dict(low=0, high=5)
number_of_fours_to_keep_range = dict(low=0, high=5)
number_of_fives_to_keep_range = dict(low=0, high=5)
number_of_sixes_to_keep_range = dict(low=0, high=5)

# Remaining components: announce flag, announced row, target row, target column.
announce_range = dict(low=0, high=1)
announce_row_range = dict(low=0, high=13)
row_to_fill_range = dict(low=0, high=13)
col_to_fill_range = dict(low=0, high=3)

# The order here fixes the position of each component in the action vector.
_component_ranges = (
    number_of_ones_to_keep_range,
    number_of_twos_to_keep_range,
    number_of_threes_to_keep_range,
    number_of_fours_to_keep_range,
    number_of_fives_to_keep_range,
    number_of_sixes_to_keep_range,
    announce_range,
    announce_row_range,
    row_to_fill_range,
    col_to_fill_range,
)

low = np.array([bounds["low"] for bounds in _component_ranges])
high = np.array([bounds["high"] for bounds in _component_ranges])

# Integer-valued Box: one sample is a length-10 integer vector within [low, high].
action_space = spaces.Box(low=low, high=high, dtype=int)
action_space.sample()

array([4, 0, 4, 3, 3, 2, 0, 9, 7, 3])

In [4]:
# The same ten components expressed as a MultiDiscrete space. Each entry is the
# number of choices for that component: six die-face counts (6 values each),
# then the announce flag (2), announced row (14), row to fill (14) and
# column to fill (4).
_action_nvec = np.array([6] * 6 + [2, 14, 14, 4])
action_space = spaces.MultiDiscrete(_action_nvec)
action_space.sample()

array([1, 0, 2, 3, 4, 2, 0, 2, 4, 0], dtype=int64)

In [5]:
# Per-component sampling masks for the MultiDiscrete action space: one int8
# array per component, with 1 marking a value that may be sampled and 0 a
# value that may not.

# Every die-face component is restricted to the values 0, 1 or 2.
_FACE_MASK_TEMPLATE = [1, 1, 1, 0, 0, 0]
num1s, num2s, num3s, num4s, num5s, num6s = (
    np.array(_FACE_MASK_TEMPLATE, dtype=np.int8) for _ in range(6)
)

# Announce flag: only value 0 is allowed.
announce = np.array([1, 0], dtype=np.int8)

# Announced row: all 14 values are allowed.
announce_row = np.ones(14, dtype=np.int8)

# Row to fill: only index 0 is allowed.
row_to_fill = np.zeros(14, dtype=np.int8)
row_to_fill[0] = 1

# Column to fill: only index 0 is allowed.
col_to_fill = np.zeros(4, dtype=np.int8)
col_to_fill[0] = 1

# Mask order must match the component order of the action space.
mask = (
    num1s,
    num2s,
    num3s,
    num4s,
    num5s,
    num6s,
    announce,
    announce_row,
    row_to_fill,
    col_to_fill,
)
action_space.sample(mask=mask)

array([ 1,  2,  2,  2,  2,  1,  0, 10,  0,  0], dtype=int64)

In [6]:
# Candidate observation space, sampled once to inspect what an observation
# looks like.
_observation_space = spaces.Dict(
    {
        # 14 * 3 = 42 possible turn indices, starting at 0.
        "turn_number": spaces.Discrete(14 * 3, start=0),
        # Three possible roll indices per turn: 0, 1, 2.
        "roll_number": spaces.Discrete(3, start=0),
        # 14 x 3 integer grid with every cell bounded to [-145, 145].
        "grid": spaces.Box(low=-145, high=145, shape=(14, 3), dtype=int),
        # Six integers in [0, 5]; presumably the number of dice showing each
        # face -- TODO confirm against YambEnv.
        "roll": spaces.Box(low=0, high=5, shape=(6,), dtype=int),
        # Binary flag plus the index (0-13) of the announced row.
        "announced": spaces.Discrete(2, start=0),
        "announced_row": spaces.Discrete(14, start=0),
    }
)
_observation_space.sample()
        

OrderedDict([('announced', 0),
             ('announced_row', 13),
             ('grid',
              array([[ 133,  -99,   15],
                     [  15,  -66,  -10],
                     [ 114, -133,   10],
                     [  92,  -30,   33],
                     [ -82,    4,   95],
                     [  37,  140,  100],
                     [-119,  -30,  -14],
                     [  79,  -77,  -74],
                     [-114,   75,   67],
                     [  52, -124,   55],
                     [ -78,   99,    8],
                     [  39,  -37,   54],
                     [ 108,  -47,   86],
                     [ -61,  -70, -105]])),
             ('roll', array([3, 1, 0, 2, 4, 4])),
             ('roll_number', 2),
             ('turn_number', 35)])

# Is el GPU available?

In [7]:
import torch
# Ask PyTorch whether CUDA is currently available and print the answer.
gpu_is_available = torch.cuda.is_available()
print(gpu_is_available)

False


# Reward shaping

In [8]:
from gymnasium import ObservationWrapper, RewardWrapper
from yamb import YambEnv

class AddStepsToReward(RewardWrapper):
    """Reward wrapper that adds a step-count bonus to the environment reward.

    The shaped reward is ``100 * steps + reward / 1000.0``, where ``steps`` is
    ``turn_number * 3 + roll_number`` read from the unwrapped environment.
    """

    def __init__(self, env):
        """Wrap ``env``; no extra state is kept beyond what RewardWrapper stores."""
        super().__init__(env)

    def reward(self, reward):
        """Return the shaped reward for the reward produced by the wrapped env.

        Args:
            reward: the reward returned by the wrapped environment's step.

        Returns:
            ``100 * (turn_number * 3 + roll_number) + reward / 1000.0``.
        """
        base_env = self.unwrapped
        # Combine the turn and roll counters into a single step index.
        steps_taken = base_env.turn_number * 3 + base_env.roll_number
        # The step term dominates; the original reward is scaled down by 1000.
        return 100 * steps_taken + reward / 1000.0
    
# Build the base Yamb environment and wrap it so that its rewards pass
# through AddStepsToReward.
env = AddStepsToReward(YambEnv())

