https://www.youtube.com/watch?v=IS0V8z8HXrM&list=PL-9x0_FO_lgkwi8ES611NsV-cjYaH_nLa&index=2

In [1]:
from YambEnv import ROW, COL, YambEnv, Action
from tensorflow.keras.layers import Dense, Activation, Input
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import tensorflow.keras.backend as K
import numpy as np
from functools import lru_cache

In [2]:
# First, we need to enumerate every array of size 6 whose entries sum to 5.
# Generalized: every way of throwing k balls into n different buckets.
@lru_cache(maxsize=10)
def dp(n, k):
    """Enumerate every way of distributing ``k`` balls over ``n`` buckets.

    :param n: number of buckets, i.e. the length of every returned tuple
    :param k: number of balls, i.e. the sum of every returned tuple; must be >= 0
    :return: set of length-``n`` tuples of non-negative ints that sum to ``k``.
        The set is the very object stored by ``lru_cache``, so treat it as
        read-only; mutating it would corrupt later calls.
    :raises ValueError: if ``k`` is negative (the recursion would otherwise
        never reach its base case)
    """
    if k < 0:
        raise ValueError("k must be non-negative, got {}".format(k))

    if k == 0:
        return {(0,) * n}

    # Every distribution of k balls is a distribution of k-1 balls with one
    # extra ball dropped into some bucket; the set removes duplicates.
    result = set()
    for tup in dp(n, k - 1):
        for i in range(n):
            result.add(tup[:i] + (tup[i] + 1,) + tup[i + 1:])

    return result


In [3]:
# Every possible count array of dice we can keep, sorted in descending order.
COUNT_ARRAYS = sorted(dp(6, 5), reverse=True)

In [45]:
class Agent(object):
    """Policy-gradient (REINFORCE-style) agent for one roll stage of Yamb.

    ``agent_type`` (1, 2 or 3) must equal ``observation["roll_number"]`` of the
    observations the agent is given, and selects the input/output sizes of the
    policy network (see ``_get_dims``).
    """

    def __init__(self, agent_type : int):
        """
        :param agent_type: 1, 2 or 3
        :raises ValueError: if ``agent_type`` is outside 1..3
        """
        # Raise explicitly (not ``assert``) so the check survives ``python -O``.
        if not 1 <= agent_type <= 3:
            raise ValueError("Agent must be of type 1, 2 or 3")
        self.agent_type = agent_type
        self.discount_rate = 0.99
        self.learning_rate = 0.01
        self.input_dim, self.output_dim = self._get_dims()
        # Episode buffers, filled by store_transition and emptied by learn.
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []
        # Index of the output unit sampled by the latest choose_action call;
        # this is the value learn() needs in action_memory.
        self.last_action_index = None
        self.policy, self.predict = self._build_policy_network()
        self.action_space = list(range(self.output_dim))
        self.model_file = 'agent_{}'.format(agent_type)

    def _get_dims(self):
        """Get the dimensions of the input to the network and the output to the network
        :return: input_dim, output_dim
        :raises ValueError: if ``self.agent_type`` is not 1, 2 or 3
        """
        grid_dim = 14 * 4
        announced_dim = 15 # first 14 tell us which row we announced, 15th tells us we did not announce
        roll_input_dim = len(COUNT_ARRAYS)
        roll_output_dim = len(COUNT_ARRAYS)  # will give an array of probabilities. each is mapped to different count array

        if self.agent_type == 1:
            # No announcement in the input; the output spans
            # roll_output_dim * announced_dim units.
            return grid_dim + roll_input_dim, roll_output_dim * announced_dim

        if self.agent_type == 2:
            return grid_dim + roll_input_dim + announced_dim, roll_output_dim

        if self.agent_type == 3:
            return grid_dim + roll_input_dim + announced_dim, grid_dim

        raise ValueError("Agent must be of type 1, 2 or 3")

    @staticmethod
    def _one_hot(index, size):
        """Return a float vector of length ``size`` with a single 1 at ``index``."""
        vector = np.zeros(size)
        vector[index] = 1.0
        return vector

    def _convert_observation_to_input(self, observation : dict):
        """
        :param observation: dictionary which comes from the environment; the keys
            read here are "roll_number", "grid", "roll" and, for agent types 2
            and 3, "announced" / "announced_row"
        :return: numpy array in the appropriate format to be consumed by the model as an input
        :raises ValueError: if the observation belongs to a different roll number,
            or if ``observation["roll"]`` is not one of ``COUNT_ARRAYS``
        """
        if self.agent_type != observation["roll_number"]:
            raise ValueError("Agent type should match roll number")

        # Scale the grid by 1/100 and mark NaN cells with -1.
        grid = np.nan_to_num(observation["grid"].flatten() / 100, nan=-1)
        # One-hot over COUNT_ARRAYS without building a full identity matrix.
        roll_index = COUNT_ARRAYS.index(tuple(observation["roll"]))
        roll = self._one_hot(roll_index, len(COUNT_ARRAYS))

        if self.agent_type == 1:
            return np.hstack([grid, roll])

        # Slot 14 encodes "nothing announced".
        announced_index = observation["announced_row"] if observation["announced"] else 14
        announced = self._one_hot(announced_index, 15)
        return np.hstack([grid, roll, announced])

    def _convert_action_to_output(self, action_index=None):
        """Map a sampled output index to an environment action (not implemented yet).

        :param action_index: index of the policy output unit that was sampled
        :raises NotImplementedError: always, until the mapping is written
        """
        raise NotImplementedError(
            "conversion of policy output index {!r} to an Action is not implemented".format(action_index)
        )

    def _build_policy_network(self):
        """Build the policy network.

        :return: (policy, predict) -- ``policy`` is compiled for training,
            ``predict`` shares the same layers and is used for inference
        """
        inputs = Input(shape=(self.input_dim,))
        hidden = Dense(100, activation='relu')(inputs)
        probs = Dense(self.output_dim, activation='softmax')(hidden)
        policy = Model(inputs=inputs, outputs=probs, name='policy_network_1')
        # The output layer already applies softmax, so the loss must be told it
        # receives probabilities, not logits.
        policy.compile(
            optimizer=Adam(learning_rate=self.learning_rate),
            loss=SparseCategoricalCrossentropy(from_logits=False),
        )
        predict = Model(inputs=inputs, outputs=probs)
        return policy, predict

    def choose_action(self, observation):
        """
        :param observation: observation from the environment -> needs to convert to correct format before network uses
        :return: needs to return an Action
        :raises NotImplementedError: until ``_convert_action_to_output`` is
            written; the sampled index is still available afterwards as
            ``self.last_action_index``
        """
        # The network expects a batch axis: shape (1, input_dim).
        state = self._convert_observation_to_input(observation)[np.newaxis, :]
        # Renormalise in float64: float32 softmax output can miss
        # np.random.choice's "probabilities sum to 1" tolerance.
        probabilities = np.asarray(self.predict.predict(state)[0], dtype=np.float64)
        probabilities /= probabilities.sum()
        action_index = int(np.random.choice(self.action_space, p=probabilities))
        self.last_action_index = action_index
        return self._convert_action_to_output(action_index)

    def store_transition(self, observation, action, reward):
        """Record one step of the current episode.

        :param observation: environment observation dict (converted to the
            network input vector here) or an already converted vector
        :param action: stored as given; ``learn`` requires the integer index of
            the sampled policy output (see ``self.last_action_index``)
        :param reward: reward received for the step
        """
        if isinstance(observation, dict):
            observation = self._convert_observation_to_input(observation)
        self.state_memory.append(observation)
        self.action_memory.append(action)
        self.reward_memory.append(reward)

    @staticmethod
    def _discounted_returns(rewards, discount_rate):
        """Return G[t] = sum_{k >= t} discount_rate**(k - t) * rewards[k] as a float array."""
        # Force float so integer rewards do not truncate the discounted sums.
        rewards = np.asarray(rewards, dtype=np.float64)
        returns = np.zeros_like(rewards)
        running = 0.0
        # One backward pass: G[t] = r[t] + discount_rate * G[t + 1].
        for t in range(len(rewards) - 1, -1, -1):
            running = rewards[t] + discount_rate * running
            returns[t] = running
        return returns

    def learn(self):
        """Run one policy-gradient update on the stored episode and clear the buffers.

        The standardised discounted returns are passed as ``sample_weight`` to
        the sparse cross-entropy loss, so each step contributes
        ``-G[t] * log pi(a_t | s_t)``.

        :return: the value returned by ``train_on_batch``, or None when no
            transition has been stored
        """
        if not self.reward_memory:
            return None

        state_memory = np.asarray(self.state_memory)
        # Sparse cross-entropy takes integer class labels, not one-hot rows.
        action_memory = np.asarray(self.action_memory, dtype=np.int64)

        returns = self._discounted_returns(self.reward_memory, self.discount_rate)
        std = returns.std()
        self.G = (returns - returns.mean()) / (std if std > 0 else 1)

        cost = self.policy.train_on_batch(state_memory, action_memory, sample_weight=self.G)

        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []

        return cost

    def save_model(self):
        """Save the policy network to ``self.model_file``."""
        # NOTE(review): Keras 3 only accepts paths ending in ".keras" or ".h5";
        # confirm the extension-less name against the installed version.
        self.policy.save(self.model_file)

    def load_model(self):
        """Load the policy network from ``self.model_file``."""
        self.policy = load_model(self.model_file)
        # Keep inference on the freshly loaded weights instead of the old network.
        self.predict = self.policy

In [5]:
# env = YambEnv()
# n_episodes = 1
# for i in range(n_episodes):
#     observation = env.reset()
#     truncated, terminated = False, False
#     score = 0
#     while not(terminated or truncated):
#         action = agent.choose_action(observation)
#         observation_new, reward, terminated, truncated, truncation_reason = env.step(action)
#         agent.store_transition(observation, action, reward)
#         observation = observation_new
#         score += reward
        
#     score_history.append(score)
#     agent.learn()
#     print("Episode: {}, Score: {}, Average score: {}".format(i, score, sum(score_history[-100:]) / 100.0))
        
    

In [34]:
# Create an environment and keep whatever reset() returns as the current observation.
env = YambEnv()
observation = env.reset()

array([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0