In [1]:
# https://github.com/Kaggle/kaggle-environments/blob/master/kaggle_environments/envs/hungry_geese/hungry_geese.py

In [7]:
from kaggle_environments.envs.hungry_geese.hungry_geese import Observation, Configuration, Action, \
                                                                row_col, adjacent_positions, translate, min_distance

from kaggle_environments import make
from random import choice
import numpy as np
from copy import deepcopy
import pickle

In [49]:
import numpy as np
import random
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

from collections import deque

In [35]:
class GreedyAgent:
    def __init__(self):
        
        self.last_action = None
        self.observations = []

    def __call__(self, observation: Observation, configuration: Configuration):
        self.configuration = configuration
        
        board = np.zeros(self.configuration.rows*self.configuration.columns)
        board_shape = (self.configuration.rows, self.configuration.columns)
        
        board_heads = deepcopy(board)
        board_bodies = deepcopy(board)
        board_rewards = deepcopy(board)
        
        
        rows, columns = self.configuration.rows, self.configuration.columns

        food = observation.food
        geese = observation.geese
        
        
        opponents = [
            goose
            for index, goose in enumerate(geese)
            if index != observation.index and len(goose) > 0
        ]

        
        opponent_heads = [opponent[0] for opponent in opponents]
        # Don't move adjacent to any heads
        head_adjacent_positions = {
            opponent_head_adjacent
            for opponent_head in opponent_heads
            for opponent_head_adjacent in adjacent_positions(opponent_head, columns, rows)
        }
        
        tail_adjacent_positions ={
            opponent_tail_adjacent
            for opponent in opponents
            for opponent_tail in [opponent[-1]]
            for opponent_tail_adjacent in adjacent_positions(opponent_tail, columns, rows)
        }
        # Don't move into any bodies
        #bodies, heads = [position for goose in geese for position in goose]
        
        heads = [i[0] for i in geese if len(i)>1]
        bodies = [item for sublist in geese for item in sublist]
        
        board_bodies[list(bodies)] = 1
        board_heads[heads] = 1

        # Move to the closest food
        position = geese[observation.index][0]
        actions = {
            action: min_distance(new_position, food, columns)
            for action in Action
            for new_position in [translate(position, action, columns, rows)]
            if (
                new_position not in head_adjacent_positions and
                new_position not in bodies and
                (self.last_action is None or action != self.last_action.opposite())
            )
        }

        action = min(actions, key=actions.get) if any(actions) else choice([action for action in Action])
        
        """
        Want actions to be straight, left, or right based on last action
        """
        straight = self.last_action
        if last_action.name = "West":
            left = Action.South
            right = Action.North
        
        elif last_action.name = "North":
            left = Action.West
            right = Action.East
        
        elif last_action.name = "East":
            left = Action.North
            right = Action.South
        
        elif last_action.name = "South":
            left = Action.West
            right = Action.East
        
        cur_obs = {}
        cur_obs['head_adjacent_positions'] = head_adjacent_positions
        cur_obs['bodies'] = bodies
        cur_obs['board_bodies'] = board_bodies.reshape(board_shape)
        cur_obs['board_heads'] = board_heads.reshape(board_shape)
        cur_obs['tails'] = tail_adjacent_positions
        cur_obs['actions'] = actions
        cur_obs['action'] = action
        cur_obs['last_action'] = self.last_action
#         cur_obs['goose_size'] = player_goose_len
#         cur_obs['board'] = board
        cur_obs['cur_action'] = action
        self.observations.append(cur_obs)
        
        self.last_action = action
        return action.name


cached_greedy_agents = {}


def greedy_agent(obs, config):
    index = obs["index"]
    if index not in cached_greedy_agents:
        cached_greedy_agents[index] = GreedyAgent(Configuration(config))
    return cached_greedy_agents[index](Observation(obs))

In [171]:
class StateTranslator:
    def __init__(self):
        
        self.last_action = None
        self.observations = []
        
    def set_last_action(self, last_action):
            
        self.last_action = last_action

    def get_state(self, observation, configuration):
        self.configuration = configuration
        
        board = np.zeros(self.configuration.rows*self.configuration.columns)
        board_shape = (self.configuration.rows, self.configuration.columns)
        
        board_heads = deepcopy(board)
        board_bodies = deepcopy(board)
        board_rewards = deepcopy(board)
        
        rows, columns = self.configuration.rows, self.configuration.columns

        food = observation.food
        geese = observation.geese
        
        my_goose = geese[observation['index']]
                             
        
        opponents = [
            goose
            for index, goose in enumerate(geese)
            if index != observation.index and len(goose) > 0
        ]

        
        opponent_heads = [opponent[0] for opponent in opponents]
        # Don't move adjacent to any heads
        head_adjacent_positions = {
            opponent_head_adjacent
            for opponent_head in opponent_heads
            for opponent_head_adjacent in adjacent_positions(opponent_head, columns, rows)
        }
        
        tail_adjacent_positions ={
            opponent_tail_adjacent
            for opponent in opponents
            for opponent_tail in [opponent[-1]]
            for opponent_tail_adjacent in adjacent_positions(opponent_tail, columns, rows)
        }
        # Don't move into any bodies
        #bodies, heads = [position for goose in geese for position in goose]
        
        heads = [i[0] for i in geese if len(i)>1]
        bodies = [item for sublist in geese for item in sublist]
        
        
        board_bodies[list(bodies)] = 1
        board_heads[heads] = 1
        board_rewards[food] = 1

        
        """
        Want actions to be straight, left, or right based on last action
        """
        
        cur_obs = {}
        cur_obs['head_adjacent_positions'] = head_adjacent_positions
        cur_obs['bodies'] = bodies
        cur_obs['board_bodies'] = board_bodies.reshape(board_shape)
        cur_obs['board_heads'] = board_heads.reshape(board_shape)
        cur_obs['board_rewards'] = board_rewards.reshape(board_shape)
        cur_obs['tails'] = tail_adjacent_positions
        cur_obs['last_action'] = self.last_action

        self.observations.append(cur_obs)
        
        state = np.array([])
        state = np.append(state, my_goose)
        state = np.append(state, board_bodies)
        state = np.append(state, board_heads)
        state = np.append(state, board_rewards)
        
        return state.flatten()
    
    def translate_action_to_class(self, action):
        """
        inputs are 0, 1, 2
        """
        last_action = self.last_action
        
        action_trans = None
        straight = last_action
        
        if last_action == "WEST":
            left = Action.South
            right = Action.North
        
        elif last_action == "NORTH":
            left = Action.West
            right = Action.East
        
        elif last_action == "EAST":
            left = Action.North
            right = Action.South
        
        elif last_action == "SOUTH":
            left = Action.West
            right = Action.East

        # Convert the action given into straight, left or right
        if action ==0:
            action_trans = straight
        
        elif action == 1:
            action_trans = left
            action_trans = right
        
        return action_trans

    def translate_action_to_text(self, action):
        """
        Given an Action.whatever, return the string version as this is required for env.step 
        """
        h = {Action.North: 'NORTH',
             Action.South: 'SOUTH',
             Action.West: 'WEST',
             Action.East: 'EAST'}
        
        return h[action]

In [206]:
class dqnAgent:
    """
    Given an environment state, choose an action, and learn from the reward
    https://towardsdatascience.com/reinforcement-learning-w-keras-openai-dqns-1eed3a5338c
    https://towardsdatascience.com/deep-q-learning-tutorial-mindqn-2a4c855abffc
    https://www.researchgate.net/post/What-are-possible-reasons-why-Q-loss-is-not-converging-in-Deep-Q-Learning-algorithm
    """

    def __init__(self, model=None, epsilon = 1.0, epsilon_min = 0.05, frames_per_step=4):
        self.env = env

        self.StateTrans = StateTranslator()
        self.state_shape  = 1
        print('my state shape is:', self.state_shape)
        self.memory  = deque(maxlen=2000)
        self.gamma = 0.95
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = 0.990
        self.learning_rate = 0.001
        self.tau = .125

        if model == None:
            self.model = self.create_model()
        else:
            self.model = model
        self.target_model = self.create_model()

    def create_model(self):
        model   = Sequential()
        model.add(Dense(80, input_dim=self.state_shape, activation="relu"))
        model.add(Dense(48, activation="relu"))
        model.add(Dense(24, activation="relu"))
        model.add(Dense(12, activation="relu"))
        model.add(Dense(3))
        model.compile(loss="MSE",
            optimizer=Adam(lr=self.learning_rate))
        return model

    def act(self, state):
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.random() < self.epsilon:
            return random.choice(self.env.action_space)

        action_values = self.model.predict(state.reshape(-1, self.state_shape))[0]
        action = np.argmax(action_values)

        return action
    
    def translate_state(self, observation, configuration):
        state = self.StateTrans.get_state(observation, configuration)
        return state

    def __call__(self, observation, configuration):
        
        state = self.translate_state(observation, configuration)
        action = self.act(state)
        # State translator will take in 0, 1, 2 and return straight, left or right, which in turn will 
        # be translated into a kaggle Action
        action_class = self.StateTrans.translate_action_to_class(action)
        
        #Store our last action in our state translator for future reference
        self.StateTranslator.set_last_action(action_class)
        
        return action
    
    def remember(self, state, action, reward, new_state, done):
        self.memory.append([state, action, reward, new_state, done])

    def replay(self):
        batch_size = 32
        if len(self.memory) < batch_size:
            return

        samples = random.sample(self.memory, batch_size)
        ########################
        # This can be sped up significantly, but processing all samples in batch rather than 1 at a time
        ####################
        for sample in samples:
            state, action, reward, new_state, done = sample
            target = self.target_model.predict(state.reshape(-1, self.state_shape))
            if done:
                target[0][action] = reward
            else:
                Q_future = max(self.target_model.predict(new_state.reshape(-1, self.state_shape))[0])
                target[0][action] = reward + Q_future * self.gamma
            self.model.fit(state.reshape(-1, self.state_shape), target, epochs=1, verbose=0)

    def target_train(self):
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = weights[i] * self.tau + target_weights[i] * (1 - self.tau)
        self.target_model.set_weights(target_weights)

    def save_model(self, fn):
        self.model.save(fn)


### Testing some env stuff

In [207]:
env = make("hungry_geese", debug=True)

In [208]:
env.reset(num_agents=4)

[{'action': 'NORTH',
  'reward': 0,
  'info': {},
  'observation': {'remainingOverageTime': 60,
   'step': 0,
   'geese': [[13], [68], [22], [24]],
   'food': [8, 35],
   'index': 0},
  'status': 'ACTIVE'},
 {'action': 'NORTH',
  'reward': 0,
  'info': {},
  'observation': {'remainingOverageTime': 60, 'index': 1},
  'status': 'ACTIVE'},
 {'action': 'NORTH',
  'reward': 0,
  'info': {},
  'observation': {'remainingOverageTime': 60, 'index': 2},
  'status': 'ACTIVE'},
 {'action': 'NORTH',
  'reward': 0,
  'info': {},
  'observation': {'remainingOverageTime': 60, 'index': 3},
  'status': 'ACTIVE'}]

In [209]:
#env.step(['NORTH', 'SOUTH','NORTH', 'SOUTH'])
state_dict = env.step(['WEST', 'SOUTH', 'NORTH', 'SOUTH'])

#Each step returns a state dict that includes an action, reward, info, and observation (where the food and geese are stored)

In [210]:
state_dict

[{'action': 'WEST',
  'reward': 201,
  'info': {},
  'observation': {'remainingOverageTime': 60,
   'step': 1,
   'geese': [[12], [2], [11], [35, 24]],
   'food': [8, 45],
   'index': 0},
  'status': 'ACTIVE'},
 {'action': 'SOUTH',
  'reward': 201,
  'info': {},
  'observation': {'remainingOverageTime': 60, 'index': 1},
  'status': 'ACTIVE'},
 {'action': 'NORTH',
  'reward': 201,
  'info': {},
  'observation': {'remainingOverageTime': 60, 'index': 2},
  'status': 'ACTIVE'},
 {'action': 'SOUTH',
  'reward': 202,
  'info': {},
  'observation': {'remainingOverageTime': 60, 'index': 3},
  'status': 'ACTIVE'}]

In [211]:
env.render(mode="ipython")

In [212]:
results = env.run([my_agent, GreedyAgent(), GreedyAgent(), GreedyAgent()])

Opposite action: (1, <Action.NORTH: 1>, <Action.SOUTH: 3>)
Opposite action: (2, <Action.SOUTH: 3>, <Action.NORTH: 1>)
Body Hit: (0, <Action.SOUTH: 3>, 51, [40, 41, 42, 31, 20, 19, 30, 29, 28, 39, 50, 51, 62, 73])


In [213]:
env.render(mode="ipython")

### Test StateTranslator

In [214]:
st_test = StateTranslator()

In [215]:
env = make("hungry_geese", debug=True)
config = env.configuration

state_dict = env.reset(num_agents=4)[0]
observation = state_dict['observation']
action = state_dict['action']


In [216]:
action

'NORTH'

In [217]:
st_test.set_last_action(action)

In [218]:
state_test = st_test.get_state(observation, config)
state_test

array([62.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0

In [219]:
assert len(state_test) == 11*7*3 + 1

### Its so fucking annoying. I need the agents to return Action.Whatever when called, I need to use a string when using env.step, and the output of my neural net is naturally 0, 1, 2, and the action returned by step is a string

#### I need to revisit how I am storing my states for future. I may need to include a last action item so that the dqn is able to understand what the 0, 1, 2 are for

In [220]:
st_test.translate_action_to_class(0)

AttributeError: 'str' object has no attribute 'name'

### Training script

In [221]:
steps_per_ep = 200
num_episodes = 1000



dqn = dqnAgent()
agent2 = GreedyAgent()
agent3 = GreedyAgent()
agent4 = GreedyAgent()

agents = [dqn, agent2, agent3, agent4]

results_dic = {}
for ep in range(num_episodes):
    state_dict = env.reset(num_agents=4)[0]
    observation = state_dict['observation']
    reward = state_dict['reward']
    action = state_dict['action']
    done = state_dict['status']
    
    cur_state = dqn.StateTrans.get_state(observation, config)
    
    dqn.StateTrans.set_last_action(action)

    for step in range(steps_per_ep):
        actions = []
        for agent in agents:
            action = agent(observation, config)
            action_trans = dqn.StateTrans.translate_action_to_text(action)
            actions.append(action_trans)
        
        state_dict = env.step(actions)
        observation = state_dict['observation']
        reward = state_dict['reward']
        action = state_dict['action']
        done = state_dict['status']
        new_state = dqn.StateTrans.get_state(observation, config)
        
        """
        Required changes:
        1) action needs to be recorded as 0, 1, 2
        2) Need a state input to represent last action taken ( one hot vector of 4 for each cardinal dir). 
        This changes the meaning of what 0, 1, 2 actually do since they represent straight, left, right.
        """
        dqn.remember(cur_state, action, reward, new_state, done)

        cur_state = new_state
        
        dqn.replay()        
        dqn.target_train()
        
        if step%50 == 0:
            directory = "first_agent"
            dqn_agent.save_model(direct + f"/trial-{ep}")
        
        if done:
            print('Done, Step: ', step)
            results_dic[ep] = reward
            directory = "first_agent"
            dqn_agent.save_model(direct + f"/trial-{ep}")
            with open(direct + "/results_dic.pkl", 'wb') as f:
                pickle.dump(results_dic, f)
            break

my state shape is: 1


AttributeError: 'Environment' object has no attribute 'action_space'