In [1]:
# https://github.com/Kaggle/kaggle-environments/blob/master/kaggle_environments/envs/hungry_geese/hungry_geese.py

In [2]:
from kaggle_environments.envs.hungry_geese.hungry_geese import Observation, Configuration, Action, \
                                                                row_col, adjacent_positions, translate, min_distance

from kaggle_environments import make
from random import choice
import numpy as np
from copy import deepcopy
import pickle

Loading environment football failed: No module named 'gfootball'


In [3]:
import numpy as np
import random
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

from collections import deque

In [4]:
class GreedyAgent:
    def __init__(self):
        
        self.last_action = None
        self.observations = []

    def __call__(self, observation: Observation, configuration: Configuration):
        self.configuration = configuration
        
        board = np.zeros(self.configuration.rows*self.configuration.columns)
        board_shape = (self.configuration.rows, self.configuration.columns)
        
        board_heads = deepcopy(board)
        board_bodies = deepcopy(board)
        board_rewards = deepcopy(board)
        
        
        rows, columns = self.configuration.rows, self.configuration.columns

        food = observation.food
        geese = observation.geese
        
        
        opponents = [
            goose
            for index, goose in enumerate(geese)
            if index != observation.index and len(goose) > 0
        ]

        
        opponent_heads = [opponent[0] for opponent in opponents]
        # Don't move adjacent to any heads
        head_adjacent_positions = {
            opponent_head_adjacent
            for opponent_head in opponent_heads
            for opponent_head_adjacent in adjacent_positions(opponent_head, columns, rows)
        }
        
        tail_adjacent_positions ={
            opponent_tail_adjacent
            for opponent in opponents
            for opponent_tail in [opponent[-1]]
            for opponent_tail_adjacent in adjacent_positions(opponent_tail, columns, rows)
        }
        # Don't move into any bodies
        #bodies, heads = [position for goose in geese for position in goose]
        
        heads = [i[0] for i in geese if len(i)>1]
        bodies = [item for sublist in geese for item in sublist]
        
        board_bodies[list(bodies)] = 1
        board_heads[heads] = 1

        # Move to the closest food
        position = geese[observation.index][0]
        actions = {
            action: min_distance(new_position, food, columns)
            for action in Action
            for new_position in [translate(position, action, columns, rows)]
            if (
                new_position not in head_adjacent_positions and
                new_position not in bodies and
                (self.last_action is None or action != self.last_action.opposite())
            )
        }

        action = min(actions, key=actions.get) if any(actions) else choice([action for action in Action])
        
        """
        Want actions to be straight, left, or right based on last action
        """
        
        cur_obs = {}
        cur_obs['head_adjacent_positions'] = head_adjacent_positions
        cur_obs['bodies'] = bodies
        cur_obs['board_bodies'] = board_bodies.reshape(board_shape)
        cur_obs['board_heads'] = board_heads.reshape(board_shape)
        cur_obs['tails'] = tail_adjacent_positions
        cur_obs['actions'] = actions
        cur_obs['action'] = action
        cur_obs['last_action'] = self.last_action
#         cur_obs['goose_size'] = player_goose_len
#         cur_obs['board'] = board
        cur_obs['cur_action'] = action
        self.observations.append(cur_obs)
        
        self.last_action = action
        return action.name


cached_greedy_agents = {}


def greedy_agent(obs, config):
    index = obs["index"]
    if index not in cached_greedy_agents:
        cached_greedy_agents[index] = GreedyAgent(Configuration(config))
    return cached_greedy_agents[index](Observation(obs))

In [5]:
class StateTranslator:
    def __init__(self):
        
        self.last_action = None
        self.observations = []
        
    def set_last_action(self, last_action):
        self.last_action = last_action
        
    def __get_last_action_vec(self):
        action_vec = np.zeros(4)
        
        if self.last_action == 'NORTH':
            action_vec[0] = 1
        elif self.last_action == 'SOUTH':
            action_vec[1] == 1
        elif self.last_action == 'EAST':
            action_vec[2] == 1
        elif self.last_action == 'WEST':
            action_vec[3] == 1
        
        return action_vec

    def get_state(self, observation, configuration):
        self.configuration = configuration
        
        board = np.zeros(self.configuration.rows*self.configuration.columns)
        board_shape = (self.configuration.rows, self.configuration.columns)
        
        board_heads = deepcopy(board)
        board_bodies = deepcopy(board)
        board_rewards = deepcopy(board)
        
        rows, columns = self.configuration.rows, self.configuration.columns

        food = observation.food
        geese = observation.geese
        
        my_goose = np.array(geese[observation['index']][0])
                             
        
        opponents = [
            goose
            for index, goose in enumerate(geese)
            if index != observation.index and len(goose) > 0
        ]

        
        opponent_heads = [opponent[0] for opponent in opponents]
        # Don't move adjacent to any heads
        head_adjacent_positions = {
            opponent_head_adjacent
            for opponent_head in opponent_heads
            for opponent_head_adjacent in adjacent_positions(opponent_head, columns, rows)
        }
        
        tail_adjacent_positions ={
            opponent_tail_adjacent
            for opponent in opponents
            for opponent_tail in [opponent[-1]]
            for opponent_tail_adjacent in adjacent_positions(opponent_tail, columns, rows)
        }
        # Don't move into any bodies
        #bodies, heads = [position for goose in geese for position in goose]
        
        heads = [i[0] for i in geese if len(i)>1]
        bodies = [item for sublist in geese for item in sublist]
        
        
        board_bodies[list(bodies)] = 1
        board_heads[heads] = 1
        board_rewards[food] = 1

        
        """
        Want actions to be straight, left, or right based on last action
        """
        
        cur_obs = {}
        cur_obs['head_adjacent_positions'] = head_adjacent_positions
        cur_obs['bodies'] = bodies
        cur_obs['board_bodies'] = board_bodies.reshape(board_shape)
        cur_obs['board_heads'] = board_heads.reshape(board_shape)
        cur_obs['board_rewards'] = board_rewards.reshape(board_shape)
        cur_obs['tails'] = tail_adjacent_positions
        cur_obs['last_action'] = self.last_action

        self.observations.append(cur_obs)
        
        state = np.array([])
        state = np.append(state, self.__get_last_action_vec())
        state = np.append(state, my_goose/(7*11)) # Scale this down 
        state = np.append(state, board_bodies)
        state = np.append(state, board_heads)
        state = np.append(state, board_rewards)
        
        return state.flatten()
    
    def translate_action_to_class(self, action):
        """
        inputs are 0, 1, 2
        """
        last_action = self.last_action
        
        action_trans = None
        
        if last_action == "WEST":
            left = Action.SOUTH
            right = Action.NORTH
            straight = Action.WEST
            
        elif last_action == "NORTH":
            left = Action.WEST
            right = Action.EAST
            straight = Action.NORTH
        
        elif last_action == "EAST":
            left = Action.NORTH
            right = Action.SOUTH
            straight = Action.EAST
        
        elif last_action == "SOUTH":
            left = Action.WEST
            right = Action.EAST
            straight = Action.SOUTH

        # Convert the action given into straight, left or right
        if action == 0:
            action_trans = straight
        
        elif action == 1:
            action_trans = left
        
        elif action == 2:
            action_trans = right
        
        return action_trans

    def translate_action_to_text(self, action):
        """
        Given an Action.whatever, return the string version as this is required for env.step 
        """
        h = {Action.NORTH: 'NORTH',
             Action.SOUTH: 'SOUTH',
             Action.WEST: 'WEST',
             Action.EAST: 'EAST'}
        
        return h[action]
    
    def translate_text_to_int(self, action):
        
        last_action = self.last_action
        
        h = {'straight': 0,
             'left': 1,
             'right': 2}
        
        val = None
        if action == last_action:
            val = h['straight']
        ##############
        elif action == 'WEST' and last_action == 'SOUTH':
            val = h['right']
        
        elif action == 'WEST' and last_action == 'NORTH':
            val = h['left']
        ###################
        elif action == 'EAST' and last_action == 'SOUTH':
            val = h['left']
        
        elif action == 'EAST' and last_action == 'NORTH':
            val = h['right']
        ##################
        elif action == 'NORTH' and last_action == 'EAST':
            val = h['left']
        
        elif action == 'NORTH' and last_action == 'WEST':
            val = h['right']
            
        #####
        elif action == 'SOUTH' and last_action == 'WEST':
            val = h['left']
            
        elif action == 'SOUTH' and last_action == 'EAST':
            val = h['right']
        
        return val

In [6]:
class dqnAgent:
    """
    Given an environment state, choose an action, and learn from the reward
    https://towardsdatascience.com/reinforcement-learning-w-keras-openai-dqns-1eed3a5338c
    https://towardsdatascience.com/deep-q-learning-tutorial-mindqn-2a4c855abffc
    https://www.researchgate.net/post/What-are-possible-reasons-why-Q-loss-is-not-converging-in-Deep-Q-Learning-algorithm
    """

    def __init__(self, model=None, epsilon = 1.0, epsilon_min = 0.05, frames_per_step=4):

        self.StateTrans = StateTranslator()
        self.state_shape  = 11*7*3 + 1 + 4 # 3 one hot boards, plus player head, plus last direction vec
        print('my state shape is:', self.state_shape)
        self.memory  = deque(maxlen=2000)
        self.gamma = 0.95
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = 0.990
        self.learning_rate = 0.001
        self.tau = .125

        if model == None:
            self.model = self.create_model()
        else:
            self.model = model
        self.target_model = self.create_model()

    def create_model(self):
        model   = Sequential()
        model.add(Dense(100, input_dim=self.state_shape, activation="relu"))
        model.add(Dense(50, activation="relu"))
        model.add(Dense(24, activation="relu"))
        model.add(Dense(12, activation="relu"))
        model.add(Dense(3))
        model.compile(loss="MSE",
            optimizer=Adam(lr=self.learning_rate))
        return model

    def act(self, state):
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.random() < self.epsilon:
            return random.choice([0,1,2])

        action_values = self.model.predict(state.reshape(-1, self.state_shape))[0]
        action = np.argmax(action_values)

        return action
    
    def translate_state(self, observation, configuration):
        state = self.StateTrans.get_state(observation, configuration)
        return state

    def __call__(self, observation, configuration):
        
        state = self.translate_state(observation, configuration)
        action = self.act(state)
        # State translator will take in 0, 1, 2 and return straight, left or right, which in turn will 
        # be translated into a kaggle Action
        action_class = self.StateTrans.translate_action_to_class(action)
        
        #Store our last action in our state translator for future reference
        action_text = self.StateTrans.translate_action_to_text(action_class)
        self.StateTrans.set_last_action(action_text)
        
        return action_text
    
    def remember(self, state, action, reward, new_state, done):
        self.memory.append([state, action, reward, new_state, done])

    def replay(self):
        batch_size = 32
        if len(self.memory) < batch_size:
            return

        samples = random.sample(self.memory, batch_size)
        ########################
        # This can be sped up significantly, but processing all samples in batch rather than 1 at a time
        ####################
        for sample in samples:
            state, action, reward, new_state, done = sample
            target = self.target_model.predict(state.reshape(-1, self.state_shape))
            if done:
                target[0][action] = reward
            else:
                Q_future = max(self.target_model.predict(new_state.reshape(-1, self.state_shape))[0])
                target[0][action] = reward + Q_future * self.gamma
            self.model.fit(state.reshape(-1, self.state_shape), target, epochs=1, verbose=0)

    def target_train(self):
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = weights[i] * self.tau + target_weights[i] * (1 - self.tau)
        self.target_model.set_weights(target_weights)

    def save_model(self, fn):
        self.model.save(fn)


### Testing some env stuff

In [7]:
env = make("hungry_geese", debug=True)

In [8]:
env.reset(num_agents=4)

[{'action': 'NORTH',
  'reward': 0,
  'info': {},
  'observation': {'remainingOverageTime': 60,
   'step': 0,
   'geese': [[7], [59], [55], [51]],
   'food': [10, 46],
   'index': 0},
  'status': 'ACTIVE'},
 {'action': 'NORTH',
  'reward': 0,
  'info': {},
  'observation': {'remainingOverageTime': 60, 'index': 1},
  'status': 'ACTIVE'},
 {'action': 'NORTH',
  'reward': 0,
  'info': {},
  'observation': {'remainingOverageTime': 60, 'index': 2},
  'status': 'ACTIVE'},
 {'action': 'NORTH',
  'reward': 0,
  'info': {},
  'observation': {'remainingOverageTime': 60, 'index': 3},
  'status': 'ACTIVE'}]

In [9]:
#env.step(['NORTH', 'SOUTH','NORTH', 'SOUTH'])
state_dict = env.step(['WEST', 'SOUTH', 'NORTH', 'SOUTH'])
#Each step returns a state dict that includes an action, reward, info, and observation (where the food and geese are stored)

In [10]:
state_dict

[{'action': 'WEST',
  'reward': 201,
  'info': {},
  'observation': {'remainingOverageTime': 60,
   'step': 1,
   'geese': [[6], [70], [44], [62]],
   'food': [10, 46],
   'index': 0},
  'status': 'ACTIVE'},
 {'action': 'SOUTH',
  'reward': 201,
  'info': {},
  'observation': {'remainingOverageTime': 60, 'index': 1},
  'status': 'ACTIVE'},
 {'action': 'NORTH',
  'reward': 201,
  'info': {},
  'observation': {'remainingOverageTime': 60, 'index': 2},
  'status': 'ACTIVE'},
 {'action': 'SOUTH',
  'reward': 201,
  'info': {},
  'observation': {'remainingOverageTime': 60, 'index': 3},
  'status': 'ACTIVE'}]

In [11]:
env.render(mode="ipython")

In [13]:
results = env.run([GreedyAgent(), GreedyAgent(), GreedyAgent(), GreedyAgent()])

Opposite action: (0, <Action.EAST: 2>, <Action.WEST: 4>)
Opposite action: (1, <Action.NORTH: 1>, <Action.SOUTH: 3>)
Opposite action: (3, <Action.NORTH: 1>, <Action.SOUTH: 3>)


In [14]:
env.render(mode="ipython")

### Test StateTranslator

In [15]:
st_test = StateTranslator()

In [16]:
env = make("hungry_geese", debug=True)
config = env.configuration

state_dict = env.reset(num_agents=4)[0]
observation = state_dict['observation']
action = state_dict['action']


In [17]:
state_dict

{'action': 'NORTH',
 'reward': 0,
 'info': {},
 'observation': {'remainingOverageTime': 60,
  'step': 0,
  'geese': [[32], [19], [29], [54]],
  'food': [46, 50],
  'index': 0},
 'status': 'ACTIVE'}

In [18]:
st_test.set_last_action(action)

In [19]:
state_test = st_test.get_state(observation, config)
state_test

array([1.        , 0.        , 0.        , 0.        , 0.41558442,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 1.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [20]:
assert len(state_test) == 11*7*3 + 1 + 4 # 3 one hot boards, plus player head, plus last direction vec

### Its so fucking annoying. I need the agents to return Action.Whatever when called, I need to use a string when using env.step, and the output of my neural net is naturally 0, 1, 2, and the action returned by step is a string

#### I need to revisit how I am storing my states for future. I may need to include a last action item so that the dqn is able to understand what the 0, 1, 2 are for

In [21]:
st_test.translate_action_to_class(0)

<Action.NORTH: 1>

In [22]:
st_test.set_last_action = 'NORTH'
st_test.translate_text_to_int('WEST')

1

In [23]:
st_test.translate_action_to_text(Action.EAST)

'EAST'

### Training script

In [27]:
steps_per_ep = 200
num_episodes = 1000

env = make("hungry_geese", debug=True)
config = env.configuration

dqn = dqnAgent()
agent2 = GreedyAgent()
agent3 = GreedyAgent()
agent4 = GreedyAgent()

agents = [dqn, agent2, agent3, agent4]

results_dic = {}
for ep in range(num_episodes):
    
    print('episode number: ', ep)
    state_dict = env.reset(num_agents=4)[0]
    observation = state_dict['observation']
    my_goose_ind = observation['index']
    
    reward = state_dict['reward']
    action = state_dict['action']
    
    dqn.StateTrans.set_last_action(action)
    cur_state = dqn.StateTrans.get_state(observation, config)
    
    
    done = False
    for step in range(steps_per_ep):
        actions = []
        for agent in agents:
            action = agent(observation, config)
            actions.append(action)
        
        state_dict = env.step(actions)[0]
        observation = state_dict['observation']
        print(observation)
        reward = state_dict['reward']
        action = state_dict['action']
        status = state_dict['status']
        
        if status != "ACTIVE":
            done = True
            
        # Check if my goose died
        if len(observation['geese'][my_goose_ind])<1:
            done = True
            reward = -100
            
        if done:
            print('Done, Step: ', step)
            print('status, ', status)
            results_dic[ep] = reward
            directory = "first_agent"
            dqn.save_model(directory + f"/trial-{ep}")
            with open(directory + "/results_dic.pkl", 'wb') as f:
                pickle.dump(results_dic, f)
            break
        
        action_for_model = dqn.StateTrans.translate_text_to_int(action)
        new_state = dqn.StateTrans.get_state(observation, config)
        
        dqn.remember(cur_state, action_for_model, reward, new_state, done)

        cur_state = new_state
        if step%5 == 0:
            dqn.replay()        
            dqn.target_train()

        if step%50 == 0:
            print(f'We survived {step} steps')
            directory = "first_agent"
            dqn.save_model(directory + f"/trial-{ep}")
        


my state shape is: 236
episode number:  0
Goose Collision: NORTH
{'remainingOverageTime': 60, 'step': 1, 'geese': [[], [32], [8], [43, 33]], 'food': [61, 4], 'index': 0}
Done, Step:  0
status,  DONE
INFO:tensorflow:Assets written to: first_agent/trial-0\assets
episode number:  1
{'remainingOverageTime': 60, 'step': 1, 'geese': [[36], [16], [38], [42]], 'food': [66, 59], 'index': 0}
We survived 0 steps
INFO:tensorflow:Assets written to: first_agent/trial-1\assets
{'remainingOverageTime': 60, 'step': 2, 'geese': [[47], [27], [49], [53]], 'food': [66, 59], 'index': 0}
{'remainingOverageTime': 60, 'step': 3, 'geese': [[48], [38], [60], [64]], 'food': [66, 59], 'index': 0}
{'remainingOverageTime': 60, 'step': 4, 'geese': [[49], [37], [59, 60], [63]], 'food': [66, 14], 'index': 0}
Opposite action: (3, <Action.EAST: 2>, <Action.WEST: 4>)
{'remainingOverageTime': 60, 'step': 5, 'geese': [[50], [26], [70, 59], []], 'food': [66, 14], 'index': 0}
{'remainingOverageTime': 60, 'step': 6, 'geese': [

INFO:tensorflow:Assets written to: first_agent/trial-3\assets
episode number:  4
{'remainingOverageTime': 60, 'step': 1, 'geese': [[71], [62], [26], [14]], 'food': [22, 40], 'index': 0}
We survived 0 steps
INFO:tensorflow:Assets written to: first_agent/trial-4\assets
{'remainingOverageTime': 60, 'step': 2, 'geese': [[60], [51], [15], [3]], 'food': [22, 40], 'index': 0}
{'remainingOverageTime': 60, 'step': 3, 'geese': [[49], [40, 51], [4], [69]], 'food': [22, 59], 'index': 0}
{'remainingOverageTime': 60, 'step': 4, 'geese': [[38], [39, 40], [3], [68]], 'food': [22, 59], 'index': 0}
{'remainingOverageTime': 60, 'step': 5, 'geese': [[37], [50, 39], [14], [2]], 'food': [22, 59], 'index': 0}
{'remainingOverageTime': 60, 'step': 6, 'geese': [[36], [61, 50], [25], [13]], 'food': [22, 59], 'index': 0}
{'remainingOverageTime': 60, 'step': 7, 'geese': [[35], [62, 61], [26], [14]], 'food': [22, 59], 'index': 0}
{'remainingOverageTime': 60, 'step': 8, 'geese': [[24], [51, 62], [15], [3]], 'food': 

KeyboardInterrupt: 