In [9]:
import numpy as np
from agent_code.training_data.memory import ReplayMemory
import torch
import io

In [2]:
from collections import namedtuple, deque

# Intermediate record holding everything extracted from one replay-memory entry.
# The typename matches the variable name so that repr() output and pickle's
# lookup-by-name both refer to this class (it was previously labelled "Transition").
ActionWithEvent = namedtuple(
    "ActionWithEvent",
    (
        "game_state",
        "state",
        "action",
        "score",
        "events",
        "round_reward",
        "discounted_reward",
    ),
)
# Final training sample derived from an ActionWithEvent:
# network_input, action, discounted_reward.
Datapoint = namedtuple("Datapoint", ("network_input", "action", "discounted_reward"))

In [3]:
# Replay buffer instance; a later cell populates it from disk via load_from_file.
replay = ReplayMemory()

In [10]:
import pickle  # imported here so this cell does not depend on a later cell having run first


class CPU_Unpickler(pickle.Unpickler):
    """Unpickler that deserializes torch storages onto the CPU.

    Whenever the pickle stream asks for ``torch.storage._load_from_bytes``,
    a replacement loader is returned that calls ``torch.load`` with
    ``map_location='cpu'``. Every other lookup is delegated unchanged to
    ``pickle.Unpickler``.

    SECURITY: like any unpickler, this executes code referenced by the
    stream; only use it on files from a trusted source.
    """

    def find_class(self, module, name):
        """Resolve ``module.name`` for the unpickler.

        Args:
            module: Dotted module path recorded in the pickle stream.
            name: Attribute name inside ``module``.

        Returns:
            A callable taking raw bytes and returning the result of
            ``torch.load(..., map_location='cpu')`` for the torch storage
            loader; otherwise whatever ``pickle.Unpickler.find_class`` returns.
        """
        if module == 'torch.storage' and name == '_load_from_bytes':
            # Substitute the loader so the bytes are mapped to the CPU device.
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        else:
            return super().find_class(module, name)

In [2]:
import pickle
from collections import deque

# Load the pickled training data from the working directory.
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load files that you produced yourself or otherwise trust.
with open("training_data.pt", "rb") as f:
    data = pickle.load(f)

In [4]:
# Display the first loaded entry to inspect the record layout.
data[0]

Transition(state={'round': 1, 'step': 1, 'field': array([[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1],
       [-1,  0,  0,  0,  1,  1,  0,  1,  1,  0,  1,  1,  0,  0,  0,  0,
        -1],
       [-1,  0, -1,  1, -1,  0, -1,  0, -1,  1, -1,  0, -1,  1, -1,  0,
        -1],
       [-1,  0,  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  1,  1,  1,  1,
        -1],
       [-1,  0, -1,  1, -1,  0, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1,
        -1],
       [-1,  0,  0,  1,  1,  1,  1,  1,  1,  0,  0,  1,  1,  1,  0,  1,
        -1],
       [-1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1,
        -1],
       [-1,  1,  0,  1,  1,  0,  1,  0,  0,  0,  0,  1,  1,  1,  0,  1,
        -1],
       [-1,  0, -1,  1, -1,  1, -1,  0, -1,  1, -1,  0, -1,  1, -1,  0,
        -1],
       [-1,  1,  1,  0,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  1,  1,
        -1],
       [-1,  1, -1,  1, -1,  1, -1,  1, -1,  0, -1,  1, -1,  1, -1,  1,
        -1],
       [-1,  1,

In [25]:
from itertools import chain


# Flatten `data` by exactly one level: every element of `data` is iterated and
# its items are collected into a single deque.
# NOTE(review): this rebinds `data` and is not idempotent — running the cell a
# second time would iterate over the individual records themselves. Confirm the
# nesting of `data` before re-running.
data = deque(chain.from_iterable(data))

In [29]:
# Persist the current `data` object with pickle.
# NOTE(review): the target path is the same "training_data.pt" that the load
# cell reads, so the file on disk is replaced by this write.
with open("training_data.pt", "wb") as f:
    pickle.dump(data, f)

In [4]:
# Fill the replay buffer from the training_data_solo.pt file.
# NOTE(review): the path is relative, so it depends on the kernel's working directory.
replay.load_from_file('agent_code/training_data/training_data_solo.pt')

In [5]:
# Number of entries held by the replay buffer after loading.
len(replay)

840098

In [6]:
# Fetch the stored entries; the cells below index, reverse and iterate over this object.
memories = replay.get()

In [7]:
import events as e

In [8]:
def get_score(memory):
    """Return element 1 of the ``"self"`` entry in ``memory.state`` (used here as the score)."""
    own_entry = memory.state["self"]
    return own_entry[1]

def get_round(memory):
    """Return the ``"round"`` value stored in ``memory.state``."""
    game_state = memory.state
    return game_state["round"]

def get_step(memory):
    """Return the ``"step"`` value stored in ``memory.state``."""
    game_state = memory.state
    return game_state["step"]

def get_events(memory):
    """Return the ``reward`` attribute of ``memory``.

    Despite the attribute's name, this function treats it as the list of
    events recorded for the transition.
    """
    recorded_events = memory.reward
    return recorded_events

def get_reward_from_events(events):
    """Translate an iterable of game events into a single scalar reward.

    Each event contributes the value listed in the table below; events that
    are not in the table contribute nothing. The total is rounded to two
    decimal places.
    """
    value_per_event = {
        e.CRATE_DESTROYED: 0.1,
        e.KILLED_SELF: -1,
        e.GOT_KILLED: -4,
        e.INVALID_ACTION: -0.1,
        e.COIN_COLLECTED: 1,
        e.KILLED_OPPONENT: 5,
    }
    # Unknown events fall back to 0. so they do not affect the total.
    total = sum(value_per_event.get(occurred, 0.) for occurred in events)
    return round(total, 2)

In [9]:
# Spot-check a single entry: shows the events recorded for that transition.
get_events(memories[1481])

['MOVED_UP']

In [10]:
# Discount factor: down-weights rewards that lie further in the future.
# With 0.93, a reward 10 steps ahead keeps roughly half (~48%) of its value;
# the exact figure is printed below.
discount_factor = 0.93
print(discount_factor ** 10)
# Score normalization.
# The target should lie in [-1, 1]. A player kill is worth 5 and a coin 1, so
# the raw reward is divided by 5. Because several rewards can occur at once and
# the discounted future reward is added on top, an extra factor of 2 is applied
# so the value only saturates when two kills happen at the same time. The
# result still has to be restricted to [-1, 1] afterwards.
# NOTE(review): in the visible cells score_reduction_factor is only used by the
# print statement below — confirm whether the discounted-reward computation was
# meant to apply it as well.
score_reduction_factor = 1 / (5 * 2)
# Example: a kill one step ahead plus a coin two and three steps ahead.
print(5 * score_reduction_factor * discount_factor + 1 * score_reduction_factor * discount_factor**2 + 1 * score_reduction_factor * discount_factor**3)

0.4839823071792934
0.6319257


In [11]:
# Immediate (undiscounted) reward of every memory, in the same order as
# `memories`, stored as a float array with one slot per memory.
rewards = np.fromiter(
    (get_reward_from_events(entry.reward) for entry in memories),
    dtype=float,
    count=len(memories),
)

In [12]:
# Reverse in place so the loop below walks the transitions from last to first.
# NOTE: this mutates `memories`; running the cell again flips the order back.
memories.reverse()

In [13]:
from agent_code.mcts.game_state import state_from_game_state

In [14]:
# Build one ActionWithEvent per memory while accumulating the discounted
# reward. `memories` is expected to be in reverse order here, so each step
# adds the current reward to the discounted total of the steps after it.
actions_with_event = deque()
running_reward = 0
for memory in memories:
    # A memory without a next_state restarts the accumulation.
    if memory.next_state is None:
        running_reward = 0

    state = state_from_game_state(memory.state)

    reward = get_reward_from_events(memory.reward)
    running_reward = discount_factor * running_reward + reward

    # Clamp the stored value to [-1, 1]; the running total itself stays unclamped.
    bounded_running_reward = max(-1, min(1, running_reward))

    # Prefer the score after the action; fall back to the current state's score.
    if memory.next_state:
        score = memory.next_state["self"][1]
    else:
        score = memory.state["self"][1]

    actions_with_event.append(
        ActionWithEvent(
            game_state=memory.state,
            state=state,
            action=memory.action,
            score=score,
            events=memory.reward,
            round_reward=reward,
            discounted_reward=bounded_running_reward,
        )
    )


In [15]:
# Sanity check: both lengths should match (one ActionWithEvent per memory).
len(memories), len(actions_with_event)

(840098, 840098)

In [16]:
# Undo the earlier reversal so both sequences are back in their original order.
# NOTE: both calls mutate in place; re-running this cell reverses them again.
memories.reverse()
actions_with_event.reverse()

In [17]:
# Reduce every ActionWithEvent to the three fields kept for training:
# the network input (its `state`), the action and the discounted reward.
datapoints = deque(
    Datapoint(
        network_input=record.state,
        action=record.action,
        discounted_reward=record.discounted_reward,
    )
    for record in actions_with_event
)

In [18]:
import pickle

# Write the cleaned datapoints to disk with pickle.
# NOTE(review): Datapoint is a namedtuple class defined in this notebook, and
# pickle stores classes by name — presumably the loading side must define a
# matching Datapoint; verify against the consumer of this file.
with open("training_data_solo_cleaned.pt", "wb") as f:
    pickle.dump(datapoints, f)