In [1]:
import numpy as np
from agent_code.training_data.memory import ReplayMemory

In [2]:
from collections import namedtuple

# First we extract all information from the memory.
# The typename passed to namedtuple matches the variable name: it is what repr()
# prints and what pickle uses to look the class up again, so a mismatching
# name ("Transition") would give misleading output and unpicklable instances.
ActionWithEvent = namedtuple(
    "ActionWithEvent",
    ("game_state", "state", "action", "score", "events", "round_reward", "discounted_reward"),
)
# Then we reduce it to network_input, action, discounted_reward.
Datapoint = namedtuple("Datapoint", ("network_input", "action", "discounted_reward"))

In [3]:
# Create the replay memory object; the next cell fills it from a file on disk.
replay = ReplayMemory()

In [4]:
# Load the saved training data into `replay`.
# NOTE(review): the path is relative, so this presumably has to be run with the
# repository root as working directory — confirm.
replay.load_from_file('agent_code/training_data/training_data.pt')

In [5]:
# Size of the replay memory as reported by ReplayMemory.__len__.
len(replay)

383781

In [6]:
# Fetch the stored memories. Later cells index, iterate and reverse this object in place.
# NOTE(review): it may alias the replay's internal storage, in which case the
# in-place reversals further down also affect `replay` — confirm.
memories = replay.get()

In [7]:
import events as e

In [8]:
def get_score(memory):
    """Return the value at index 1 of the ``"self"`` entry of ``memory.state``.

    The notebook treats this value as the agent's own score.
    """
    own_agent = memory.state["self"]
    return own_agent[1]

def get_round(memory):
    """Return the ``"round"`` entry of the state dict stored in ``memory``."""
    state = memory.state
    return state["round"]

def get_step(memory):
    """Return the ``"step"`` entry of the state dict stored in ``memory``."""
    state = memory.state
    return state["step"]

def get_reward_from_events(events):
    """Translate a sequence of game events into a single shaped reward.

    Each occurrence of a listed event contributes its fixed value; events that
    are not listed contribute nothing. The total is rounded to two decimals.

    Args:
        events: iterable of event identifiers (compared against the constants
            of the ``events`` module, imported as ``e``).

    Returns:
        The summed reward, rounded to 2 decimal places.
    """
    value_per_event = {
        e.CRATE_DESTROYED: 0.1,
        e.KILLED_SELF: -1,
        e.GOT_KILLED: -0.9,
        e.INVALID_ACTION: -0.1,
    }
    total = sum(value_per_event.get(name, 0.) for name in events)
    return round(total, 2)

In [18]:
# Spot-check the score stored in a single memory.
get_score(memories[106])

1

In [19]:
# Walk through the memories in order and print every step whose combined reward
# (score gained since the previous memory + shaped event reward) is non-zero.
# Printed columns: step, round, score, combined reward, event reward, raw events.
last_score = 0
for memory in memories:
    score = get_score(memory)
    event_reward = get_reward_from_events(memory.reward)
    reward = (score - last_score) + event_reward
    last_score = score
    if reward != 0:
        print(get_step(memory), get_round(memory), score, reward, event_reward, memory.reward)

    # No next state (presumably the last step of a round): print the round
    # number and restart the score baseline for the following memory.
    if not memory.next_state:
        print(get_round(memory))
        last_score = 0

7 1 0 0.5 0.5 ['MOVED_RIGHT', 'BOMB_EXPLODED', 'CRATE_DESTROYED', 'CRATE_DESTROYED', 'CRATE_DESTROYED', 'CRATE_DESTROYED', 'CRATE_DESTROYED']
15 1 0 0.3 0.3 ['MOVED_LEFT', 'BOMB_EXPLODED', 'CRATE_DESTROYED', 'CRATE_DESTROYED', 'CRATE_DESTROYED']
26 1 0 0.2 0.2 ['MOVED_RIGHT', 'BOMB_EXPLODED', 'CRATE_DESTROYED', 'CRATE_DESTROYED']
27 1 0 -0.1 -0.1 ['INVALID_ACTION']
34 1 0 0.1 0.1 ['MOVED_UP', 'BOMB_EXPLODED', 'CRATE_DESTROYED']
56 1 0 0.2 0.2 ['MOVED_UP', 'BOMB_EXPLODED', 'CRATE_DESTROYED', 'CRATE_DESTROYED']
69 1 0 0.1 0.1 ['INVALID_ACTION', 'BOMB_EXPLODED', 'CRATE_DESTROYED', 'CRATE_DESTROYED']
70 1 0 -0.1 -0.1 ['INVALID_ACTION']
94 1 0 0.2 0.2 ['MOVED_RIGHT', 'BOMB_EXPLODED', 'CRATE_DESTROYED', 'CRATE_DESTROYED']
106 1 1 1.0 0.0 ['MOVED_RIGHT']
111 1 1 -0.1 -0.1 ['INVALID_ACTION', 'BOMB_EXPLODED']
118 1 2 1.0 0.0 ['MOVED_DOWN']
126 1 2 -0.1 -0.1 ['INVALID_ACTION']
145 1 2 0.2 0.2 ['MOVED_LEFT', 'BOMB_EXPLODED', 'CRATE_DESTROYED', 'CRATE_DESTROYED']
152 1 2 0.1 0.1 ['MOVED_DOWN', 'BO

In [76]:
# Despite its name, the `reward` field holds the list of event names of the step
# (see the output below), not a numeric reward.
memories[0].reward

['MOVED_DOWN']

In [None]:
# Collect the index at which each new round begins, i.e. every position where
# the "round" value of the state differs from that of the preceding memory.
start_indexes = []
current_round = -1
for idx, memory in enumerate(memories):
    round_number = memory.state["round"]
    if round_number == current_round:
        continue
    start_indexes.append(idx)
    current_round = round_number

In [7]:
# Reverse the memories so that index 0 holds the last recorded step.
# NOTE: this mutates `memories` in place; a later cell reverses it back. Running
# either cell on its own (or twice) leaves every following cell with the wrong order.
memories.reverse()

In [8]:
# Sanity check after the reversal: round number of the element now at index 0.
memories[0].state["round"]

In [9]:
# The discount factor reduces the importance of future rewards:
# after 10 steps a reward is worth roughly half (about 48%) of its original value.
discount_factor = 0.93
print(discount_factor ** 10)
# Score normalization:
# the score should lie between -1 and 1. Player kills normally give 5 points and coins 1,
# so we normalize by dividing by 5.
# We will still have to restrict (clip) the score to [-1, 1] afterwards.
# Additionally, since multiple rewards can be given at the same time and the discounted
# score from the future is added to the current score, we divide by an additional factor.
# That additional factor is 2, so that the score maxes out if you get 2 kills at once.
score_reduction_factor = 1 / (5 * 2)
# Example: 5 points one step ahead plus 1 point each two and three steps ahead.
print(5 * score_reduction_factor * discount_factor + 1 * score_reduction_factor * discount_factor**2 + 1 * score_reduction_factor * discount_factor**3)

0.4839823071792934
0.6319257


In [66]:
# Find every position where the score went up between two neighbouring memories.
# This cell expects `memories` to be in REVERSED order (newest first): element
# `idx` is the later step and `idx + 1` the step before it. Keys are converted
# back to indexes of the un-reversed list; values are the points gained.
score_increases = {}
for idx in range(len(memories) - 1):
    gained = get_score(memories[idx]) - get_score(memories[idx + 1])
    if gained > 0:
        score_increases[len(memories) - idx - 1] = gained

In [67]:
# Undo the earlier in-place reversal so that `memories` is in recording order again.
# NOTE: in-place; only correct if the earlier reversal cell ran exactly once before.
memories.reverse()

In [68]:
# Sanity check: with the original order restored, index 0 should be step 1.
memories[0].state["step"]

1

In [75]:
# Spot-check one detected increase: index 382307 is listed in score_increases with a gain of 5.
get_score(memories[382307])

8

In [72]:
# Display the detected increases: key = index into the un-reversed `memories`,
# value = points gained at that index.
score_increases

{383447: 1,
 383433: 1,
 383337: 1,
 383324: 1,
 383269: 1,
 383241: 1,
 383169: 1,
 383159: 1,
 383071: 1,
 383036: 5,
 383010: 1,
 383002: 1,
 382994: 1,
 382990: 1,
 382652: 5,
 382597: 1,
 382572: 1,
 382398: 1,
 382307: 5,
 382255: 1,
 382238: 1,
 382219: 1,
 381942: 1,
 381936: 1,
 381805: 1,
 381497: 1,
 381485: 1,
 381483: 1,
 381368: 1,
 381015: 1,
 381012: 5,
 380989: 1,
 380988: 1,
 380942: 1,
 380907: 1,
 380780: 5,
 380754: 1,
 380302: 1,
 380232: 1,
 380017: 1,
 380014: 1,
 379903: 1,
 379814: 5,
 379782: 5,
 379769: 1,
 379546: 1,
 379235: 1,
 378938: 5,
 378810: 1,
 378763: 1,
 378653: 1,
 378600: 1,
 378401: 1,
 378376: 1,
 378256: 1,
 378176: 1,
 377539: 5,
 377462: 1,
 377257: 1,
 377170: 5,
 377063: 1,
 376743: 1,
 376610: 1,
 376570: 1,
 376556: 1,
 376528: 5,
 376344: 5,
 376289: 1,
 375936: 1,
 375908: 5,
 375896: 1,
 375882: 1,
 375822: 1,
 375712: 1,
 375691: 1,
 375644: 1,
 375418: 1,
 375001: 5,
 374864: 1,
 374841: 1,
 374562: 1,
 374533: 5,
 374455: 1,
 374

In [21]:
import agent_code.mcts.deep_network as deep_network
from agent_code.mcts.game_state import state_from_game_state

In [40]:
import importlib
# Re-import the network module so that edits to deep_network.py take effect
# without restarting the kernel.
importlib.reload(deep_network)

<module 'agent_code.mcts.deep_network' from 'g:\\Programming\\bomberman_rl\\agent_code\\mcts\\deep_network.py'>

In [41]:
import torch_directml
# Device handle obtained from torch_directml; the network is moved onto it below.
device = torch_directml.device()

In [42]:
# Instantiate the network with its default arguments and move it to `device`.
test_network = deep_network.MCTSNetwork().to(device)

In [46]:
# Smoke test: one forward pass on a randomly sampled memory. The first field of the
# sample (presumably the raw game state — TODO confirm) is converted with
# state_from_game_state. The output shown is a pair: a 1x1 tensor produced by a
# tanh and a 1x6 tensor produced by a softmax.
test_network(state_from_game_state(replay.sample(1)[0][0]))

(tensor([[-0.0309]], device='privateuseone:0', grad_fn=<TanhBackward0>),
 tensor([[0.1987, 0.1115, 0.1903, 0.1209, 0.1654, 0.2131]],
        device='privateuseone:0', grad_fn=<SoftmaxBackward0>))