In [1]:
import torch
import numpy as np
import pickle
from collections import namedtuple

In [2]:
Datapoint = namedtuple("Datapoint", ("network_input", "action", "discounted_reward"))
with open("../agent_code/training_data/training_data_cleaned.pt", "rb") as f:
    data = pickle.load(f)

In [3]:
rewards = np.array([d.discounted_reward for d in data])

In [4]:
np.max(rewards), np.min(rewards), np.mean(rewards), np.std(rewards)

(1.0, -1.0, 0.1653856406661175, 0.47785096305970176)

In [5]:
np.count_nonzero(rewards == 0), np.count_nonzero(rewards > 0), np.count_nonzero(rewards < 0)

(23906, 221310, 138565)

In [6]:
import pandas as pd

pd.DataFrame(rewards).describe()

Unnamed: 0,0
count,383781.0
mean,0.165386
std,0.477852
min,-1.0
25%,-0.03817
50%,0.041675
75%,0.471077
max,1.0


# Now testing a different reward function

In [64]:
import pickle
import sys
sys.path.append("..") # NamedTuple definition
with open("../agent_code/training_data/training_data_solo.pt", "rb") as f:
    data = pickle.load(f)

In [65]:
events = [d.reward for d in data]

In [66]:
import events as e

def get_reward_from_events(events):
    rewards = {
        e.CRATE_DESTROYED: 0.1, # incentivize destroying crates
        e.KILLED_SELF: -1,
        e.GOT_KILLED: -4, # KILLED_SELF + GOT_KILLED = -4
        e.INVALID_ACTION: -0.1, # WAIT instead
        e.COIN_COLLECTED: 1,
        e.KILLED_OPPONENT: 5, # Killing an opponent at the same time as getting killed is a net reward of 1
    }
    reward = sum([rewards.get(event, 0.) for event in events])
    return round(reward, 2) # .2 + .1 shenanigans

In [67]:
rewards = np.array([get_reward_from_events(event) for event in events])

In [68]:
len(rewards), np.max(rewards), np.min(rewards), np.mean(rewards), np.std(rewards)

(40061, 1.7, -0.1, 0.04989141559122339, 0.1757944043214819)

In [69]:
np.count_nonzero(rewards == 0), np.count_nonzero(rewards > 0), np.count_nonzero(rewards < 0)

(35310, 4593, 158)

# Current setup

In [70]:
discount_factor = 0.93
score_reduction_factor = 1 / 5

In [71]:
memories = list(data)

In [72]:
memories.reverse()
rewards_clean = np.zeros(len(memories))
rewards = np.zeros(len(memories))

running_reward = 0
for idx, memory in enumerate(memories):
    if memory.next_state is None:
        running_reward = 0
    
    reward = get_reward_from_events(memory.reward)

    last_rewards = []
    for i in range(1, 11):
        if memories[idx - i].next_state is None:
            break
        if idx - i < 0:
            break
        last_rewards.append(rewards_clean[idx - i] * discount_factor ** i) 

    running_reward = reward + sum(last_rewards)

    bounded_running_reward = min(1, max(-1, running_reward))

    rewards_clean[idx] = reward
    rewards[idx] = bounded_running_reward

memories.reverse()
rewards = np.flip(rewards)

In [73]:
len(rewards), np.max(rewards), np.min(rewards), np.mean(rewards), np.std(rewards)

(40061, 1.0, -0.03982991293924298, 0.3656755675369735, 0.31417155660068624)

In [74]:
rewards[:50]

array([0.07443866, 0.08004157, 0.0860662 , 0.18934076, 0.20359222,
       0.21891636, 0.02034017, 0.12939804, 0.13913767, 0.1496104 ,
       0.1608714 , 0.41497115, 0.44620554, 0.4797909 , 0.30085044,
       0.32349509, 0.34784418, 0.37402601, 0.4021785 , 0.43245   ,
       0.65859292, 0.70816443, 0.22383272, 0.24068035, 0.25879607,
       0.27827535, 0.2992208 , 0.3217428 , 0.34596   , 0.372     ,
       0.4       , 0.        , 0.14519469, 0.15612332, 0.16787454,
       0.18051026, 0.67807936, 0.72911759, 0.78399741, 0.84300797,
       0.90646018, 0.97468837, 1.        , 0.94955169, 1.        ,
       1.        , 1.        , 0.19409706, 0.20870651, 0.2244156 ])

In [76]:
memories[6].reward

['INVALID_ACTION']