In [3]:
import numpy as np
import tensorflow as tf
import random
from tensorflow.keras.layers import Input, Dense, Reshape, Flatten, Dropout, Conv2D
from tensorflow.keras.layers import BatchNormalization, Activation, ZeroPadding2D
from tensorflow.keras.models import Sequential, Model

from tensorflow.keras import backend as K

from settings import s, e


pygame 1.9.4
Hello from the pygame community. https://www.pygame.org/contribute.html


In [14]:
class replay_buffer():
    """Fixed-capacity FIFO store that keeps several parallel columns of experience.

    ``buffer`` is a list of ``dimension`` Python lists.  Entry ``k`` of every
    column belongs to the same transition, so callers must extend all columns
    with equally long sequences.
    """

    def __init__(self, size = 1000, dimension=3):
        """Create an empty buffer.

        Args:
            size: maximum number of entries kept per column.
            dimension: number of parallel columns.
        """
        self.size = size
        self.dimension = dimension
        self.clear()

    def add(self, *args):
        """Append one sequence per column, discarding the oldest entries on overflow.

        Args:
            *args: at least ``dimension`` sequences; ``args[i]`` is appended to
                column ``i``.  Surplus positional arguments are ignored.

        Raises:
            IndexError: if fewer than ``dimension`` sequences are passed.
        """
        for i in range(self.dimension):
            column = self.buffer[i]
            column.extend(args[i])
            # Trim AFTER extending so that a single add() longer than `size`
            # cannot leave the column above capacity.
            overflow = len(column) - self.size
            if overflow > 0:
                del column[:overflow]

    def sample(self,batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        The same positions are taken from every column, so the returned arrays
        stay aligned.

        Returns:
            A list with one numpy array per column.

        Raises:
            ValueError: (from ``random.sample``) if ``batch_size`` exceeds the
                number of stored transitions.
        """
        picked = random.sample(range(len(self.buffer[0])), batch_size)
        return [np.array(column)[picked] for column in self.buffer]

    def clear(self):
        """Drop all stored transitions, keeping ``size`` and ``dimension``."""
        self.buffer = [[] for _ in range(self.dimension)]
        

In [7]:
def delayed_reward(reward, disc_factor):
    """Compute a delayed (discounted) reward for every step of an episode.

    Args:
        reward: sequence of per-step rewards, in chronological order.
        disc_factor: factor applied to the accumulated future reward.

    Returns:
        Float numpy array of the same length where entry ``j`` is
        ``reward[j] + disc_factor * sum(reward[j+1:])``.
    """
    # Work in floating point: with integer rewards the original int-typed
    # output array silently truncated the discounted values.
    rewards = np.asarray(reward, dtype=float)
    dela_rew = np.empty_like(rewards)
    storage = 0.0
    # Walk backwards from the last valid index (len - 1) down to 0.
    for j in range(len(rewards) - 1, -1, -1):
        dela_rew[j] = storage * disc_factor + rewards[j]
        # NOTE(review): `storage` is the UNdiscounted sum of later rewards.  The
        # usual discounted return would carry `dela_rew[j]` forward instead —
        # confirm which of the two is intended.
        storage = storage + rewards[j]
    return dela_rew


In [15]:

# Action names.  The position of a name in this list is the index of the
# corresponding network output (act() looks the name up via choices[index]).
choices = ['RIGHT', 'LEFT', 'UP', 'DOWN', 'BOMB', 'WAIT']

# Number of feature channels of the network input:
# channels: arena, self, others (3), bombs, explosions, coins -> c = 8 (see get_x)
c = 8


def setup(agent):
    """Build the policy network and its training op and attach them to ``agent``.

    Uses the TF1 graph/session API through the Keras backend.  Attributes set
    on ``agent``: ``model``, ``update``, ``inputs``, ``action_holder``,
    ``reward_holder``, ``buffer``, ``episode_buffer`` and ``epsilon``.
    """
    K.clear_session()

    D = len(choices)

    #========================
    #  Define Model
    #========================

    inputs = Input(shape=(s.cols, s.rows, c))
    x = Conv2D(16, 3)(inputs)
    x = Flatten()(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    pred = Dense(D, activation='softmax')(x)

    model = Model(inputs=inputs, outputs=pred)

    agent.model = model

    #========================
    #  Define Training Update
    #========================

    action_holder = Input(shape=(1,), dtype='int32')  # in j=0,...,D-1
    reward_holder = Input(shape=(1,))

    # One-hot mask of the chosen action, shape (batch, D).
    action_mask = tf.one_hot(action_holder, D)[:, 0, :]

    # Probability the network assigned to the chosen action, one value PER
    # SAMPLE (shape (batch, 1)).  The previous reduce_sum(boolean_mask(...))
    # added the probabilities of the whole batch into a single scalar before
    # taking the log, which is only correct for a batch of one.
    responsible_weight = tf.reduce_sum(pred * action_mask, axis=1, keepdims=True)

    # Non-scalar loss: optimizer.minimize sums the per-sample gradients.
    loss = - (tf.log(responsible_weight) * reward_holder)

    optimizer = tf.train.AdamOptimizer(0.1)
    update = optimizer.minimize(loss)

    # Initialize all variables
    init_op = tf.global_variables_initializer()
    K.get_session().run(init_op)

    # the alternative Keras way:
    #training_model = Model(inputs=[inputs, action_holder, reward_holder], outputs=loss)
    #training_model.compile(loss=lambda y_true, y_pred: y_pred, optimizer='Adam')

    agent.update = update

    agent.inputs = inputs
    agent.action_holder = action_holder
    agent.reward_holder = reward_holder

    agent.buffer = replay_buffer() #total buffer
    agent.episode_buffer = replay_buffer() #episode buffer
    agent.epsilon=0.1  # probability of taking a random action in act()

    np.random.seed()

def act(agent):
    """Pick the next action epsilon-greedily and store it on ``agent``.

    Sets ``agent.X`` (encoded game state), ``agent.action_choice`` (index into
    ``choices``) and ``agent.next_action`` (the action name).
    """
    print('Epsilon greedy')
    X = get_x(agent.game_state)
    agent.X = X

    if np.random.rand(1) > agent.epsilon:
        # Exploit: take the action with the highest predicted probability.
        pred = agent.model.predict(np.array([X]))
        agent.action_choice = np.argmax(pred)
    else:
        # Explore: sample the action INDEX so that action_choice is recorded as
        # well.  Previously only next_action was set here, so reward_update
        # stored a stale (or missing) action for every exploratory step.
        agent.action_choice = np.random.choice(len(choices), p=[.23, .23, .23, .23, .08, .00])
    agent.next_action = choices[agent.action_choice]
    print("================================")
    print(agent.next_action)

def reward_update(agent):
    """Turn the events of the last step into a reward and record the transition.

    Reads ``agent.events``, ``agent.X`` and ``agent.action_choice``; sets
    ``agent.reward`` and appends one transition to ``agent.episode_buffer``.
    """
    print('Update')
    events = agent.events
    reward = 0
    reward += events.count(e.COIN_FOUND)
    reward += events.count(e.COIN_COLLECTED)
    reward += 2 * events.count(e.KILLED_OPPONENT)
    reward -= 10 * events.count(e.KILLED_SELF)
    reward -= 5 * events.count(e.GOT_KILLED)
    reward += 20 * events.count(e.SURVIVED_ROUND)
    agent.reward = reward
    # replay_buffer.add extends each column with a SEQUENCE, so a single
    # transition has to be wrapped in one-element lists.  Passing the bare
    # values failed on len() of the scalar action / reward.
    agent.episode_buffer.add([agent.X], [agent.action_choice], [agent.reward])
    # The former agent.Xs / agent.actions / agent.rewards .append() calls were
    # dropped: setup() never creates those lists and end_of_episode() assigns
    # numpy arrays to them, so the calls could only raise AttributeError.

def end_of_episode(agent, disc_factor=0.99, batch_size=10):
    """Move the finished episode into the replay buffer and run one training step.

    Args:
        agent: object prepared by ``setup`` (needs ``buffer``,
            ``episode_buffer``, ``update``, ``inputs``, ``action_holder`` and
            ``reward_holder``).
        disc_factor: discount factor handed to ``delayed_reward``.  The previous
            call omitted this required argument; 0.99 is a conventional default
            chosen here, not a tuned value.
        batch_size: number of transitions to train on; capped at the number of
            stored transitions.
    """
    x, action, reward = agent.episode_buffer.buffer
    # buffer is a list of column lists, so the rewards are simply column 2
    # (numpy-style buffer[:,2] indexing raises TypeError on a list).
    print(reward)
    agent.buffer.add(x, action, delayed_reward(reward, disc_factor))
    agent.episode_buffer.clear() #clear episode_buffer

    available = len(agent.buffer.buffer[0])
    if available == 0:
        # Nothing recorded yet, so there is nothing to train on.
        print('End of Episode')
        return

    # get batch to train on random experiences
    agent.Xs, agent.actions, agent.rewards = agent.buffer.sample(min(batch_size, available))

    sess = K.get_session()
    sess.run([agent.update], feed_dict={
        agent.inputs: np.array(agent.Xs),
        # Both placeholders were created with shape=(1,), i.e. (batch, 1).
        agent.reward_holder: np.array(agent.rewards).reshape(-1, 1),
        agent.action_holder: np.array(agent.actions).reshape(-1, 1),
    })
    print('End of Episode')

In [1]:
def build_network():
    """Build one convolutional network with a softmax output per action.

    Returns:
        (model, pred): the Keras ``Model`` and its output tensor.
    """
    # `D` only exists as a local inside setup(); derive the number of actions
    # from `choices` so this function does not raise NameError.
    n_actions = len(choices)

    inputs = Input(shape=(s.cols, s.rows, c))
    x = Conv2D(16, 3)(inputs)
    x = Flatten()(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    pred = Dense(n_actions, activation='softmax')(x)
    model = Model(inputs=inputs, outputs=pred)

    return model, pred
    
#create networks
D = len(choices)  # number of discrete actions (was an undefined name in this cell)
q_net, q_pred = build_network()
t_net, t_pred = build_network()

action_holder = Input(shape=(1,), dtype='int32')  # in j=0,...,D-1
reward_holder = Input(shape=(1,))
q_target_holder = Input(shape=(1,))  # not used by the graph below yet

# Combine both heads: q_pred plus the mean-centred t_pred.
output = q_pred+ tf.subtract(t_pred, tf.reduce_mean(t_pred, axis=1, keepdims=True))

# Keep only the value of the chosen action, one entry per sample (batch, 1).
action_mask = tf.one_hot(action_holder, D)[:,0,:]
responsible_weight_q = tf.reduce_sum(output * action_mask, axis=1, keepdims=True) ###Qvalue

# The loss previously referenced `responsible_weight`, which is not defined in
# this cell; the tensor built above is `responsible_weight_q`.
# NOTE(review): `output` can be <= 0 after mean-centring, which makes tf.log
# return NaN/-inf — confirm that a log-likelihood loss is intended here.
loss = - (tf.log(responsible_weight_q) * reward_holder)

optimizer = tf.train.AdamOptimizer(0.1)
update = optimizer.minimize(loss)


# Initialize all variables
init_op = tf.global_variables_initializer()
K.get_session().run(init_op)

SyntaxError: unexpected EOF while parsing (<ipython-input-1-d1b4c789df83>, line 2)

In [4]:
# Manual run of the end-of-episode training step.
# NOTE(review): `agent` is not created in any visible cell — presumably it was
# still alive in the kernel; confirm before relying on Restart & Run All.
end_of_episode(agent)

End of Episode


'step' The number of steps in the episode so far, starting at 1.

'arena' A 2D numpy array describing the tiles of the game board. Its entries are 1 for
crates, −1 for stone walls and 0 for free tiles.

'self' A tuple (x, y, n, b) describing your own agent. x and y are its coordinates on
the board, n its name and b ∈ {0, 1} a flag indicating if the 'BOMB' action is
possible (i.e. no own bomb is currently ticking).

'others' A list of tuples like the one above for all opponents that are still in the game.

'bombs' A list of tuples (x, y, t) of coordinates and countdowns for all active bombs.

'explosions' A 2D numpy array stating, for each tile, for how many steps an explosion will
be present. Where there is no explosion, the value is 0.

'coins' A list of coordinates (x, y) for all currently collectable coins.

In [5]:
def get_x(game_state):
    """Encode ``game_state`` as a feature array of shape (s.cols, s.rows, 8).

    Channel layout:
        0     arena tiles
        1     own position, marked with element 3 of the 'self' tuple
        2-4   one channel per opponent, marked with element 3 of its tuple
        5     bombs, marked with element 2 of the bomb tuple
        6     explosions map
        7     coins, marked with 1
    """
    board = game_state['arena']
    own = game_state['self']
    opponents = game_state['others']
    active_bombs = game_state['bombs']
    blast_map = game_state['explosions']
    coin_positions = game_state['coins']

    n_channels = 8
    features = np.zeros((s.cols, s.rows, n_channels))

    # Full-board layers are copied in one assignment each.
    features[:, :, 0] = board

    features[own[0], own[1], 1] = own[3]

    # Opponent k goes to channel k + 2.
    for channel, opponent in enumerate(opponents, start=2):
        features[opponent[0], opponent[1], channel] = opponent[3]

    for bomb in active_bombs:
        features[bomb[0], bomb[1], 5] = bomb[2]

    features[:, :, 6] = blast_map

    for coin in coin_positions:
        features[coin[0], coin[1], 7] = 1

    return features

In [16]:
class replay_buffer():
    """Single-column FIFO buffer holding at most ``size`` experiences.

    NOTE(review): this redefinition shadows the multi-column ``replay_buffer``
    defined earlier in the notebook once this cell is executed.
    """
    def __init__(self, size = 5):
        """Create an empty buffer with capacity ``size``."""
        self.size = size
        self.buffer = []
    def add(self, experience):
        """Append every item of ``experience`` and drop the oldest on overflow."""
        self.buffer.extend(experience)
        # Trim after extending so a batch longer than `size` cannot leave the
        # buffer above capacity.
        overflow = len(self.buffer) - self.size
        if overflow > 0:
            del self.buffer[:overflow]
    def sample(self,batch_size):
        """Return ``batch_size`` distinct stored experiences chosen at random.

        Raises ValueError (from ``random.sample``) if ``batch_size`` exceeds
        the number of stored experiences.
        """
        return random.sample(self.buffer, batch_size)
        
            

In [46]:
# Scratch check: slicing a 1-D array that mixes arrays and scalars.
# dtype=object is required for such ragged input; without it NumPy >= 1.24
# raises ValueError instead of building an object array.
rlist=np.array([np.array([1,2,3]),np.array([4,5,6]),1,2,3,4,5], dtype=object)

print(rlist[1:])

[array([4, 5, 6]) 1 2 3 4 5]


In [53]:
# Scratch check: np.random.rand(1) yields a length-1 array, the form that
# act() compares against agent.epsilon.
np.random.rand(1)

array([0.79653437])

In [10]:
# Small integer list for experiments; presumably test input for
# delayed_reward — confirm.
rlist=[1,2,3,4,5,6,7]

In [13]:
def delayed_reward(reward, disc_factor):
    """Compute a delayed (discounted) reward for every step of an episode.

    Args:
        reward: sequence of per-step rewards, in chronological order.
        disc_factor: factor applied to the accumulated future reward.

    Returns:
        Float numpy array of the same length where entry ``j`` is
        ``reward[j] + disc_factor * sum(reward[j+1:])``.
    """
    # Work in floating point: np.empty_like on an integer list produced an
    # int array, so the discounted values were silently truncated.
    rewards = np.asarray(reward, dtype=float)
    dela_rew = np.empty_like(rewards)
    storage = 0.0
    # Iterate from the last step back to the first.
    for j in range(len(rewards) - 1, -1, -1):
        dela_rew[j] = storage * disc_factor + rewards[j]
        # NOTE(review): `storage` is the UNdiscounted sum of later rewards.  The
        # usual discounted return would carry `dela_rew[j]` forward instead —
        # confirm which of the two is intended.
        storage = storage + rewards[j]
    return dela_rew


In [40]:
# Manual check of the two-column replay buffer: both columns receive the same
# list, so the two arrays of every sample must be identical.
buffer=replay_buffer(dimension=2)
# X is not used below.
X=[np.array([1,2]),np.array([3,2]),np.array([4,3]),([5,4]),5,6,7]
y = [1,2,3,4,5,6,7]
buffer.add(y,y)
a,b=buffer.sample(2)
print(a,b, buffer.sample(3))

[1 2] [1 2] [array([7, 5, 2]), array([7, 5, 2])]


In [62]:
class SumTree(object):
    """Binary sum tree in a flat array, used for priority-proportional sampling.

    ``tree`` has ``2*capacity-1`` nodes: the last ``capacity`` entries are the
    leaves (one priority per stored item) and every other node holds the sum of
    its two children, so ``tree[0]`` is the total priority.  ``data[k]``
    belongs to leaf ``k + capacity - 1``.
    """

    def __init__(self, capacity): #initalize tree and data with only zeros
        """Create an empty tree able to hold ``capacity`` items."""
        self.capacity=capacity
        self.tree=np.zeros(2*capacity-1)#actual tree
        self.data=np.zeros(capacity, dtype=object)#here the data is stored
        self.pointer=0          # next data slot to (over)write
        self.total_priority=0   #for normalization get priority of root

    def update(self, index, priority):
        """Set the priority of tree node ``index`` (a leaf) and fix all ancestors.

        Args:
            index: position in ``tree`` (not in ``data``).
            priority: new priority of that leaf.
        """
        old_priority=self.tree[index]
        self.tree[index]=priority

        #update upper parent nodes
        while index!=0: #as long as root is not reached
            index=(index-1)//2
            self.tree[index]= self.tree[index]+priority-old_priority

        # Keep the cached total in sync for direct update() calls as well;
        # previously only add() refreshed it.
        self.total_priority=self.tree[0]

    def add(self, priority, new_data):
        """Store ``new_data`` with ``priority``, overwriting the oldest slot when full."""
        self.data[self.pointer]=new_data
        self.update(self.pointer+self.capacity-1, priority)

        # advance the write position, wrapping around at capacity
        self.pointer+=1
        if self.pointer>=self.capacity:
            self.pointer=0

    def get_leave(self, value):
        """Find the leaf whose cumulative-priority interval contains ``value``.

        Args:
            value: number between 0 and ``total_priority``.

        Returns:
            (tree index of the leaf, its priority, its stored data).
        """
        parent=0

        while True:
            left_child=parent*2+1
            if left_child>=len(self.tree):
                # parent has no children, so it is a leaf
                break
            right_child=left_child+1

            if self.tree[left_child]>=value:
                parent=left_child
            else:
                # Skip the whole left subtree: subtract its priority SUM
                # (the previous code subtracted the node index instead).
                value-=self.tree[left_child]
                parent=right_child

        leave_index=parent
        return leave_index, self.tree[leave_index], self.data[leave_index-self.capacity+1]
            

            
class PER_buffer(object):
    """Prioritized experience replay buffer backed by a ``SumTree``."""

    def __init__(self, capacity, PER_a, PER_b, PER_e, anneal):
        """Create an empty buffer.

        Args:
            capacity: maximum number of stored experiences.
            PER_a: exponent applied to priorities (0 to 1; introduces randomness).
            PER_b: exponent of the importance-sampling weights; annealed towards 1.
            PER_e: constant added to |error| so a priority never becomes zero.
            anneal: amount added to ``PER_b`` after every ``sample`` call.
        """
        self.capacity=capacity
        self.tree=SumTree(capacity)
        self.default_max_p=1
        self.PER_a=PER_a
        self.PER_b=PER_b
        self.anneal=anneal
        self.PER_e=PER_e

    def add(self, *args):
        """Store ``args`` as one experience tuple with the current maximum priority."""
        experience = args
        #new experience gets maximum priority
        max_p=np.max(self.tree.tree[-self.tree.capacity:]) #search for maximal priority in leave nodes

        if max_p==0:                        #priority can't be zero because then experience would never be picked
            max_p=self.default_max_p

        self.tree.add(max_p, experience)

    def sample(self, k, return_indices=False):
        """Draw ``k`` experiences with probability proportional to their priority.

        The total priority is cut into ``k`` equal segments and one value is
        drawn uniformly from each.

        Args:
            k: number of experiences in the batch.
            return_indices: if True, also return the data indices of the drawn
                experiences, in the form expected by ``update``.

        Returns:
            (minibatch, weights), or (minibatch, weights, indices) when
            ``return_indices`` is True.  ``weights`` are the importance-sampling
            weights ``(k * P(i)) ** -PER_b`` divided by the largest possible
            such weight.

        Raises:
            ValueError: if nothing has been stored yet.
        """
        leaves = self.tree.tree[-self.tree.capacity:]
        # Read the root directly so the total is current even after update().
        total = self.tree.tree[0]
        if total <= 0:
            raise ValueError("cannot sample from an empty PER_buffer")

        priority_range = total/k
        minibatch=[]
        weights=[]
        indices=[]

        # The largest weight belongs to the smallest sampling PROBABILITY.
        # Unfilled leaves (priority 0) are ignored; the previous code used the
        # raw minimum leaf priority, which divides by zero until the buffer is full.
        min_prob = np.min(leaves[leaves > 0])/total
        max_weight = (k*min_prob)**(-self.PER_b)

        for i in range(k):
            lower_bound = priority_range*i
            upper_bound = lower_bound+priority_range

            #now get a random sample from that range
            value=np.random.uniform(lower_bound,upper_bound)
            leave_index, value_priority, value_data = self.tree.get_leave(value)

            prob_weight=value_priority/total
            weight=((k*prob_weight)**(-self.PER_b))/max_weight

            # Append the importance weight (the probability itself used to be
            # appended, leaving `weight` unused).
            weights.append(weight)
            minibatch.append(value_data)
            indices.append(leave_index-self.tree.capacity+1)

        self.PER_b=np.minimum(1., self.PER_b+self.anneal)
        if return_indices:
            return minibatch, weights, indices
        return minibatch, weights

    def update(self, idxs, errors):
        """Set new priorities from errors.

        It is important to use data idx here, not tree idx.

        Args:
            idxs: data indices (0 .. capacity-1); list or array.  Not modified.
            errors: one error per index; priority is
                ``min(|error| + PER_e, default_max_p) ** PER_a``.
        """
        priorities=np.abs(np.atleast_1d(errors))+self.PER_e
        priorities=np.minimum(priorities, self.default_max_p)
        pri_a=priorities**self.PER_a   #modified priority that is actually used
        # Build a NEW array of tree indices; the previous in-place `idxs +=`
        # altered the caller's array and failed for plain lists.
        tree_idxs=np.atleast_1d(idxs)+(self.capacity-1)
        for i, p in zip(tree_idxs, pri_a):
            self.tree.update(i, p)
            

In [65]:
# Manual check of PER_buffer: capacity 5, PER_a=0.5, PER_b=0.1, PER_e=0.1, anneal=0.1.
a=PER_buffer(5, 0.5, 0.1, 0.1, 0.1)
a.add(1,2,3)
a.add(4,5,6)
a.add(7,8,9)
a.add(10,11,12)
a.add(13,14,15)
# The buffer is full now: the next two adds wrap around and overwrite the
# two oldest entries.
a.add(0,0,0)
a.add(1,1,1)
# Assign new priorities to all five data slots from example errors.
idx=np.arange(5)
errors=np.array([0,0.1,0.2,0.3,0.99])
a.update(idx, errors)
# Show the stored experiences next to their leaf priorities.
print(a.tree.data, a.tree.tree[-len(a.tree.data):])

[(0, 0, 0) (1, 1, 1) (7, 8, 9) (10, 11, 12) (13, 14, 15)] [0.31622777 0.4472136  0.54772256 0.63245553 1.        ]


In [19]:
# Scratch check: np.minimum caps every element at the scalar, the form
# PER_buffer.update uses to cap priorities at default_max_p.
np.minimum([1,2,3,4],3)


array([1, 2, 3, 3])

In [18]:
# Scratch check: scalar form of np.minimum, as used in PER_buffer.sample to
# keep PER_b from exceeding 1.
np.minimum(1.,0.2)

0.2