In [1]:
import numpy as np
np.random.seed(1337)
from keras.layers import Dense, Input, Lambda, Conv2D, Flatten
from keras.models import Model, Sequential
from keras.optimizers import Adam, RMSprop
from keras import backend as K
import matplotlib.pyplot as plt


Using TensorFlow backend.
  return f(*args, **kwds)


In [9]:

class A2C:
    """Advantage actor-critic networks and their training functions (Keras).

    An actor (softmax policy over ``action_size`` actions) and a critic
    (one linear value output) are built on a shared trunk by `build_models`.
    Both are trained one transition at a time by `update`, using the one-step
    TD target ``reward + gamma * V(next_state)``.
    """

    def __init__(self, lr_a, lr_c, gamma, epsilon, obs_dims, action_size, batch_size=1):
        """Build the networks and their training functions.

        Args:
            lr_a: learning rate of the actor optimizer.
            lr_c: learning rate of the critic optimizer.
            gamma: discount factor applied to the next state's value.
            epsilon: stored on the instance; in this class it is only used to
                name the checkpoint directory.
            obs_dims: observation size (int) or observation shape (sequence).
                A 3-D shape selects the convolutional trunk; anything else is
                flattened and fed to a dense trunk.
            action_size: number of discrete actions.
            batch_size: stored on the instance; not read by any method here.
        """
        self.logging = False

        # hyperparameters
        self.lr_a = lr_a
        self.lr_c = lr_c
        self.epsilon = epsilon
        self.gamma = gamma
        self.batch_size = batch_size

        # dimensions
        self.obs_dims = obs_dims
        self.obs_size = int(np.prod(obs_dims))          # number of scalars in one observation
        self.input_shape = self._network_input_shape(obs_dims)
        self.action_size = action_size
        print("NETWORK INIT, obsdim, obsize, actsize: ", self.obs_dims, self.obs_size, self.action_size)
        self.value_size = 1                             # output width of the critic

        # models and the backend functions that train them
        self.actor, self.critic = self.build_models()
        self.actor_optimizer = self.build_actor_optimizer()
        self.critic_optimizer = self.build_critic_optimizer()

        # serialization; both flags are hard-coded off, flip them here to enable
        self.load_checkpoint = False
        self.save_checkpoint = False
        # NOTE(review): absolute, machine-specific path kept as-is so existing
        # checkpoints are still found; consider making it configurable.
        self.save_destination = "/Users/daddy/Desktop/projekt/" + str(self.epsilon) + "agent/"

        if self.load_checkpoint:
            self.actor.load_weights(self.save_destination + "weights_actor.h5")
            self.critic.load_weights(self.save_destination + "weights_critic.h5")

    @staticmethod
    def _network_input_shape(obs_dims):
        """Return the per-sample input shape the networks are built with.

        A 3-D ``obs_dims`` is kept as-is; a scalar or any other shape collapses
        to a flat ``(size,)`` vector.
        """
        dims = tuple(int(d) for d in np.atleast_1d(obs_dims))
        if len(dims) == 3:
            return dims
        return (int(np.prod(dims)),)

    def _prepare_state(self, state):
        """Reshape one observation into a batch of size one for the networks.

        Raises:
            ValueError: (from ``np.reshape``) if ``state`` does not hold
                exactly as many elements as ``self.input_shape`` requires.
        """
        return np.reshape(state, (1,) + tuple(self.input_shape))

    def build_actor(self):
        """Build a stand-alone dense actor (uncompiled).

        Not called by `__init__`; `build_models` creates the networks that are
        actually trained. Input is a flat vector of ``obs_size`` values and the
        output is a softmax over ``action_size`` actions.
        """
        actor = Sequential()
        actor.add(Dense(24, input_dim=self.obs_size, activation='relu',
                        kernel_initializer='he_uniform'))
        actor.add(Dense(self.action_size, activation='softmax',
                        kernel_initializer='he_uniform'))
        actor.summary()
        return actor

    def build_critic(self):
        """Build a stand-alone dense critic compiled with MSE loss and Adam.

        Not called by `__init__`; `build_models` creates the networks that are
        actually trained. Input is a flat vector of ``obs_size`` values and the
        output has ``value_size`` linear units.
        """
        critic = Sequential()
        critic.add(Dense(24, input_dim=self.obs_size, activation='relu',
                         kernel_initializer='he_uniform'))
        critic.add(Dense(self.value_size, activation='linear',
                         kernel_initializer='he_uniform'))
        critic.summary()
        critic.compile(loss="mse", optimizer=Adam(lr=self.lr_c))
        return critic

    def build_models(self):
        """Build the actor and critic models on one shared trunk.

        With a 3-D input shape the trunk is two convolutions followed by a
        flatten; otherwise the flat observation feeds the dense layer directly.
        The models are left uncompiled because training goes through the
        backend functions from `build_actor_optimizer` and
        `build_critic_optimizer`.

        Returns:
            (actor, critic): two Keras models sharing the same input and trunk.
        """
        observation = Input(shape=self.input_shape)

        if len(self.input_shape) == 3:
            conv1 = Conv2D(16, (8, 8), strides=(4, 4), activation='relu')(observation)
            conv2 = Conv2D(32, (4, 4), strides=(2, 2), activation='relu')(conv1)
            features = Flatten()(conv2)
        else:
            features = observation

        fc1 = Dense(256, activation='relu')(features)
        policy = Dense(self.action_size, activation='softmax')(fc1)
        value = Dense(self.value_size, activation='linear')(fc1)

        actor = Model(inputs=observation, outputs=policy)
        critic = Model(inputs=observation, outputs=value)

        actor.summary()
        critic.summary()

        return actor, critic

    def build_actor_optimizer(self):
        """Create the backend function performing one actor update.

        The returned function takes ``[states, one_hot_actions, advantages]``
        and returns ``[loss]``, where the loss is
        ``-sum(log pi(a|s) * advantage) + 0.01 * sum(pi * log pi)``.
        """
        action = K.placeholder(shape=[None, self.action_size])
        advantages = K.placeholder(shape=[None, ])

        policy = self.actor.output

        # probability the policy assigned to the action that was taken
        good_prob = K.sum(action * policy, axis=1)
        eligibility = K.log(good_prob + 1e-10) * advantages
        actor_loss = -K.sum(eligibility)

        # sum(p * log p) is the negative entropy, so adding it to the loss
        # pushes the policy towards higher entropy
        neg_entropy = K.sum(K.sum(policy * K.log(policy + 1e-10), axis=1))

        loss = actor_loss + 0.01 * neg_entropy
        optimizer = RMSprop(lr=self.lr_a, rho=0.99, epsilon=0.01)
        updates = optimizer.get_updates(self.actor.trainable_weights, [], loss)
        train = K.function([self.actor.input, action, advantages], [loss], updates=updates)

        return train

    def build_critic_optimizer(self):
        """Create the backend function performing one critic update.

        The returned function takes ``[states, targets]`` and minimises the
        mean squared error between the targets and the critic output.
        """
        target = K.placeholder(shape=(None, self.value_size))

        value = self.critic.output
        loss_c = K.mean(K.square(target - value))  # mse

        optimizer = Adam(lr=self.lr_c)
        updates = optimizer.get_updates(self.critic.trainable_weights, [], loss_c)
        train = K.function([self.critic.input, target], [], updates=updates)

        return train

    def update(self, state, action, reward, next_state, done):
        """Train critic and actor on a single transition.

        Args:
            state: observation before the action.
            action: index of the action that was taken.
            reward: reward received for the transition.
            next_state: observation after the action.
            done: True if the transition ended the episode; the next state's
                value is then left out of the target.
        """
        state = self._prepare_state(state)
        value = float(np.ravel(self.critic.predict(state))[0])

        if done:
            next_value = 0.0
            td_target = float(reward)
        else:
            next_state = self._prepare_state(next_state)
            next_value = float(np.ravel(self.critic.predict(next_state))[0])
            td_target = reward + self.gamma * next_value    # target = Rt + gamma*V(St+1)

        # arrays shaped to match the placeholders of the two train functions
        target = np.full((1, self.value_size), td_target, dtype=np.float32)
        advantages = np.array([td_target - value], dtype=np.float32)
        action_onehot = np.zeros((1, self.action_size), dtype=np.float32)
        action_onehot[0, action] = 1.0

        if self.logging:
            print("state ", state)
            print("values: ", value, next_value)
            print("adv: ", advantages)
            print("target :", target)
            print("---------------------")

        self.critic_optimizer([state, target])
        self.actor_optimizer([state, action_onehot, advantages])

    def get_action(self, state):
        """Sample an action index from the actor's policy for ``state``."""
        policy = self.actor.predict(self._prepare_state(state)).flatten()
        # renormalise in float64 so rounding in the network's softmax output
        # cannot trip np.random.choice's "probabilities do not sum to 1" check
        policy = policy.astype(np.float64)
        policy = policy / policy.sum()
        action = np.random.choice(self.action_size, 1, p=policy)[0]
        if self.logging:
            print("policy:", policy, " --> action:", action)
        return action

    def save_weights(self):
        """Write actor and critic weights if ``save_checkpoint`` is enabled."""
        if self.save_checkpoint:
            self.actor.save_weights(self.save_destination + "weights_actor.h5")
            self.critic.save_weights(self.save_destination + "weights_critic.h5")
        

In [10]:
import gym

class Agent:
    """Couples a Gym environment with an `A2C` learner and runs training episodes."""

    def __init__(self, env_name , lr_a, lr_c, gamma, epsilon):
        """Create the environment and the networks that act in it.

        Args:
            env_name: id passed to ``gym.make``.
            lr_a: actor learning rate, forwarded to `A2C`.
            lr_c: critic learning rate, forwarded to `A2C`.
            gamma: discount factor, forwarded to `A2C`.
            epsilon: forwarded to `A2C`; not otherwise used by this class.
        """
        self.env = gym.make(env_name)
        self.scores = []                      # total reward of every finished episode

        self.action_size = self.env.action_space.n
        # Pass the full observation shape: taking only shape[0] understates the
        # size of multi-dimensional (e.g. image) observations, while np.prod
        # over the whole shape yields the true element count.
        self.obs_dims = self.env.observation_space.shape
        self.render = False

        self.lr_a = lr_a
        self.lr_c = lr_c
        self.gamma = gamma
        self.epsilon = epsilon

        self.networks = A2C(self.lr_a, self.lr_c, self.gamma, self.epsilon, self.obs_dims, self.action_size)

    def act(self, state):
        """Return the action chosen by the networks for ``state``."""
        return self.networks.get_action(state)

    def train(self, EPISODES, plot=False, save_every=500):
        """Run ``EPISODES`` episodes, updating the networks after every step.

        Args:
            EPISODES: number of episodes to play.
            plot: accepted for compatibility; not used by this method.
            save_every: ask the networks to save their weights after every
                ``save_every`` completed episodes (falsy disables the periodic
                save). Weights are always saved once more when training ends.
        """
        for e in range(EPISODES):

            done = False
            score = 0
            state = self.env.reset()

            while not done:

                if self.render:
                    self.env.render()

                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action)
                self.networks.update(state, action, reward, next_state, done)

                score += reward
                state = next_state

            # the episode is over once the step loop exits
            self.scores.append(score)
            print("episode:", e, "  score:", score)

            # periodic checkpoint: once per interval, not once per step
            if save_every and (e + 1) % save_every == 0:
                self.networks.save_weights()

        self.networks.save_weights()



In [11]:
# Instantiate the agent for the Breakout environment; arguments are spelled
# out by keyword so each hyperparameter is identifiable at the call site.
a = Agent(
    env_name="Breakout-v3",
    lr_a=0.01,
    lr_c=0.01,
    gamma=0.99,
    epsilon=1,
)

[2018-05-04 17:11:29,380] Making new env: Breakout-v3


DependencyNotInstalled: No module named 'atari_py'. (HINT: you can install Atari dependencies by running 'pip install gym[atari]'.)

In [5]:
import matplotlib.pyplot as plt

def plot_MA(scores, ma=10):
    """Plot a trailing moving average of ``scores`` plus per-window maxima.

    Args:
        scores: sequence of per-episode scores, in episode order.
        ma: window length (positive integer) used both for the moving average
            and for the blocks whose maximum is scattered on top.

    Raises:
        ValueError: if ``ma`` is smaller than 1.

    The curve at episode ``k`` is the mean of the last ``ma`` scores up to and
    including episode ``k`` (fewer while less than ``ma`` episodes exist).
    Each scatter point is the maximum of one complete block of ``ma``
    consecutive episodes, drawn at the block's last episode; a trailing
    incomplete block is not drawn.
    """
    if ma < 1:
        raise ValueError("ma must be a positive integer")

    scores = list(scores)
    episodes = list(range(1, len(scores) + 1))      # 1-based episode numbers

    # trailing window that ends at, and includes, the current episode
    moving_avg = [
        np.mean(scores[max(0, i - ma + 1):i + 1])
        for i in range(len(scores))
    ]

    # maxima over aligned, non-overlapping blocks of `ma` episodes
    max_x, maxes = [], []
    for start in range(0, len(scores) - ma + 1, ma):
        max_x.append(start + ma)
        maxes.append(max(scores[start:start + ma]))

    plt.plot(episodes, moving_avg)
    plt.scatter(max_x, maxes)
    plt.xlabel("episode")
    plt.ylabel("score")



In [6]:
# Render any matplotlib figures drawn so far.
# NOTE(review): no visible cell calls plot_MA before this point, so there may
# be nothing to display — confirm the intended call (and its scores argument).
plt.show()