In [None]:
from random import random, randint
import numpy as np
from classes import Bandit

import torch
from torch import nn, optim
from torch.nn import functional
from torch.autograd import Variable
from math import log

class a2c_network(nn.Module):
    """Actor-critic network made of two independent fully connected stacks.

    The actor stack maps a state to a softmax distribution over ``N_arms``
    actions; the critic stack maps the same state to a single scalar value.
    Hidden layers of both stacks use ReLU activations.

    Args:
        N_arms: number of output units of the actor head.
        state_dim: number of input features of both stacks.
        actor_layers: non-empty sequence of hidden layer widths for the actor.
        critic_layers: non-empty sequence of hidden layer widths for the critic.
        learning_rate: accepted for interface compatibility; not used here.
    """

    def __init__(self, N_arms, state_dim, actor_layers, critic_layers, learning_rate ):
        super().__init__()

        # Layers are kept in plain Python lists for indexed access and are
        # additionally attached as attributes ("actor0", "actor1", ...) so
        # that nn.Module registers their parameters.
        self.actor = self._build_stack(state_dim, actor_layers, N_arms)
        self._attach_layers("actor", self.actor)

        self.critic = self._build_stack(state_dim, critic_layers, 1)
        self._attach_layers("critic", self.critic)

    @staticmethod
    def _build_stack(in_features, hidden_sizes, out_features):
        """Return the list of Linear layers in_features -> hidden... -> out_features."""
        # hidden_sizes[0] is indexed explicitly: at least one hidden layer is required.
        widths = [in_features, hidden_sizes[0], *hidden_sizes[1:], out_features]
        return [nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])]

    def _attach_layers(self, prefix, layers):
        """Register every layer on the module under the name prefix + index."""
        for index, layer in enumerate(layers):
            setattr(self, prefix + str(index), layer)

    def forward(self, state):
        """Return ``(value, policy)`` for a 2-D batch of states.

        ``policy`` is a softmax over dim 1; ``value`` is the raw output of the
        last critic layer.
        """
        *actor_hidden, actor_head = self.actor
        policy = state
        for layer in actor_hidden:
            policy = functional.relu(layer(policy))
        policy = functional.softmax(actor_head(policy), dim=1)

        *critic_hidden, critic_head = self.critic
        value = state
        for layer in critic_hidden:
            value = functional.relu(layer(value))
        value = critic_head(value)

        return value, policy

def init_a2c(B, kwargs):
    """Initialise agent ``B`` for the PyTorch A2C implementation.

    Resets the module-level ``env`` and stores its first observation on
    ``B.state`` as a (1, n_features) float tensor, copies the hyper-parameters
    'N_arms', 'lr', 'discount' and 'maxlen' from ``kwargs`` onto ``B``, clears
    the rollout buffers and builds the network together with its Adam optimiser.

    Args:
        B: agent object that receives all attributes.
        kwargs: dict with keys 'N_arms', 'lr', 'discount', 'maxlen',
            'actor_layers' and 'critic_layers'.
    """
    first_obs = env.reset()
    B.state = torch.FloatTensor( first_obs.reshape(1, first_obs.shape[0]) )
    B.done = False

    # Hyper-parameters are mirrored onto the agent under the same names.
    for key in ('N_arms', 'lr', 'discount', 'maxlen'):
        setattr(B, key, kwargs[key])

    # Per-rollout buffers, emptied again after every update.
    B.Q, B.logP, B.history, B.values, B.policies = [], [], [], [], []
    B.entropy = 0

    B.model = a2c_network(
        B.N_arms,
        B.state.shape[1],
        kwargs['actor_layers'],
        kwargs['critic_layers'],
        kwargs['lr'],
    )
    B.optim = optim.Adam( B.model.parameters(), lr=B.lr )
    

def a2c(B):
    """Sample an arm from the current policy of ``B.model`` at ``B.state``.

    Side effects: stores the differentiable policy tensor on ``B.pdist`` and
    appends the detached scalar value and the detached probability vector to
    ``B.values`` and ``B.policies``.

    Returns:
        Index of the sampled arm, drawn with ``np.random.choice``.
    """
    state_value, action_probs = B.model.forward( B.state )
    B.pdist = action_probs
    B.values.append( state_value.detach().numpy()[0][0] )

    probs = np.squeeze( action_probs.detach().numpy() )
    B.policies.append( probs )
    return np.random.choice( B.N_arms, p=probs )


def update_a2c(B, arm, reward, observation):
    """Record one transition and, at the end of a rollout, run one A2C update.

    Every call appends ``(state, arm, reward, next_state)`` to ``B.history``,
    advances ``B.state`` and accumulates the log-probability of the chosen arm
    and the policy entropy (both as differentiable tensors taken from
    ``B.pdist``, which must hold the policy used to choose ``arm``).

    When ``B.done`` is true or ``B.maxlen`` transitions have been collected,
    the discounted returns are computed, a single optimiser step is taken on
    ``actor_loss + critic_loss - 0.001 * entropy`` and all buffers are cleared.

    Args:
        B: agent object carrying model, optimiser, buffers and hyper-parameters.
        arm: index of the action that was taken.
        reward: scalar reward received for that action.
        observation: 1-D numpy array with the next observation.
    """
    observation = torch.FloatTensor( observation.reshape(1, observation.shape[0]) )
    B.history.append( (B.state, arm, reward, observation) )
    B.state = observation

    probs = B.pdist.squeeze(0)
    B.logP.append( torch.log(probs[arm]) )
    # Entropy H = -sum(p * log p), kept as a tensor so that the entropy bonus
    # actually contributes a gradient. The epsilon guards against log(0).
    B.entropy = B.entropy - torch.sum( probs * torch.log(probs + 1e-10) )

    if not (B.done or len(B.history) == B.maxlen):
        return

    # Value used to continue the return past the last stored transition:
    # zero after a terminal step, otherwise the critic's estimate of the
    # state the rollout was cut off in.
    if B.done:
        bootstrap = 0.0
    else:
        with torch.no_grad():
            bootstrap = B.model.forward( observation )[0].item()

    # Q[t] = r[t] + discount * Q[t+1] for every step, including the first
    # and the last one.
    returns = []
    running = bootstrap
    for _, _, step_reward, _ in reversed(B.history):
        running = float(step_reward) + B.discount * running
        returns.append(running)
    returns.reverse()
    B.Q = returns

    # Re-evaluate the critic on the visited states so the value estimates
    # carry gradients; the parameters have not changed since the rollout
    # started, so these equal the values seen while acting.
    states = torch.cat( [step[0] for step in B.history], dim=0 )
    V = B.model.forward( states )[0].squeeze(1)
    Q = torch.tensor( B.Q, dtype=torch.float32 )
    logP = torch.stack( B.logP )

    A = Q - V
    # The advantage is a constant weight for the policy gradient; only the
    # critic loss may push gradients through V.
    a_loss = ( -logP * A.detach() ).mean()
    c_loss = 0.5 * A.pow(2).mean()
    loss = a_loss + c_loss - 0.001 * B.entropy

    B.optim.zero_grad()
    loss.backward()
    B.optim.step()

    # Start the next rollout with empty buffers.
    B.Q = []
    B.logP = []
    B.history = []
    B.values = []
    B.policies = []
    B.entropy = 0

    return


In [None]:
import gym

#env = gym.make("CartPole-v0")
env = gym.make('LunarLander-v2')
env.reset()

kwargs = { "N_arms": env.action_space.n, "discount": 0.99, "lr": 0.0003, "actor_layers": [256], "critic_layers": [256], "maxlen": 10000}

B = Bandit(env, a2c, update_a2c, init_a2c, kwargs )


rewards = []        # total return of every finished episode
all_weights = []
counter = 0         # number of episodes started so far
best_return = -200

interval = 25       # episodes between two progress reports

for i in range(2500):
    counter += 1

    # Start every episode from the observation returned by reset(), stored in
    # the same (1, n_features) float-tensor layout the agent keeps in B.state.
    # Without this the first action of an episode would be chosen from the
    # terminal observation of the previous one.
    observation = env.reset()
    B.state = torch.FloatTensor( observation.reshape(1, observation.shape[0]) )
    B.done = False

    total_return = 0
    done = False
    while not done:
        #env.render()
        arm = B.policy(B)
        observation, reward, done, info = env.step(arm)
        # The agent reads B.done to decide when to run its update.
        B.done = done
        B.update(B, arm, reward, observation )
        total_return += reward

    rewards.append( total_return )
    best_return = max(best_return, total_return)

    # Report after the episode has been recorded so the mean really covers
    # the last `interval` episodes.
    if counter%interval == 0:
        print( counter, "\t", sum(rewards[-interval:])/float(interval) )


env.close()
print( "Done after ", counter, " episodes" )
print(best_return)

In [None]:
import matplotlib.pyplot as plt

# Plot the raw episode returns together with a moving average over 2*n episodes.
N, n = len(rewards), 5
rewards2 = [ sum(rewards[i-n:i+n])/float(2*n) for i in range(n,N-n) ]

fig, ax = plt.subplots( figsize=(10,5) )
ax.plot( rewards, label='episode return' )
# rewards2[k] averages the window centred on episode n+k, so it is drawn at
# x = n+k; plotting it without x values would shift the curve n episodes left.
ax.plot( range(n, N-n), rewards2, c='red', label='moving average (%d episodes)' % (2*n) )
fig.patch.set_facecolor('#212121')
ax.tick_params(colors='white')
ax.set_xlabel('Episode', color='white')
ax.set_ylabel('Total return', color='white')
ax.set_ylim(-500,300)
ax.legend()
plt.show()

env.close()

In [None]:
from gym import wrappers

# Record one episode of the current agent to an mp4 file.
# NOTE(review): the output path is an absolute, machine-specific location;
# consider moving it to a configurable constant.
recorder = wrappers.monitoring.video_recorder.VideoRecorder(env, 'C:/Users/Zeyad/Desktop/Reinforcement Learning/videos/lunar_lander_a2c.mp4')

for i in range(1):
    #B.eps = 0.0
    # Begin from a fresh episode and hand its first observation to the agent
    # in the (1, n_features) float-tensor layout it keeps in B.state.
    observation = env.reset()
    B.state = torch.FloatTensor( observation.reshape(1, observation.shape[0]) )
    B.done = False

    done = False
    total_return = 0
    try:
        while not done:
            env.render()
            arm = B.policy(B)
            observation, reward, done, info = env.step(arm)
            # Keep the agent's terminal flag in sync; a stale True left over
            # from training would trigger an update after every single step.
            B.done = done
            B.update(B, arm, reward, observation)
            total_return += reward
            recorder.capture_frame()
    finally:
        # Finalise the video even if the episode is interrupted.
        recorder.close()
        recorder.enabled = False

    rewards.append( total_return )
    best_return = max(best_return, total_return)
env.close()

print( rewards[-10:] )

In [None]:
# Close the environment and the video recorder.
env.close()
recorder.close()

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Input, Flatten, add, subtract, multiply
from keras.optimizers import Adam, RMSprop
from keras.models import Model

import tensorflow.keras.backend as K
import tensorflow as tf

from random import sample
from collections import deque
from math import log

# Great resource for a2c:
#     https://towardsdatascience.com/understanding-actor-critic-methods-931b97b6df3f

# Why DQN uses a replay buffer and A2C doesn't:
#     https://www.reddit.com/r/reinforcementlearning/comments/br9hc3/can_i_use_a_replay_buffer_in_a2ca3c_why_not/

def a2c_loss(B):
    """Return a Keras-style ``loss(y_true, y_pred)`` closure bound to ``B``.

    The closure ignores ``y_true``, queries the variable shape of ``y_pred``
    (the result is discarded) and returns ``B.loss`` wrapped in a backend
    variable.
    """
    def loss_function(y_true,y_pred):
        K.get_variable_shape(y_pred)
        wrapped_loss = K.variable(B.loss)
        return wrapped_loss

    return loss_function

def a2c_actor_loss(B):
    """Return a Keras-style ``loss(y_true, y_pred)`` closure for the actor.

    The closure ignores both arguments and works with numpy on the buffers
    stored on ``B``:

    * if ``B.advantage`` is empty it returns a (1, 1) float32 tensor of ones;
    * otherwise it sets ``B.advantage = B.Q - B.values`` and
      ``B.actor_loss = mean(-logP * advantage)`` and returns the latter as a
      tensor.
    """
    def loss_function(y_true,y_pred):
        print( "entered loss function")

        if len(B.advantage) != 0:
            B.advantage = B.Q - np.asarray(B.values)
            stacked_logP = np.stack( B.logP )
            B.actor_loss = np.mean( -stacked_logP * B.advantage )
            print( "Returning summat else:", B.actor_loss.shape )
            return tf.convert_to_tensor( B.actor_loss )

        # Nothing collected yet: hand back a constant placeholder value.
        print( "Returning 0 ")
        return tf.convert_to_tensor( np.ones( (1,1), dtype = np.float32 ) )

    return loss_function

def a2c_critic_loss(B):
    """Return a Keras-style ``loss(y_true, y_pred)`` closure for the critic.

    The closure ignores both arguments and returns ``B.critic_loss`` wrapped
    in a backend variable.
    """
    def loss_function(y_true,y_pred):
        wrapped_loss = K.variable(B.critic_loss)
        return wrapped_loss

    return loss_function

def init_a2c(B, kwargs):
    """Initialise agent ``B`` for the Keras A2C implementation.

    Resets the module-level ``env`` and stores its first observation on
    ``B.context`` as a (1, n_features) array, clears the rollout buffers and
    builds an actor and a critic model on top of a shared two-layer trunk.
    Two backend functions are created and stored on ``B``:

    * ``B.actor_opt([states, one_hot_actions, advantages])``
    * ``B.critic_opt([states, discounted_returns])``

    each of which applies one RMSprop update to the respective model.

    Args:
        B: agent object that receives all attributes.
        kwargs: dict with keys 'N_arms', 'discount', 'maxlen' and 'lr'.
    """
    B.N_arms = kwargs['N_arms']
    B.context = env.reset()
    B.context = B.context.reshape(1,B.context.shape[0])
    B.done = False

    # Per-rollout buffers.
    B.Q = []
    B.values = []
    B.actions = []
    B.policies = []
    B.advantage = []
    B.discount = kwargs['discount']
    B.maxlen = kwargs['maxlen']

    # Shared trunk feeding both heads.
    inputs = Input( [ env.observation_space.shape[0] ] )
    shared = Dense(64, activation='relu')(inputs)
    shared = Dense(128, activation='relu')(shared)
    shared = Model(inputs, shared)

    actor_hidden = Dense(128, activation='relu')(shared.output)
    actor_out = Dense(kwargs['N_arms'], activation='softmax')(actor_hidden)
    B.actor = Model(shared.input, actor_out)

    critic_hidden = Dense(128, activation='relu')(shared.output)
    # The value head must be linear: a softmax over a single unit always
    # outputs exactly 1.0, so the critic could never fit the returns.
    critic_out = Dense(1, activation='linear')(critic_hidden)
    B.critic = Model(shared.input, critic_out)

    B.optimizer =  RMSprop(lr=kwargs['lr'])

    # Placeholders fed by the two training functions below.
    B.action_pl = K.placeholder(shape=(None, B.N_arms))
    B.advantages_pl = K.placeholder(shape=(None,))
    B.discounted_r = K.placeholder(shape=(None,))

    # Probability of the action selected by the (one-hot) action placeholder.
    B.weighted_actions = K.sum(B.action_pl * B.actor.output, axis=1)
    B.eligibility = K.log(B.weighted_actions + 1e-10) * K.stop_gradient(B.advantages_pl)
    # sum(p * log p) is the negative entropy, so adding it to the loss
    # rewards higher-entropy policies.
    neg_entropy = K.sum(B.actor.output * K.log(B.actor.output + 1e-10), axis=1)
    B.loss = 0.001 * neg_entropy - K.sum(B.eligibility)

    updates = B.optimizer.get_updates(B.actor.trainable_weights, [], B.loss)
    B.actor_opt = K.function([B.actor.input, B.action_pl, B.advantages_pl], B.actor.output, updates=updates)

    B.critic_loss = K.mean( K.square( B.discounted_r - B.critic.output ) )
    updates = B.optimizer.get_updates(B.critic.trainable_weights, [], B.critic_loss)
    B.critic_opt = K.function([B.critic.input, B.discounted_r], B.critic.output, updates=updates)

    # NOTE(review): this initialises variables in a newly created session that
    # is not stored anywhere; whether the Keras backend uses the same session
    # cannot be told from here - confirm this is needed.
    init = tf.initialize_all_variables()
    sess = tf.Session()
    sess.run(init)

    B.history = []
    B.logP = []
    B.entropy = 0
    return


def a2c(B):
    """Sample an arm from the actor's policy at ``B.context``.

    Side effects: appends the critic's scalar prediction to ``B.values`` and
    the actor's probability vector to ``B.policies``.

    Returns:
        Index of the sampled arm, drawn with ``np.random.choice``.
    """
    B.values.append( B.critic.predict( B.context )[0][0] )

    action_probs = B.actor.predict( B.context )[0]
    B.policies.append( action_probs )

    return np.random.choice( B.N_arms, p=action_probs )


def update_a2c(B, arm, reward, observation):
    """Record one transition and, at the end of a rollout, train actor and critic.

    Every call appends ``(context, arm, reward, next_observation)`` to
    ``B.history`` and the negative log-probability of the chosen arm to
    ``B.logP``, then advances ``B.context``.

    When ``B.done`` is true or ``B.maxlen`` transitions have been collected,
    the discounted returns of the rollout are computed and passed, as numpy
    arrays, to the two training functions stored on ``B``:

    * ``B.actor_opt([states, one_hot_actions, advantages])``
    * ``B.critic_opt([states, returns])``

    where ``advantages = returns - B.values``. Afterwards the rollout buffers
    are cleared. The placeholders stored on ``B`` are left untouched.

    Args:
        B: agent object carrying models, training functions and buffers.
        arm: index of the action that was taken.
        reward: scalar reward received for that action.
        observation: 1-D numpy array with the next observation.
    """
    observation = observation.reshape(1, observation.shape[0])
    B.history.append( (B.context, arm, reward, observation) )

    B.logP.append( -log( B.policies[-1][arm] + 1e-10 ) )

    if B.done or len(B.history) == B.maxlen:
        states, actions, step_rewards, _ = zip(*B.history)
        n_steps = len(B.history)

        # Discounted return of every step: R[t] = r[t] + discount * R[t+1].
        returns = np.zeros(n_steps, dtype=np.float32)
        running = 0.0
        for t in reversed(range(n_steps)):
            running = step_rewards[t] + B.discount * running
            returns[t] = running
        B.R = returns

        # One row per step; reshape instead of squeeze so that a rollout of
        # length one keeps its batch dimension.
        states = np.asarray(states, dtype=np.float32).reshape(n_steps, -1)

        # The actor function expects one-hot actions of width N_arms.
        one_hot_actions = np.zeros((n_steps, B.N_arms), dtype=np.float32)
        one_hot_actions[np.arange(n_steps), np.asarray(actions, dtype=np.int64)] = 1.0

        # Advantage = observed return minus the critic's estimate made while acting.
        advantages = returns - np.asarray(B.values, dtype=np.float32)

        B.actor_opt( [states, one_hot_actions, advantages] )
        B.critic_opt( [states, returns] )

        # Start the next rollout with empty buffers.
        B.history = []
        B.logP = []
        B.values = []
        B.actions = []
        B.policies = []

    B.context = observation
    return


In [None]:
import gym

env = gym.make("CartPole-v0")
env.reset()

kwargs = { "N_arms": env.action_space.n, "discount": 0.99, "lr": 0.001, "layer_size": [256], "maxlen": 20000}

B = Bandit(env, a2c, update_a2c, init_a2c, kwargs )


rewards = []        # total (clipped) return of every finished episode
all_weights = []
counter = 0         # number of episodes started so far
best_return = -200

interval = 1        # episodes between two progress reports

for i in range(10000):
    counter += 1

    # Start every episode from the observation returned by reset(), stored in
    # the same (1, n_features) layout the agent keeps in B.context. Without
    # this the first action of an episode would be chosen from the terminal
    # observation of the previous one.
    observation = env.reset()
    B.context = observation.reshape(1, observation.shape[0])
    B.done = False

    total_return = 0
    done = False
    while not done:
        #env.render()
        arm = B.policy(B)
        observation, reward, done, info = env.step(arm)
        # Clip the reward to its sign.
        reward = 1 if reward > 0 else -1 if reward < 0 else 0
        # The agent reads B.done to decide when to run its update.
        B.done = done
        B.update(B, arm, reward, observation )
        total_return += reward

    rewards.append( total_return )
    best_return = max(best_return, total_return)

    # Report after the episode has been recorded so the mean really covers
    # the last `interval` episodes.
    if counter%interval == 0:
        print( counter, "\t", sum(rewards[-interval:])/float(interval) )

env.close()
print( "Done after ", counter, " episodes" )
print(best_return)

In [None]:
# Scratch cell: split the transitions currently buffered in B.history into
# per-field tuples. NOTE: this rebinds the notebook-level name `rewards`, and
# the unpacking raises ValueError when B.history is empty.
states, actions, rewards, next_states = zip(*B.history)

In [None]:
# Scratch cell: display the actor training function. The original line ended
# in a dangling '.', which is a SyntaxError.
B.actor_opt

In [None]:
# Scratch cell: displays the torch.nn.Module class object.
torch.nn.Module

In [None]:
# Scratch cell: evaluates the float literal 3e-4 (i.e. 0.0003).
3e-4