In [1]:
import tensorflow as tf

# Ask TensorFlow to enable memory growth on the first visible GPU (if any).
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print('Physical Devices: {}'.format(physical_devices))
try:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    print('GPU memory limitated successfuly!')
except (IndexError, ValueError, RuntimeError) as err:
    # IndexError: the device list is empty (no GPU detected).
    # ValueError / RuntimeError: TensorFlow rejected the request, e.g. because
    # the device is invalid or the runtime was already initialized.
    # Anything else (KeyboardInterrupt, programming errors) now propagates
    # instead of being hidden by a bare `except:`.
    print('Warning! GPU memory could not be limitated!')
    print('Reason: {!r}'.format(err))

Physical Devices: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU memory limitated successfuly!


In [2]:
from keras import backend as K
from keras.layers import Dense, Input
from keras.models import Model
from keras.optimizers import Adam
import numpy as np


class Agent(object):
    """One-step actor-critic agent whose actor and critic share hidden layers.

    Three Keras models are built on top of the same two dense hidden layers:

    * ``actor``  -- trainable model ending in a softmax over the actions,
    * ``critic`` -- trainable model ending in a single linear value output,
    * ``policy`` -- the same softmax head as ``actor``, used only for
      prediction when picking an action.

    Args:
        alpha: learning rate passed to the actor's Adam optimizer.
        beta: learning rate passed to the critic's Adam optimizer.
        gamma: discount factor applied to the next state's value estimate.
        n_actions: number of discrete actions (width of the softmax output).
        layer1_size: number of units in the first shared hidden layer.
        layer2_size: number of units in the second shared hidden layer.
        input_dims: length of a single observation vector.
    """

    def __init__(self, alpha, beta, gamma=0.99, n_actions=4,
                 layer1_size=1024, layer2_size=512, input_dims=8):
        self.gamma = gamma
        self.alpha = alpha
        self.beta = beta
        self.input_dims = input_dims
        # NOTE: the "leyer" spelling is kept on purpose so that any code that
        # already reads these attributes keeps working.
        self.leyer1_size = layer1_size
        self.leyer2_size = layer2_size
        self.n_actions = n_actions

        self.actor, self.critic, self.policy = self.build_actor_critic_network()
        self.action_space = [i for i in range(self.n_actions)]

    def build_actor_critic_network(self):
        """Build the actor, critic and policy models over shared layers.

        Returns:
            A tuple ``(actor, critic, policy)``. ``actor`` and ``critic`` are
            compiled with Adam (learning rates ``alpha`` and ``beta``);
            ``policy`` is left uncompiled because it is only used to predict.
        """
        inputs = Input(shape=(self.input_dims,))
        # `delta` is not connected to any layer. It is kept as a second actor
        # input only so the actor's input signature stays the same; the TD
        # error actually reaches the loss through `sample_weight` in `learn`.
        delta = Input(shape=[1])
        dense1 = Dense(self.leyer1_size, activation='relu')(inputs)
        dense2 = Dense(self.leyer2_size, activation='relu')(dense1)
        probs = Dense(self.n_actions, activation='softmax')(dense2)
        values = Dense(1, activation='linear')(dense2)

        # With a one-hot target, categorical cross-entropy equals
        # -log pi(a|s); `learn` weights it by the TD error.
        actor = Model(inputs=[inputs, delta], outputs=[probs])
        actor.compile(optimizer=Adam(lr=self.alpha), loss='categorical_crossentropy')

        critic = Model(inputs=[inputs], outputs=[values])
        critic.compile(optimizer=Adam(lr=self.beta), loss='mean_squared_error')

        policy = Model(inputs=[inputs], outputs=[probs])

        return actor, critic, policy

    def choose_action(self, observation):
        """Sample an action from the policy's predicted distribution.

        Args:
            observation: a single observation; a leading batch axis is added
                before it is passed to the policy model.

        Returns:
            One element of ``self.action_space`` drawn with the predicted
            probabilities.
        """
        state = observation[np.newaxis, :]
        probabilities = self.policy.predict(state)[0]
        action = np.random.choice(self.action_space, p=probabilities)

        return action

    def learn(self, state, action, reward, new_state, done):
        """Run one actor update and one critic update from a single transition.

        Args:
            state: observation before the action was taken.
            action: index of the action that was taken.
            reward: reward received for the transition.
            new_state: observation after the action was taken.
            done: truthy when the episode ended with this transition; the
                bootstrap term is then dropped from the target.
        """
        state = state[np.newaxis, :]
        new_state = new_state[np.newaxis, :]

        critic_value = self.critic.predict(state)
        new_critic_value = self.critic.predict(new_state)

        # TD target; `(1 - int(done))` zeroes the bootstrap on terminal steps.
        target = reward + self.gamma * new_critic_value * (1 - int(done))

        # TD error, used as the advantage estimate for the actor.
        delta = target - critic_value

        # One-hot encoding of the action that was actually taken.
        actions = np.zeros([1, self.n_actions])
        actions[np.arange(1), action] = 1.0

        # Previously `delta` was fed only as an unused model input, so the
        # actor loss ignored it entirely. Passing it as the per-sample weight
        # turns the cross-entropy into -delta * log pi(a|s).
        self.actor.fit([state, delta], actions,
                       sample_weight=np.reshape(delta, -1), verbose=0)
        self.critic.fit(state, target, verbose=0)


In [None]:
import gym
import numpy as np

# Agent with separate learning rates for the actor (alpha) and critic (beta).
agent = Agent(alpha=0.00001, beta=0.00005)

env = gym.make('LunarLander-v2')
num_episodes = 2000
score_history = []  # total reward of every finished episode, in order

for i in range(num_episodes):
    # Fresh per-episode bookkeeping.
    score, done = 0, False
    observation = env.reset()

    print('Starting episode: ', i)
    while not done:
        # Act, observe the transition, and update the agent on it right away.
        action = agent.choose_action(observation)
        new_observation, reward, done, info = env.step(action)
        agent.learn(observation, action, reward, new_observation, done)
        score += reward
        observation = new_observation

    score_history.append(score)
    # Mean over (at most) the 100 most recent episode scores.
    avg_score = np.mean(score_history[-100:])
    print('episode ', i,
          'score %.2f average score %.2f' % (score, avg_score))

Starting episode:  0
episode  0 score -88.11 average score -88.11
Starting episode:  1
episode  1 score -93.65 average score -90.88
Starting episode:  2
episode  2 score -343.62 average score -175.13
Starting episode:  3
episode  3 score -233.19 average score -189.64
Starting episode:  4
episode  4 score -375.54 average score -226.82
Starting episode:  5
episode  5 score -26.83 average score -193.49
Starting episode:  6
episode  6 score -391.21 average score -221.74
Starting episode:  7
episode  7 score -150.61 average score -212.84
Starting episode:  8
episode  8 score -109.98 average score -201.41
Starting episode:  9
episode  9 score -150.12 average score -196.28
Starting episode:  10
episode  10 score -443.64 average score -218.77
Starting episode:  11
episode  11 score -133.66 average score -211.68
Starting episode:  12
episode  12 score -110.23 average score -203.88
Starting episode:  13
episode  13 score -109.24 average score -197.12
Starting episode:  14
episode  14 score -262.