In [2]:
# For development and debugging:
# automatically reload imported modules without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from libs.Utils import plot_history
import os
# Project root on the local machine; everything this notebook writes goes below it.
path = '/home/hhughes/Documents/TUM_Subjects/S5/Seminar/Seminar_Project'

# <path>/Models holds one sub-directory per model; this notebook uses Basic_Actor_Critic.
models_root = os.path.join(path, 'Models')
model_path = os.path.join(models_root, 'Basic_Actor_Critic')

# Create both levels; exist_ok makes the cell safe to re-run.
for directory in (models_root, model_path):
    os.makedirs(directory, exist_ok=True)

In [4]:
import tensorflow as tf

# List the GPUs TensorFlow can see and try to enable on-demand memory growth
# for the first one.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print('Physical Devices: {}'.format(physical_devices))
try:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    print('GPU memory limitated successfuly!')
except (IndexError, ValueError, RuntimeError):
    # IndexError: no GPU was found (empty device list).
    # ValueError / RuntimeError: raised by set_memory_growth for an invalid
    # device or when the runtime has already been initialised.
    # Anything else (e.g. KeyboardInterrupt, a typo in this cell) is no longer
    # swallowed silently.
    print('Warning! GPU memory could not be limitated!')

Physical Devices: []


In [22]:
from keras import backend as K
from keras.layers import Dense, Input
from keras.models import Model
from keras.optimizers import Adam
import numpy as np


class Agent(object):
    """One-step actor-critic agent for a discrete action space.

    A single network body (two ReLU layers) feeds two heads: a softmax head
    giving action probabilities and a linear head giving a scalar state value.
    Three Keras models share those layers:

    * ``actor``  -- inputs ``[state, delta]``, output action probabilities;
      compiled with categorical cross-entropy and learning rate ``alpha``.
    * ``critic`` -- input ``state``, output state value; compiled with mean
      squared error and learning rate ``beta``.
    * ``policy`` -- input ``state``, output action probabilities; not compiled,
      used only for prediction in ``choose_action``.

    Args:
        alpha: learning rate of the actor optimizer.
        beta: learning rate of the critic optimizer.
        gamma: discount factor applied to the next state's value.
        n_actions: number of discrete actions (size of the softmax head).
        layer1_size: units in the first hidden layer.
        layer2_size: units in the second hidden layer.
        input_dims: length of the observation vector.
    """

    def __init__(self, alpha, beta, gamma=0.99, n_actions=4,
                 layer1_size=1024, layer2_size=512, input_dims=8):
        self.gamma = gamma
        self.alpha = alpha
        self.beta = beta
        self.input_dims = input_dims
        self.layer1_size = layer1_size
        self.layer2_size = layer2_size
        # Misspelled legacy attribute names, kept as aliases so any code that
        # still reads them keeps working.
        self.leyer1_size = layer1_size
        self.leyer2_size = layer2_size
        self.n_actions = n_actions

        self.actor, self.critic, self.policy = self.build_actor_critic_network()
        self.action_space = list(range(self.n_actions))

    def build_actor_critic_network(self):
        """Build the shared network and return ``(actor, critic, policy)``."""
        inputs = Input(shape=(self.input_dims,))
        # `delta` is not connected to any layer. It is kept so the actor keeps
        # its two-input signature; the TD error actually reaches the loss
        # through `sample_weight` in `learn`.
        delta = Input(shape=[1])
        dense1 = Dense(self.layer1_size, activation='relu')(inputs)
        dense2 = Dense(self.layer2_size, activation='relu')(dense1)
        probs = Dense(self.n_actions, activation='softmax')(dense2)
        values = Dense(1, activation='linear')(dense2)

        # NOTE(review): newer Keras releases spell this argument
        # `learning_rate`; `lr` is kept to match the installed version --
        # confirm before upgrading Keras.
        actor = Model(inputs=[inputs, delta], outputs=[probs])
        actor.compile(optimizer=Adam(lr=self.alpha), loss='categorical_crossentropy')

        critic = Model(inputs=[inputs], outputs=[values])
        critic.compile(optimizer=Adam(lr=self.beta), loss='mean_squared_error')

        # Prediction-only view of the policy head (never trained directly).
        policy = Model(inputs=[inputs], outputs=[probs])

        return actor, critic, policy

    def choose_action(self, observation):
        """Sample an action index from the policy's probabilities.

        Args:
            observation: 1-D array; a batch axis is prepended before prediction.

        Returns:
            One element of ``self.action_space`` drawn with the predicted
            probabilities.
        """
        state = observation[np.newaxis, :]
        probabilities = self.policy.predict(state, verbose=0)[0]
        action = np.random.choice(self.action_space, p=probabilities)

        return action

    def learn(self, state, action, reward, new_state, done):
        """Update actor and critic from a single transition.

        The critic is regressed towards the one-step TD target
        ``reward + gamma * V(new_state) * (1 - done)``. The actor is trained on
        the one-hot encoded ``action`` with cross-entropy weighted by the TD
        error ``delta = target - V(state)``, i.e. the loss is
        ``-delta * log pi(action | state)``.

        Args:
            state: 1-D observation before the step.
            action: index of the action that was taken.
            reward: scalar reward received.
            new_state: 1-D observation after the step.
            done: truthy if the episode ended with this step.
        """
        state = state[np.newaxis, :]
        new_state = new_state[np.newaxis, :]

        critic_value = self.critic.predict(state, verbose=0)
        new_critic_value = self.critic.predict(new_state, verbose=0)

        # No bootstrapping from the next state once the episode has ended.
        target = reward + self.gamma * new_critic_value * (1 - int(done))
        delta = target - critic_value

        actions = np.zeros([1, self.n_actions])
        actions[0, action] = 1.0

        # Plain cross-entropy would only imitate the sampled action. Weighting
        # the sample by delta turns it into the policy-gradient loss, so good
        # actions (delta > 0) become more likely and bad ones less likely.
        self.actor.fit([state, delta], actions,
                       sample_weight=np.reshape(delta, -1), verbose=0)
        self.critic.fit(state, target, verbose=0)


# The Environment
## OpenAI Gym

Gym is a toolkit for developing and comparing reinforcement learning algorithms. It supports teaching agents everything from walking to playing games like Pong or Pinball.

### LunarLander-v2 environment

The landing pad is always at coordinates (0,0). The coordinates are the first two numbers in the state vector. The reward for moving from the top of the screen to the landing pad and coming to zero speed is about 100..140 points. If the lander moves away from the landing pad, it loses that reward again. The episode finishes if the lander crashes or comes to rest, receiving an additional -100 or +100 points, respectively. Each leg-ground contact is worth +10 points. Firing the main engine costs 0.3 points per frame. The environment counts as solved at 200 points. Landing outside the landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land on its first attempt. Four discrete actions are available: do nothing, fire left orientation engine, fire main engine, fire right orientation engine.

In [14]:
from IPython.display import Video
# Show the clip Moon_Lander.mp4 (relative path) as this cell's output.
# NOTE(review): presumably a recording of the LunarLander environment -- confirm.
Video('Moon_Lander.mp4')

In [23]:
import gym
import numpy as np
from libs.Utils import plot_history

# alpha is the actor's learning rate, beta the critic's.
agent = Agent(alpha=0.00001, beta=0.00005)

env = gym.make('LunarLander-v2')
score_history, avg_score_history = [], []
num_episodes = 2

# Every action taken during training, kept for inspection after the run.
actions = []
for i in range(num_episodes):
    score, done = 0, False
    observation = env.reset()

    print('Starting episode: ', i)
    while not done:
        action = agent.choose_action(observation)
        new_observation, reward, done, info = env.step(action)
        # Online update from this single transition.
        agent.learn(observation, action, reward, new_observation, done)

        actions.append(action)
        score += reward
        observation = new_observation

    score_history.append(score)
    # Mean over the most recent (at most 100) episode scores.
    avg_score = np.mean(score_history[-100:])
    avg_score_history.append(avg_score)

    # Re-draw the learning curve after every episode.
    plot_history(score_history, avg_score_history, os.path.join(model_path, 'plot'))
    print('episode ', i, 'score %.2f average score %.2f' % (score, avg_score))

Starting episode:  0
[[0. 1. 0. 0.]]
[[0. 0. 0. 1.]]
[[0. 0. 0. 1.]]
[[1. 0. 0. 0.]]
[[1. 0. 0. 0.]]
[[1. 0. 0. 0.]]
[[0. 0. 0. 1.]]
[[0. 0. 1. 0.]]
[[0. 1. 0. 0.]]
[[0. 1. 0. 0.]]
[[0. 1. 0. 0.]]
[[1. 0. 0. 0.]]
[[0. 0. 0. 1.]]
[[0. 0. 1. 0.]]
[[1. 0. 0. 0.]]
[[1. 0. 0. 0.]]
[[0. 1. 0. 0.]]
[[0. 0. 1. 0.]]
[[0. 0. 1. 0.]]
[[0. 0. 1. 0.]]
[[0. 1. 0. 0.]]
[[0. 0. 0. 1.]]
[[0. 0. 1. 0.]]
[[1. 0. 0. 0.]]
[[1. 0. 0. 0.]]
[[0. 1. 0. 0.]]
[[0. 1. 0. 0.]]
[[0. 1. 0. 0.]]
[[0. 0. 1. 0.]]
[[0. 1. 0. 0.]]
[[1. 0. 0. 0.]]
[[0. 0. 0. 1.]]
[[0. 0. 0. 1.]]
[[0. 0. 0. 1.]]
[[0. 1. 0. 0.]]
[[0. 0. 0. 1.]]
[[1. 0. 0. 0.]]
[[0. 0. 0. 1.]]
[[0. 1. 0. 0.]]
[[0. 1. 0. 0.]]
[[0. 0. 0. 1.]]
[[0. 1. 0. 0.]]
[[0. 0. 0. 1.]]
[[1. 0. 0. 0.]]
[[0. 0. 1. 0.]]
[[1. 0. 0. 0.]]
[[0. 0. 1. 0.]]
[[0. 1. 0. 0.]]
[[1. 0. 0. 0.]]
[[0. 0. 0. 1.]]
[[0. 0. 0. 1.]]
[[0. 0. 0. 1.]]
[[0. 0. 1. 0.]]
[[1. 0. 0. 0.]]
[[0. 0. 1. 0.]]
[[0. 1. 0. 0.]]
[[0. 1. 0. 0.]]
[[0. 1. 0. 0.]]
[[0. 1. 0. 0.]]
episode  0 score -117.47 average sc

In [21]:
# Which distinct actions were taken during training?
np.unique(actions)

array([0, 1, 2, 3])

In [None]:
# Store the per-episode scores and their running averages in one archive
# (np.savez appends the .npz extension to the file name).
history_file = os.path.join(model_path, 'history')
np.savez(history_file, score_history=score_history, avg_score_history=avg_score_history)

# TODO: save models -- nothing is written for the trained networks yet.


In [9]:
# Sample one action from the agent's policy for whatever `observation`
# currently holds in the kernel, and display it.
action = agent.choose_action(observation)
action

0

In [10]:
# Inspect the observation currently held in the kernel.
observation

array([ 0.71006507, -0.12863599,  1.6626164 , -0.7748509 , -1.8214186 ,
       -5.648716  ,  1.        ,  0.        ], dtype=float32)