In [1]:
try:
    import mlagents
    print("ml-agents already installed")
except ImportError:
    !python -m pip install -q mlagents==0.27.0
    print("Installed ml-agents")

ml-agents already installed


# Q-Learning Keras

In [2]:
import random
import tensorflow as tf
import numpy as np
import datetime
import traceback
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, LayerNormalization
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.losses import Huber, LogCosh, CategoricalCrossentropy, MeanSquaredError
from tensorflow.keras.callbacks import TensorBoard

from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.environment import ActionTuple, BaseEnv

print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# Deep Q-Learning

In [3]:
# Configuration paramaters for the whole setup
gamma = 0.01  # Discount factor for past rewards
epsilon = 0.5  # Epsilon greedy parameter
epsilon_min = 0.1  # Minimum epsilon greedy parameter
epsilon_max = 1.0  # Maximum epsilon greedy parameter
epsilon_interval = (
    epsilon_max - epsilon_min
)  # Rate at which to reduce chance of random action being taken
max_steps_per_episode = 10000

behavior_name = 'AlphaRomeo?team=0'

In [4]:
num_inputs = 15
num_actions = 8

num_hidden = 512

def create_q_model():
    with tf.device('GPU:0'):
        # Convolutions on the frames on the screen
        inputs = Input(shape=(num_inputs,))
        normal_inputs = LayerNormalization()
        common = Dense(num_hidden, activation="linear")(normal_inputs(inputs))
#         common2 = Dense(num_hidden*3, activation="linear")(common)
#         common3 = Dense(num_hidden, activation="linear")(common2)
        action = Dense(num_actions, activation="linear")(common)

        return Model(inputs=inputs, outputs=action)


# The first model makes the predictions for Q-values which are used to
# make a action.
model = create_q_model()
# Build a target model for the prediction of future rewards.
# The weights of a target model get updated every 10000 steps thus when the
# loss between the Q-values is calculated the target Q-value is stable.
model_target = create_q_model()

In [5]:
# In the Deepmind paper they use RMSProp however then Adam optimizer
# improves training time
optimizer = Adam(learning_rate=0.01, clipnorm=1.0)

done=False
# Experience replay buffers
action_history = []
state_history = []
state_next_history = []
rewards_history = []
done_history = []
episode_reward_history = []
running_reward = 0
episode_count = 0
frame_count = 0
# Number of frames for exploration
epsilon_greedy_frames = 20
# Maximum replay length
# Note: The Deepmind paper suggests 1000000 however this causes memory issues
max_memory_length = 50
# Using huber loss for stability
loss_function = Huber()

In [6]:
env = UnityEnvironment(file_name="Alpha Romeo", seed=1, side_channels=[])
log_folder = 'logs/{}'.format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
writer = tf.summary.create_file_writer(log_folder)

# tf.debugging.experimental.enable_dump_debug_info(log_folder, tensor_debug_mode="FULL_HEALTH", circular_buffer_size=-1)

try:
    while True:  # Run until solvedloss_function
        with writer.as_default():
            env.reset()

            decision_steps, terminal_steps = env.get_steps(behavior_name)
            if decision_steps:
                state = decision_steps[0].obs[0]
            if terminal_steps:
                done = True                
                state = terminal_steps[0].obs[0]

            episode_reward = 0

            for timestep in range(1, max_steps_per_episode):
                # env.render(); Adding this line would show the attempts
                # of the agent in a pop up window.
                frame_count += 1

                # Use epsilon-greedy for exploration
                if  epsilon > np.random.rand(1)[0]:
                    # Take random action
                    action = random.choice([1,2,3,4,5,6,7,8])
                    
                else:
                    # Predict action Q-values
                    # From environment state
                    state_tensor = tf.convert_to_tensor(state)
                    state_tensor = tf.expand_dims(state_tensor, 0)
                    action_probs = model(state_tensor, training=False)
                    # Take best action
                    action = action_probs[0]
                    action = int(tf.argmax(action)+1)
                    

                # Decay probability of taking random action
                epsilon -= epsilon_interval / epsilon_greedy_frames
                epsilon = max(epsilon, epsilon_min)

                # Apply the sampled action in our environment
        #         state_next, reward, done, _ = env.step(action)
                action_tuple = ActionTuple()
                action_tuple.add_discrete(np.array([[action]]))
                env.set_actions(behavior_name, action_tuple)
                # Perform a step in the simulation
                env.step()

                decision_steps, terminal_steps = env.get_steps(behavior_name)

                if terminal_steps:
                    done = True
                    env.reset()
                    decision_steps, terminal_steps = env.get_steps(behavior_name)
                    state_next = decision_steps[0].obs[0]
    #                 decision_steps, terminal_steps = env.get_steps(behavior_name)
                    break

                else:
                    state_next = decision_steps[0].obs[0]


                reward = decision_steps[0].reward
        #         state_next = np.array(state_next)

                episode_reward += reward
                # Save actions and states in replay buffer
                action_history.append(action)
                state_history.append(state)
                state_next_history.append(state_next)
                done_history.append(done) 
                rewards_history.append(reward)
                state = state_next

                # Update every fourth frame and once batch size is over 32


                # Get indices of samples for replay buffers
                #indice = np.random.choice(range(len(done_history)))

                state_sample = state_history[0]
                state_next_sample = np.array(state_next_history)
                
                rewards_sample = rewards_history
                action_sample = action_history
                done_sample = tf.convert_to_tensor([float(done) for done in done_history])

                # Build the updated Q-values for the sampled future states
                # Use the target model for stability
                state_next_sample = tf.reshape(state_next_sample, [-1, num_inputs])
                future_rewards = model_target.predict(np.array(state_next_sample))
                
                # Q value = reward + discount factor * expected future reward
                updated_q_values = rewards_sample + gamma * tf.reduce_max(
                    future_rewards, axis=1
                )

                # If final frame set the last value to -1
                updated_q_values = updated_q_values * (1 - done_sample) - done_sample

                # Create a mask so we only calculate loss on the updated Q-values

                masks = tf.one_hot(action_sample, num_actions)

                with tf.GradientTape() as tape:
                    # Train the model on the states and updated Q-values
                    q_values = model(np.array([state_sample]))
                    

                    # Apply the masks to the Q-values to get the Q-value for action taken
                    q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)

                    # Calculate loss between new Q-value and old Q-value
                    
                    loss = loss_function(updated_q_values, q_action)

                # Backpropagation
                grads = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))


            # update the the target network with new weights
            model_target.set_weights(model.get_weights())
            # Log details
            template = "running reward: {:.2f} at episode {}, frame count {}"
    #         print(template.format(running_reward, episode_count, frame_count))
            tf.summary.scalar(name="reward", data=running_reward, step=episode_count)
            dqn_variable = model.trainable_variables
            tf.summary.histogram(name="dqn_variables", data=tf.convert_to_tensor(dqn_variable[0]), step=episode_count)
            writer.flush()
            # Limit the state and reward history
            if len(rewards_history) > max_memory_length:
                del rewards_history[:1]
                del state_history[:1]
                del state_next_history[:1]
                del action_history[:1]
                del done_history[:1]

            if done:
                env.reset()
                done = False


            # Update running reward to check condition for solving
            episode_reward_history.append(episode_reward)
            if len(episode_reward_history) > 100:
                del episode_reward_history[:1]
            running_reward = np.mean(episode_reward_history)

            episode_count += 10

            if running_reward > 500000:  # Condition to consider the task solved
                print("Solved at episode {}!".format(episode_count))
                break
except:
    print(traceback.print_exc())
    writer.close()
    env.close()

Traceback (most recent call last):
  File "C:\Users\carlo\AppData\Local\Temp/ipykernel_31684/933838534.py", line 96, in <module>
    future_rewards = model_target.predict(np.array(state_next_sample))
  File "C:\Users\carlo\anaconda3\lib\site-packages\keras\engine\training.py", line 1751, in predict
    tmp_batch_outputs = self.predict_function(iterator)
  File "C:\Users\carlo\anaconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 885, in __call__
    result = self._call(*args, **kwds)
  File "C:\Users\carlo\anaconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 924, in _call
    results = self._stateful_fn(*args, **kwds)
  File "C:\Users\carlo\anaconda3\lib\site-packages\tensorflow\python\eager\function.py", line 3039, in __call__
    return graph_function._call_flat(
  File "C:\Users\carlo\anaconda3\lib\site-packages\tensorflow\python\eager\function.py", line 1963, in _call_flat
    return self._build_call_outputs(self._inference_function.call(


None


In [143]:
!kill 26364

'kill' nÆo ‚ reconhecido como um comando interno
ou externo, um programa oper vel ou um arquivo em lotes.


In [94]:
env.set_actions('AlphaRomeo?team=0', action_tuple)

In [159]:
writer.close()

In [36]:
env.close()

UnityEnvironmentException: No Unity environment is loaded.