In [1]:
import numpy as np
import tensorflow as tf

import sim

In [2]:
ep = 1000
steps_per_ep = 200

learning_rate = 0.001
discount_factor = 0.99
exploration_prob = 1.0
exploration_decay = 0.995
min_exploration_prob = 0.1

In [3]:
class DQN(tf.keras.Model):
    def __init__(self, n_actions):
        super(DQN, self).__init__()
        self.dense1 = tf.keras.layers.Dense(24, activation='relu')
        self.dense2 = tf.keras.layers.Dense(24, activation='relu')
        self.output_layer = tf.keras.layers.Dense(
            n_actions, activation='linear')

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        return self.output_layer(x)

# POSSIBLE_ACTIONS = ["0a", "0b", "1a", "1b", "2a", "2b", "2c", "2d"]
num_actions = 8
dqn_agent = DQN(num_actions)

In [4]:
loss_fn = tf.keras.losses.MeanSquaredError()
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

In [5]:
simulator = sim.Sim(steps_per_ep)

for episode in range(ep):
    state = simulator.reset()
    ep_idle = 0

    for step in range(steps_per_ep):
        if np.random.rand() < exploration_prob:
            action = simulator.sample()
        else:
            # wrong array sizes? - also change input from str to float
            action = np.argmax(dqn_agent(state[np.newaxis, :]))

        next_state, reward, done = simulator.step(action)

        with tf.GradientTape() as tape:
            current_q_values = dqn_agent(state[np.newaxis, :])
            next_q_values = dqn_agent(next_state[np.newaxis, :])
            max_next_q = tf.reduce_max(next_q_values, axis=-1)
            target_q_values = current_q_values.numpy()
            target_q_values[0, action] = reward + discount_factor * max_next_q * (1 - done)
            loss = loss_fn(current_q_values, target_q_values)

        gradients = tape.gradient(loss, dqn_agent.trainable_variables)
        optimizer.apply_gradients(zip(gradients, dqn_agent.trainable_variables))

        state = next_state
        ep_idle += reward

        if done:
            break

    exploration_prob = max(min_exploration_prob, exploration_prob * exploration_decay)
    if (episode + 1)%100==0:
        print(f"Episode {episode + 1}: Reward = {ep_idle}")

[[], [], [], []]
[]
Road 1:  []
Road 2:  []
Road 3:  []
Road 4:  []
[[], [], [], []]
[]


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices