# Goal

This notebook contains a self-made implementation of the REINFORCE algorithm, written for the Artificial Neural Networks course.
I wanted to use it as a baseline, but it is not really suited to this situation.

In [2]:
class BasicAgent(DeepracerAgent):
    """REINFORCE policy-gradient agent with a learned value-function baseline.

    Two small convolutional networks are kept:

    * ``self.model`` maps an observation to a softmax distribution over
      ``N_ACTIONS`` discrete actions (the policy).
    * ``self.value_model`` maps an observation to a scalar estimate of the
      discounted return, used as a baseline in ``train``.

    Transitions are accumulated with ``observe`` and consumed by ``train``
    once per episode.

    NOTE(review): ``DeepracerAgent.__init__`` is not called here; confirm the
    base class needs no initialisation of its own.
    """

    # Number of discrete actions the policy chooses between.
    N_ACTIONS = 5
    # Input shape expected by both networks (height, width, channels).
    INPUT_SHAPE = (16, 16, 1)

    def __init__(self, gamma=0.99, policy_learning_rate=0.002, value_learning_rate=0.002):
        """Store the hyper-parameters and build both networks.

        Args:
            gamma: Discount factor applied when computing returns.
            policy_learning_rate: Adam learning rate of the policy network.
            value_learning_rate: Adam learning rate of the value network.
        """
        self.plr = policy_learning_rate
        self.vlr = value_learning_rate
        self.gamma = gamma

        self.actions_prob = []
        # Mean returns of the most recent episodes, used by moving_average().
        self.saved_rewards = []
        self.model = None
        self.value_model = None
        # Window length (in episodes) of the moving-average baseline.
        self.n_moving_avg = 5

        # These lists store the cumulative observations for the current episode.
        self.episode_observations, self.episode_actions, self.episode_rewards = [], [], []

        # Build the Keras networks.
        self._build_network()

    def _convert_obs(self, observation):
        """Return the first channel of the stereo-camera image after ``segment_resize``.

        Presumably the result is a 2-D array matching ``INPUT_SHAPE[:2]`` --
        TODO confirm against ``segment_resize``.
        """
        return segment_resize(observation['STEREO_CAMERAS'][:, :, 0])

    def observe(self, state, action, reward):
        """Append one transition (converted state, action, reward) to the episode buffers."""
        self.episode_observations.append(
            self._convert_obs(state)[:, :, np.newaxis])
        self.episode_actions.append(action)
        self.episode_rewards.append(reward)

    def decide(self, state):
        """Sample an action index from the policy network's output for ``state``."""
        # Add batch and channel axes so a single frame matches the network input.
        batch = self._convert_obs(state)[np.newaxis, :, :, np.newaxis]
        probs = np.ravel(self.model(batch))
        return np.random.choice(len(probs), p=probs)

    def register_reset(self, observations):
        """Return a uniformly random action; ``observations`` is not used."""
        return np.random.randint(self.N_ACTIONS)

    def compute_action(self, observations, info):
        """Return a uniformly random action; the learned policy is not consulted."""
        return np.random.randint(self.N_ACTIONS)

    def train(self):
        """Run one REINFORCE update from the buffered episode, then clear the buffers.

        The value network's prediction (taken *before* it is updated) is
        subtracted from the discounted returns, and the result weights the
        categorical cross-entropy of the taken actions. Does nothing when no
        transition has been observed.
        """
        if not self.episode_observations:
            # np.stack raises on an empty list; there is nothing to learn from.
            return

        states = np.stack(self.episode_observations)
        returns = self._get_returns().reshape((-1, 1))

        # Compute the baseline first, then fit the value network on the returns.
        baseline = np.asarray(self.value_model.predict_on_batch(states))
        self.value_model.train_on_batch(states, returns)

        # Returns with the baseline removed; flattened because Keras expects
        # one weight per sample as a 1-D array.
        advantages = (returns - baseline).ravel()

        # Train the policy network.
        self.model.train_on_batch(
            states,
            tf.keras.utils.to_categorical(
                self.episode_actions, num_classes=self.N_ACTIONS),
            sample_weight=advantages,
        )
        # Reset the buffers for the next episode.
        self.episode_observations, self.episode_actions, self.episode_rewards = [], [], []

    def moving_average(self, discounted):
        """Record the mean of ``discounted`` and return the mean over the kept history.

        Each call appends one value and, once more than ``n_moving_avg``
        values are stored, drops the oldest one.
        """
        self.saved_rewards.append(np.mean(discounted))
        if len(self.saved_rewards) > self.n_moving_avg:
            self.saved_rewards = self.saved_rewards[1:]
        return np.mean(self.saved_rewards)

    def _get_returns(self):
        """Return the discounted return at every step of the buffered episode.

        Computed backwards with ``G_t = r_t + gamma * G_{t+1}``; the result is
        a 1-D NumPy array with one entry per buffered reward.
        """
        returns = []
        running_return = 0
        for reward in reversed(self.episode_rewards):
            running_return = running_return * self.gamma + reward
            returns.append(running_return)
        # The values were accumulated last-to-first; restore episode order.
        returns.reverse()
        return np.array(returns)

    def _build_trunk(self):
        """Return a fresh conv -> max-pool -> flatten -> dense(20) feature extractor."""
        trunk = Sequential()
        trunk.add(Conv2D(32, (3, 3), activation='relu',
                         kernel_initializer='he_uniform',
                         input_shape=self.INPUT_SHAPE))
        trunk.add(MaxPooling2D((2, 2)))
        trunk.add(Flatten())
        trunk.add(Dense(20, activation='relu', kernel_initializer='he_uniform'))
        return trunk

    def _build_network(self):
        """Build and compile the policy and value networks.

        Sets ``self.model`` (softmax over ``N_ACTIONS``, categorical
        cross-entropy) and ``self.value_model`` (one linear output, mean
        squared error). The two networks do not share weights.
        """
        model = self._build_trunk()
        model.add(Dense(self.N_ACTIONS, activation='softmax'))
        model.compile(loss="categorical_crossentropy",
                      optimizer=Adam(learning_rate=self.plr))

        value_model = self._build_trunk()
        # The value head must be linear: a softmax over a single unit is
        # always exactly 1, which would make the baseline a constant.
        value_model.add(Dense(1, activation='linear'))
        value_model.compile(loss='mean_squared_error',
                            optimizer=Adam(learning_rate=self.vlr))

        self.model = model
        self.value_model = value_model

ModuleNotFoundError: No module named 'agents'