# Laboratorium 6

Celem szóstego laboratorium jest zapoznanie się oraz zaimplementowanie algorytmu głębokiego uczenia aktywnego - REINFORCE. Zaimplementowany algorytm będzie testowany z wykorzystaniem środowiska z OpenAI - *CartPole*.


Dołączenie standardowych bibliotek

In [23]:
from collections import deque
import gym
import numpy as np
import random

Dołączenie bibliotek do obsługi sieci neuronowych

In [24]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.math import log
from tensorflow.math import reduce_sum
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()

## Zadanie 1 - REINFORCE

<p style='text-align: justify;'>
Celem ćwiczenia jest zaimplementowanie algorytmu REINFORCE. Wagi sieci aktualizowane są zgodnie ze wzorem:
\begin{equation*}
    \theta \leftarrow \theta + \alpha G_t \nabla_\theta \log \pi_{\theta}(a_t | s_t)
\end{equation*}.
</p>

In [25]:
def get_cumulative_rewards(rewards,  # rewards at each step
                           gamma=0.99  # discount for reward
                           ):
    """
    Compute the discounted return for every step of a single session.

    based on https://github.com/yandexdataschool/Practical_RL/blob/spring20/week06_policy_based/reinforce_tensorflow.ipynb

    Takes the immediate rewards r(s,a) of the whole session and returns the
    cumulative rewards R(s,a) (a.k.a. G(s,a) in Sutton '16):

        R_t = r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...

    The values are produced in a single backward pass using the recurrence
    R_t = r_t + gamma*R_{t+1}, so the cost is linear in the session length.

    Args:
        rewards: reversible sequence of per-step rewards (list, range,
            1-D array, ...).
        gamma: discount factor applied per time step.

    Returns:
        A list with as many elements as `rewards`; element t is R_t.
        An empty input yields an empty list.
    """
    cumulative_rewards = []
    running_return = 0
    # Walk from the last step to the first so each R_t reuses R_{t+1}.
    for reward in reversed(rewards):
        running_return = reward + gamma * running_return
        cumulative_rewards.append(running_return)

    # Values were collected last-to-first; restore chronological order.
    cumulative_rewards.reverse()
    return cumulative_rewards


# Sanity checks for get_cumulative_rewards: output length first, then
# known (rewards, gamma) -> expected returns triples.
assert len(get_cumulative_rewards(range(100))) == 100
for _rewards, _gamma, _expected in (
    ([0, 0, 1, 0, 0, 1, 0], 0.9, [1.40049, 1.5561, 1.729, 0.81, 0.9, 1.0, 0.0]),
    ([0, 0, 1, -2, 3, -4, 0], 0.5, [0.0625, 0.125, 0.25, -1.5, 1.0, -4.0, 0.0]),
    ([0, 0, 1, 2, 3, 4, 0], 0, [0, 0, 1, 2, 3, 4, 0]),
):
    assert np.allclose(get_cumulative_rewards(_rewards, gamma=_gamma), _expected)

In [26]:
class REINFORCEAgent:
    """
    REINFORCE agent that samples actions from a policy network and trains
    it once per session on the stored (state, action, reward) history.

    `predict_model` must expose `predict(states, verbose=0)` returning one
    probability vector per state; `policy_model` must expose
    `fit([states, returns], one_hot_actions, batch_size=..., verbose=0)`.
    NOTE(review): both models are presumably views over the same weights,
    otherwise training would not affect action selection - confirm at the
    construction site.
    """

    def __init__(self, action_size, predict_model, policy_model):
        self.action_size = action_size
        self.action_space = list(range(action_size))
        self.gamma = 0.99    # discount rate
        self.learning_rate = 0.001  # kept for compatibility; not read inside this class
        self.predict_model = predict_model
        self.policy_model = policy_model
        self._reset_memory()

    def _reset_memory(self):
        """Drop the stored session history (states, actions, rewards)."""
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []

    def store_transition(self, state, action, reward):
        """Append the last state, the action taken in it and the reward received."""
        self.state_memory.append(state)
        self.action_memory.append(action)
        self.reward_memory.append(reward)

    def choose_action(self, state):
        """
        Sample an action for `state` from the policy returned by the network.

        The action is drawn at random according to the predicted
        probabilities (not an argmax), which is what REINFORCE requires.

        Args:
            state: single observation as a 1-D array.

        Returns:
            One element of `self.action_space`.
        """
        # The model expects a batch, so add a leading batch axis of size 1.
        batch = state[np.newaxis, :]
        probabilities = self.predict_model.predict(batch, verbose=0)[0]
        return np.random.choice(self.action_space, p=probabilities)

    def replay(self, batch_size):
        """
        Train the network on the stored session and clear the memory.

        First computes the discounted return G_t for every stored step,
        then fits `policy_model` on inputs `[states, G]` with the one-hot
        encoded actions as targets.

        Args:
            batch_size: batch size forwarded to `policy_model.fit`.

        Does nothing when no transition has been stored: an empty action
        array cannot be used as an index and there is nothing to fit.
        """
        if not self.state_memory:
            return

        state_memory = np.array(self.state_memory)
        action_memory = np.array(self.action_memory)
        reward_memory = np.array(self.reward_memory)

        # One-hot encode the taken actions: row t has a 1 in column a_t.
        actions = np.zeros([len(action_memory), self.action_size])
        actions[np.arange(len(action_memory)), action_memory] = 1

        G = np.array(get_cumulative_rewards(reward_memory, self.gamma))
        self.policy_model.fit([state_memory, G], actions, batch_size=batch_size, verbose=0)

        # The history of one session is used for exactly one update.
        self._reset_memory()


Czas przygotować model sieci, która będzie się uczyła działania w środowisku [*CartPole*](https://gym.openai.com/envs/CartPole-v0/):

In [27]:
def custom_loss(y_true, y_pred):
    """
    REINFORCE loss: negative log-likelihood of the taken actions weighted
    by the return.

    Args:
        y_true: one-hot encoding of the action taken at each step.
        y_pred: action probabilities produced by the policy network.

    Returns:
        Scalar tensor: the sum over the batch of -G * log pi(a | s).

    `G` is NOT a parameter: it is read from module scope and must be the
    returns tensor that is also fed to the model as its second input.
    """
    # Clip away exact zeros: a saturated softmax can emit 0, and then
    # 0 * log(0) = 0 * -inf = NaN even for actions that were not taken.
    safe_pred = tf.clip_by_value(y_pred, 1e-8, 1.0)
    log_lik = y_true * log(safe_pred)
    return reduce_sum(-log_lik * G)

# `.env` takes the environment wrapped by the object gym.make returns.
# NOTE(review): presumably this is done to drop the episode time limit so
# that sessions are bounded only by the caller's own step cap - confirm.
env = gym.make("CartPole-v1").env
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
learning_rate = 0.001

# Named `state_input` rather than `input` so the Python builtin `input`
# is not shadowed for the rest of the session.
state_input = Input(shape=(state_size,))
# Second input: the return G_t of each sample; it is consumed only by
# custom_loss, not by any layer.
G = Input(shape=[1])
layer_1 = Dense(16, activation='relu')(state_input)
layer_2 = Dense(32, activation='relu')(layer_1)
probability = Dense(action_size, activation='softmax')(layer_2)

# Training model: takes (state, G) so that the loss can see the returns.
policy_model = Model(inputs=[state_input, G], outputs=[probability])
policy_model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate), loss=custom_loss)
# Inference model: same layers (and therefore same weights) as
# policy_model, but needs only the state.
predict_model = Model(inputs=[state_input], outputs=[probability])

Przygotuj funkcję obliczającą wartość nagrody skumulowanej:

Czas nauczyć agenta gry w środowisku *CartPole*:

In [28]:
# The agent samples actions with predict_model and trains policy_model.
agent = REINFORCEAgent(action_size, predict_model, policy_model)
# Batch size handed to agent.replay() at the end of every session.
batch_size = 64

def generate_session(t_max=1000):
    """
    Play one session of env with the REINFORCE agent and train at the session end.

    Args:
        t_max: maximum number of environment steps in the session.

    Returns:
        The undiscounted sum of rewards collected during the session.

    Uses the module-level `env`, `agent` and `batch_size`.
    """
    total_reward = 0

    # reset() returns (observation, info); only the observation is needed.
    s = env.reset()[0]

    for _ in range(t_max):
        # choose action
        a = agent.choose_action(s)

        # step() returns (obs, reward, terminated, truncated, info).
        new_s, r, terminated, truncated, _info = env.step(a)

        # record session history to train later
        agent.store_transition(s, a, r)

        total_reward += r

        s = new_s
        # Stop on either end-of-episode signal, not only on termination.
        if terminated or truncated:
            break

    agent.replay(batch_size)

    return total_reward


# Train for at most 100 epochs of 100 sessions each; stop early once the
# mean session reward of an epoch exceeds 300.
for epoch in range(100):
    # generate new sessions
    rewards = [generate_session() for _ in range(100)]
    mean_reward = np.mean(rewards)

    print("mean reward:%.3f" % mean_reward)

    if mean_reward > 300:
        print("You Win!")
        break

  updates=self.state_updates,


mean reward:27.170
mean reward:64.900
mean reward:118.490
mean reward:281.110
mean reward:667.960
You Win!
