# Laboratorium 4 (4 pkt.)

Celem czwartego laboratorium jest zapoznanie się oraz zaimplementowanie algorytmów głębokiego uczenia aktywnego. Zaimplementowane algorytmy będą testowane z wykorzystaniem wcześniej przygotowanych środowisk: *FrozenLake* i *Pacman* oraz środowiska z OpenAI - *CartPole*.


Dołączenie standardowych bibliotek

In [123]:
from collections import deque
import gym
import numpy as np
import random
from tqdm import tqdm

Dołączenie bibliotek ze środowiskami:

In [124]:
from env.FrozenLakeMDP import frozenLake
from env.FrozenLakeMDPExtended import FrozenLakeMDPExtended


Dołączenie bibliotek do obsługi sieci neuronowych

In [125]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam, SGD


## Zadanie 1 - Deep Q-Network

<p style='text-align: justify;'>
Celem ćwiczenia jest zaimplementowanie algorytmu Deep Q-Network. Wartością oczekiwaną sieci jest:
\begin{equation}
        Q(s_t, a_t) = r_{t+1} + \gamma \max_a Q(s_{t + 1}, a)
\end{equation}
</p>

In [126]:
class DQNAgent:
    """Deep Q-Network agent with experience replay and an epsilon-greedy policy.

    The wrapped ``model`` must provide ``predict(states, verbose=0)``, returning
    one row of Q-values per input state, and ``train_on_batch(states, targets)``.
    States are expected to carry a leading batch axis of size one so that the
    stored states can be concatenated into a batch.
    """

    def __init__(self, action_size, learning_rate, model, memory_size=2000,
                 gamma=0.95, epsilon=1.0, epsilon_min=0.01,
                 epsilon_decay=0.999):
        """Create the agent.

        Args:
            action_size: number of discrete actions the agent can choose from.
            learning_rate: stored for reference; the model is compiled by the caller.
            model: Q-value approximator (see the class docstring).
            memory_size: maximum number of transitions kept in the replay buffer.
            gamma: discount rate applied to the bootstrapped next-state value.
            epsilon: initial exploration rate.
            epsilon_min: lower bound for the exploration rate.
            epsilon_decay: multiplicative decay applied by update_epsilon_value.
        """
        self.action_size = action_size
        self.memory = deque(maxlen=memory_size)
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate
        self.model = model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer (oldest entries are dropped)."""
        self.memory.append((state, action, reward, next_state, done))

    def get_action(self, state):
        """Return an epsilon-greedy action for ``state``.

        With probability ``self.epsilon`` a uniformly random action is returned,
        otherwise the greedy action from ``get_best_action``.
        """
        if random.random() < self.epsilon:
            return random.randrange(self.action_size)
        return self.get_best_action(state)

    def get_best_action(self, state):
        """Return the action with the highest predicted Q-value for ``state``.

        Ties between equally good actions are broken uniformly at random.
        """
        # Only the first row is used: ``state`` is a single-sample batch.
        q_values = np.asarray(self.model.predict(state, verbose=0))[0]
        best_actions = np.flatnonzero(q_values == q_values.max()).tolist()
        if not best_actions:
            # Only reachable when the prediction contains NaN; fall back to argmax.
            return int(np.argmax(q_values))
        return int(random.choice(best_actions))

    def replay(self, batch_size):
        """Train the model on ``batch_size`` transitions sampled from memory.

        For every sampled transition the target of the taken action is
            Q(s, a) := r                          if the transition is terminal
            Q(s, a) := r + gamma * max_a' Q(s', a')   otherwise
        The targets of all other actions are the model's current predictions, so
        only the output of the taken action contributes to the loss.
        Does nothing while fewer than ``batch_size`` transitions are stored.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        states = np.concatenate([transition[0] for transition in minibatch])
        next_states = np.concatenate([transition[3] for transition in minibatch])

        # Two batched forward passes instead of two per transition.
        targets = self.model.predict(states, verbose=0)
        next_q_values = self.model.predict(next_states, verbose=0)

        for row, (_, action, reward, _, done) in enumerate(minibatch):
            if done:
                targets[row][action] = reward
            else:
                targets[row][action] = reward + self.gamma * np.max(next_q_values[row])

        self.model.train_on_batch(states, targets)

    def update_epsilon_value(self):
        """Decay epsilon by ``epsilon_decay`` without going below ``epsilon_min``."""
        # max() clamps the final step, which could otherwise undershoot the floor.
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

In [127]:
# Scratch cell: shows that enumerate() yields (index, element) pairs.
# The sample data is bound to `values` rather than `list`, so the builtin
# `list` type stays usable in the cells that follow.
values = [5, 6, 7, 8]

for i, element in enumerate(values):
    print(i, element)

0 5
1 6
2 7
3 8


Czas przygotować model sieci, która będzie się uczyła poruszania po środowisku *FrozenLake*. Warstwa wejściowa powinna mieć tyle neuronów, ile jest możliwych stanów, a warstwa wyjściowa tyle neuronów, ile jest możliwych akcji do wykonania:

In [128]:
# Environment and Q-network for the 8x8 FrozenLake map.
env = frozenLake("8x8")

learning_rate = 0.001
state_size = env.get_number_of_states()
action_size = len(env.get_possible_actions(None))

# One input per state (one-hot encoding) and one output per action; the
# output layer is created without an activation argument.
model = Sequential([
    Dense(16, input_dim=state_size, activation='relu'),
    Dense(32, activation='relu'),
    Dense(action_size),
])
model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')
model.summary()

Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_48 (Dense)            (None, 16)                1040      
                                                                 
 dense_49 (Dense)            (None, 32)                544       
                                                                 
 dense_50 (Dense)            (None, 4)                 132       
                                                                 
Total params: 1,716
Trainable params: 1,716
Non-trainable params: 0
_________________________________________________________________


 Czas nauczyć agenta poruszania się po środowisku *FrozenLake*, jako stan przyjmij wektor o liczbie elementów równej liczbie możliwych stanów, z wartością 1 ustawioną w komórce o indeksie równym aktualnemu stanowi, pozostałe elementy mają być wypełnione zerami:
* 1 pkt < 35 epok,
* 0.5 pkt < 60 epok,
* 0.25 pkt - w pozostałych przypadkach.

In [129]:
def one_state(env_state):
    """Encode a FrozenLake state index as a one-hot float32 tensor.

    The result has shape (1, number_of_states), with 1 at column ``env_state``
    and 0 elsewhere. The number of states is read from the global ``env``.
    """
    n_states = env.get_number_of_states()
    encoded = np.zeros((1, n_states))
    encoded[0, env_state] = 1
    return tf.convert_to_tensor(encoded, dtype=tf.float32)


In [130]:
# Train the DQN agent on FrozenLake.
agent = DQNAgent(action_size, learning_rate, model)

# Override the agent's default initial exploration rate.
agent.epsilon = 0.75

batch_size = 64
EPISODES = 10000
for e in range(EPISODES):
    # Each epoch runs 100 episodes and reports their mean total reward.
    summary = []
    for _ in tqdm(range(100)):
        total_reward = 0
        env_state = env.reset()

        # Convert the raw environment state into the network input format.
        state = one_state(env_state)
        for _step in range(1000):  # hard cap on the episode length
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward

            next_state = one_state(next_state_env)

            # Store the transition in the experience memory.
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        # One training step per episode, once the memory holds more samples
        # than a single batch.
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

        agent.update_epsilon_value()
        summary.append(total_reward)

    mean_reward = np.mean(summary)
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, mean_reward, agent.epsilon))
    if mean_reward > 0.9:
        print ("You Win!")
        break

100%|██████████| 100/100 [00:50<00:00,  1.99it/s]


epoch #0	mean reward = 0.000	epsilon = 0.679


100%|██████████| 100/100 [00:42<00:00,  2.38it/s]


epoch #1	mean reward = 0.000	epsilon = 0.614


100%|██████████| 100/100 [00:37<00:00,  2.65it/s]


epoch #2	mean reward = 0.010	epsilon = 0.556


100%|██████████| 100/100 [00:57<00:00,  1.73it/s]


epoch #3	mean reward = 0.000	epsilon = 0.503


100%|██████████| 100/100 [01:03<00:00,  1.57it/s]


epoch #4	mean reward = 0.000	epsilon = 0.455


100%|██████████| 100/100 [00:51<00:00,  1.94it/s]


epoch #5	mean reward = 0.020	epsilon = 0.411


100%|██████████| 100/100 [02:55<00:00,  1.75s/it]


epoch #6	mean reward = 0.080	epsilon = 0.372


100%|██████████| 100/100 [01:34<00:00,  1.06it/s]


epoch #7	mean reward = 0.030	epsilon = 0.337


100%|██████████| 100/100 [03:40<00:00,  2.21s/it]


epoch #8	mean reward = 0.190	epsilon = 0.305


100%|██████████| 100/100 [02:07<00:00,  1.28s/it]


epoch #9	mean reward = 0.260	epsilon = 0.276


100%|██████████| 100/100 [01:58<00:00,  1.18s/it]


epoch #10	mean reward = 0.520	epsilon = 0.250


100%|██████████| 100/100 [02:11<00:00,  1.31s/it]


epoch #11	mean reward = 0.630	epsilon = 0.226


100%|██████████| 100/100 [02:46<00:00,  1.66s/it]


epoch #12	mean reward = 0.530	epsilon = 0.204


100%|██████████| 100/100 [03:46<00:00,  2.26s/it]


epoch #13	mean reward = 0.560	epsilon = 0.185


100%|██████████| 100/100 [01:13<00:00,  1.36it/s]


epoch #14	mean reward = 0.720	epsilon = 0.167


100%|██████████| 100/100 [03:19<00:00,  2.00s/it]


epoch #15	mean reward = 0.690	epsilon = 0.151


100%|██████████| 100/100 [03:40<00:00,  2.21s/it]


epoch #16	mean reward = 0.530	epsilon = 0.137


100%|██████████| 100/100 [02:59<00:00,  1.79s/it]


epoch #17	mean reward = 0.710	epsilon = 0.124


100%|██████████| 100/100 [02:42<00:00,  1.62s/it]


epoch #18	mean reward = 0.690	epsilon = 0.112


100%|██████████| 100/100 [01:09<00:00,  1.45it/s]


epoch #19	mean reward = 0.770	epsilon = 0.101


100%|██████████| 100/100 [00:48<00:00,  2.06it/s]


epoch #20	mean reward = 0.890	epsilon = 0.092


100%|██████████| 100/100 [00:49<00:00,  2.02it/s]

epoch #21	mean reward = 0.910	epsilon = 0.083
You Win!





Czas przygotować model sieci, która będzie się uczyła poruszania po środowisku *FrozenLakeExtended*, tym razem stan nie jest określany poprzez pojedynczą liczbę, a przez 3 tablice:
* pierwsza zawierająca informacje o celu,
* druga zawierająca informacje o dziurach,
* trzecia zawierająca informację o położeniu gracza.

In [131]:
# Environment and Q-network for the 4x4 FrozenLakeExtended map.
env = FrozenLakeMDPExtended("4x4")

learning_rate = 0.001
state_size = env.get_number_of_states()
action_size = len(env.get_possible_actions(None))

# The input is three times the number of states wide; one output per action.
input_size = state_size * 3
model = Sequential([
    Dense(16, input_dim=input_size, activation='relu'),
    Dense(32, activation='relu'),
    Dense(action_size),
])
model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')
model.summary()

Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_51 (Dense)            (None, 16)                784       
                                                                 
 dense_52 (Dense)            (None, 32)                544       
                                                                 
 dense_53 (Dense)            (None, 4)                 132       
                                                                 
Total params: 1,460
Trainable params: 1,460
Non-trainable params: 0
_________________________________________________________________


 Czas nauczyć agenta poruszania się po środowisku *FrozenLakeExtended*, jako stan przyjmij wektor składający się ze wszystkich trzech tablic (2 pkt.):

In [132]:
def one_state(env_state):
    """Flatten ``env_state`` into a single-row array of shape (1, n_values).

    The input is copied, so the result never aliases the environment's data.
    """
    flat = np.array(env_state).ravel()
    return flat[np.newaxis, :]

In [134]:
# Train the DQN agent on FrozenLakeExtended.
agent = DQNAgent(action_size, learning_rate, model)

# Override the agent's default initial exploration rate.
agent.epsilon = 0.75

batch_size = 64
EPISODES = 10000
for e in range(EPISODES):
    # Each epoch runs 100 episodes and reports their mean total reward.
    summary = []
    for _ in tqdm(range(100)):
        total_reward = 0
        env_state = env.reset()

        # Convert the raw environment state into the network input format.
        state = one_state(env_state)
        for _step in range(1000):  # hard cap on the episode length
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward

            next_state = one_state(next_state_env)

            # Store the transition in the experience memory.
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        # One training step per episode, once the memory holds more samples
        # than a single batch.
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

        agent.update_epsilon_value()
        summary.append(total_reward)

    mean_reward = np.mean(summary)
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, mean_reward, agent.epsilon))
    if mean_reward > 0.9:
        print ("You Win!")
        break

100%|██████████| 100/100 [00:13<00:00,  7.18it/s]


epoch #0	mean reward = 0.020	epsilon = 0.679


100%|██████████| 100/100 [00:17<00:00,  5.56it/s]


epoch #1	mean reward = 0.050	epsilon = 0.614


100%|██████████| 100/100 [00:19<00:00,  5.24it/s]


epoch #2	mean reward = 0.200	epsilon = 0.556


100%|██████████| 100/100 [00:18<00:00,  5.37it/s]


epoch #3	mean reward = 0.290	epsilon = 0.503


100%|██████████| 100/100 [00:18<00:00,  5.41it/s]


epoch #4	mean reward = 0.330	epsilon = 0.455


100%|██████████| 100/100 [00:21<00:00,  4.74it/s]


epoch #5	mean reward = 0.400	epsilon = 0.411


100%|██████████| 100/100 [00:20<00:00,  4.82it/s]


epoch #6	mean reward = 0.440	epsilon = 0.372


100%|██████████| 100/100 [00:24<00:00,  4.00it/s]


epoch #7	mean reward = 0.410	epsilon = 0.337


100%|██████████| 100/100 [00:21<00:00,  4.75it/s]


epoch #8	mean reward = 0.560	epsilon = 0.305


100%|██████████| 100/100 [00:24<00:00,  4.06it/s]


epoch #9	mean reward = 0.610	epsilon = 0.276


100%|██████████| 100/100 [00:21<00:00,  4.55it/s]


epoch #10	mean reward = 0.700	epsilon = 0.250


100%|██████████| 100/100 [00:24<00:00,  4.01it/s]


epoch #11	mean reward = 0.710	epsilon = 0.226


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


epoch #12	mean reward = 0.660	epsilon = 0.204


100%|██████████| 100/100 [00:25<00:00,  4.00it/s]


epoch #13	mean reward = 0.680	epsilon = 0.185


100%|██████████| 100/100 [00:24<00:00,  4.09it/s]


epoch #14	mean reward = 0.820	epsilon = 0.167


100%|██████████| 100/100 [00:34<00:00,  2.87it/s]


epoch #15	mean reward = 0.820	epsilon = 0.151


100%|██████████| 100/100 [00:22<00:00,  4.54it/s]


epoch #16	mean reward = 0.800	epsilon = 0.137


100%|██████████| 100/100 [00:25<00:00,  3.86it/s]


epoch #17	mean reward = 0.830	epsilon = 0.124


100%|██████████| 100/100 [00:50<00:00,  1.96it/s]


epoch #18	mean reward = 0.810	epsilon = 0.112


100%|██████████| 100/100 [00:27<00:00,  3.58it/s]

epoch #19	mean reward = 0.920	epsilon = 0.101
You Win!





Czas przygotować model sieci, która będzie się uczyła działania w środowisku [*CartPole*](https://gym.openai.com/envs/CartPole-v0/):

In [153]:
# Environment and Q-network for CartPole; `.env` unwraps one wrapper layer
# from the environment returned by gym.make.
env = gym.make("CartPole-v1").env

learning_rate = 0.001
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# One input per observation component and one output per discrete action.
model = Sequential([
    Dense(16, input_dim=state_size, activation='relu'),
    Dense(32, activation='relu'),
    Dense(action_size),
])
model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')
model.summary()

4
Model: "sequential_19"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_57 (Dense)            (None, 16)                80        
                                                                 
 dense_58 (Dense)            (None, 32)                544       
                                                                 
 dense_59 (Dense)            (None, 2)                 66        
                                                                 
Total params: 690
Trainable params: 690
Non-trainable params: 0
_________________________________________________________________


Czas nauczyć agenta gry w środowisku *CartPole*:
* 1 pkt < 10 epok,
* 0.5 pkt < 20 epok,
* 0.25 pkt - w pozostałych przypadkach.

In [149]:
def one_state(env_state):
    """Return ``env_state`` as a copied, flattened array with a leading batch axis."""
    flattened = np.array(env_state).flatten()
    return np.expand_dims(flattened, axis=0)

In [155]:
# Train the DQN agent on CartPole.
agent = DQNAgent(action_size, learning_rate, model)

# Override the agent's default initial exploration rate.
agent.epsilon = 0.5

batch_size = 64
EPISODES = 1000
for e in range(EPISODES):
    # Each epoch runs 100 episodes and reports their mean total reward.
    summary = []
    for _ in tqdm(range(100)):
        total_reward = 0
        # reset() returns a tuple whose first element is the observation.
        observation, _ = env.reset()

        # Convert the raw observation into the network input format.
        state = one_state(observation)
        for _step in range(1000):  # hard cap on the episode length
            action = agent.get_action(state)
            next_state_env, reward, done, truncated, _ = env.step(action)
            total_reward += reward

            next_state = one_state(next_state_env)

            # Store the transition; only true termination is recorded as
            # `done`, so a truncated episode still bootstraps from next_state.
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            # Also stop when the environment reports truncation (e.g. a time
            # limit), instead of stepping an episode that has already ended.
            if done or truncated:
                break

        # One training step per episode, once the memory holds more samples
        # than a single batch.
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

        agent.update_epsilon_value()
        summary.append(total_reward)

    mean_reward = np.mean(summary)
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, mean_reward, agent.epsilon))
    if mean_reward > 195:
        print ("You Win!")
        break
    

100%|██████████| 100/100 [01:08<00:00,  1.47it/s]


epoch #0	mean reward = 40.060	epsilon = 0.452


100%|██████████| 100/100 [01:08<00:00,  1.46it/s]


epoch #1	mean reward = 37.300	epsilon = 0.409


100%|██████████| 100/100 [01:55<00:00,  1.15s/it]


epoch #2	mean reward = 60.740	epsilon = 0.370


100%|██████████| 100/100 [01:53<00:00,  1.14s/it]


epoch #3	mean reward = 56.080	epsilon = 0.335


100%|██████████| 100/100 [02:30<00:00,  1.51s/it]


epoch #4	mean reward = 72.090	epsilon = 0.303


100%|██████████| 100/100 [02:48<00:00,  1.69s/it]


epoch #5	mean reward = 76.150	epsilon = 0.274


100%|██████████| 100/100 [03:53<00:00,  2.34s/it]


epoch #6	mean reward = 104.010	epsilon = 0.248


100%|██████████| 100/100 [05:25<00:00,  3.25s/it]


epoch #7	mean reward = 141.560	epsilon = 0.225


100%|██████████| 100/100 [06:59<00:00,  4.20s/it]


epoch #8	mean reward = 174.130	epsilon = 0.203


100%|██████████| 100/100 [10:11<00:00,  6.12s/it]

epoch #9	mean reward = 239.990	epsilon = 0.184
You Win!



