In [1]:
import gym
import numpy as np
import keras.backend as K
import tensorflow as tf
import keras
import random
from keras.models import Model
from keras.initializers import RandomUniform, normal
from keras.layers import Dense, Conv1D, Conv2D, Flatten, Input, MaxPool2D, concatenate, merge
from keras.models import Sequential
from keras.optimizers import Adam
from collections import deque
from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess

Using TensorFlow backend.


Importing the racing game as environment for the Reinforcement Learning Model.

In [2]:
########### Game Setup #########
# Identifier of the gym environment the agent is trained on.
ENV_ID = 'CarRacing-v0'
# Shared environment instance used by the cells that follow.
env = gym.make(ENV_ID)

The following code displays:
- states: 96 pixels x 96 pixels x RGB
- actions: one continuous 3-dimensional vector `[steering, gas, brake]`; the extreme cases are:

[1, 0, 0] = Right % (steering)

[-1, 0, 0] = Left % (steering)

[0, 1, 0] = Straight % speed (full gas, no steering)

[0, 0, 1] = Brake % speed (calculated against speed)

In [3]:
########### Game and Environment info ###########
# NOTE: despite its name, `states` holds the *shape* of one observation
# (a tuple), and `actions` holds the action-space object, not a list of actions.
states = env.observation_space.shape
actions = env.action_space
# Draw one random action from the action space as an example.
random_action = env.action_space.sample()
print('No of params affecting the environment:', states)
print('No of possible actions:', actions)
print('Example for random action', random_action)


########### Check if Keras uses GPU ##########
# NOTE(review): this import sits in the middle of the notebook; it is only
# needed by the commented-out device listing below.
from tensorflow.python.client import device_lib
# Creates a TF session configured with log_device_placement=True.
# `sess` is not referenced again in the visible cells.
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
#print(device_lib.list_local_devices())
# NOTE(review): _get_available_gpus is a private backend helper (leading
# underscore) — presumably version dependent; confirm before upgrading Keras.
print("GPUs: ", K.tensorflow_backend._get_available_gpus())

No of params affecting the environment: (96, 96, 3)
No of possible actions: Box(3,)
Example for random action [ 0.34769237  0.4199608   0.64224565]
GPUs:  []


Based on: https://github.com/germain-hug/Deep-RL-Keras/tree/master/DDPG

In [4]:
class Actor:
    """Actor (policy) network of the DDPG agent.

    Holds an online model, a target model that follows it through soft
    updates, and a backend function that applies the policy-gradient step
    to the online model.
    """

    def __init__(self, state_size, action_size, learning_rate, tau):
        """Store the hyper-parameters and build both networks.

        Args:
            state_size: shape of a single observation, passed to ``Input``.
            action_size: number of action dimensions; sets the width of the
                action-gradient placeholder in ``optimizer``.
            learning_rate: learning rate of the Adam optimizers.
            tau: soft-update factor used by ``transfer_weights``.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.tau = tau
        # Online network (the one that is trained)
        self.model = self._build_model()
        # Target network: starts as an exact copy of the online network so
        # that the soft updates track it instead of blending two unrelated
        # random initialisations.
        self.target_model = self._build_model()
        self.target_model.set_weights(self.model.get_weights())
        self.adam_optimizer = self.optimizer()

    def _build_model(self):
        """Build the convolutional policy network.

        Returns:
            A compiled Keras ``Model`` mapping a state to the concatenation
            ``[steering, acceleration, brake]`` (tanh, sigmoid, sigmoid).
        """
        state = Input((self.state_size))
        # Convolutions (9x9 kernels, each followed by 2x2 max pooling).
        # For a 96x96x3 input the feature maps shrink 88 -> 44 -> 36 -> 18 -> 10 -> 5.
        x = Conv2D(64, kernel_size=9, activation='relu', input_shape=self.state_size)(state)
        x = MaxPool2D(pool_size=(2, 2))(x)
        x = Conv2D(128, kernel_size=9, activation='relu')(x)
        x = MaxPool2D(pool_size=(2, 2))(x)
        # 256 feature maps, much smaller than the input picture
        x = Conv2D(256, kernel_size=9, activation='relu')(x)
        x = MaxPool2D(pool_size=(2, 2))(x)

        # Connect convolution and dense layers (feature maps -> 1D vector)
        x = Flatten()(x)

        # Two fully connected hidden layers
        x = Dense(512, activation='relu')(x)
        # Creates 512 x 512 weights
        x = Dense(512, activation='relu')(x)

        # One output head per action dimension so each can use its own range:
        # tanh -> [-1, 1] for steering, sigmoid -> [0, 1] for gas and brake.
        # TODO: the heads currently use RandomUniform(); revisit the
        # initialisation so that the initial outputs stay near zero.
        steering = Dense(1, activation='tanh', kernel_initializer=RandomUniform())(x)
        acceleration = Dense(1, activation='sigmoid', kernel_initializer=RandomUniform())(x)
        brake = Dense(1, activation='sigmoid', kernel_initializer=RandomUniform())(x)
        out = concatenate([steering, acceleration, brake], axis=-1)

        # `inputs=`/`outputs=` are the Keras 2 keyword names
        # (`input=`/`output=` are deprecated aliases).
        model = Model(inputs=state, outputs=out)
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def predict(self, state):
        """Predict the action for a single state with the online network.

        A batch axis is added in front of ``state`` before calling the model,
        so the result is the model's prediction for a batch of one.
        """
        action = self.model.predict(np.expand_dims(state, axis=0))
        # Normalize the steering between -1 and 1
        # Only used if sigmoid function
        # action[0] = (action[0] * 2) - 1;
        return action

    def target_predict(self, inp):
        """Predict actions for a batch of states with the target network."""
        return self.target_model.predict(inp)

    def transfer_weights(self):
        """Soft-update the target network.

        Every target weight becomes ``tau * online + (1 - tau) * target``.
        """
        online_weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        blended = [
            self.tau * online + (1 - self.tau) * target
            for online, target in zip(online_weights, target_weights)
        ]
        self.target_model.set_weights(blended)

    def train(self, states, actions, grads):
        """Apply one policy-gradient step to the online network.

        Args:
            states: batch of states fed to the online model.
            actions: unused; kept so existing callers keep working.
            grads: gradients of the critic output w.r.t. the actions, one row
                per state.
        """
        self.adam_optimizer([states, grads])

    def optimizer(self):
        """Build the backend function that performs the actor update.

        The gradients of the model output w.r.t. the trainable weights are
        weighted by the negated action gradients (gradient ascent on Q) and
        applied with Adam.

        The Adam step is registered through ``updates=`` so that it really
        runs when the function is called. The previous form sliced a
        one-element list with ``[1:]``, which produced an empty output list,
        so the update op was never executed and the actor never learned.
        """
        action_gdts = K.placeholder(shape=(None, self.action_size))
        params_grad = tf.gradients(self.model.output, self.model.trainable_weights, -action_gdts)
        grads = zip(params_grad, self.model.trainable_weights)
        train_op = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(grads)
        # The constant is only a dummy output; the work is done by the update op.
        return K.function(inputs=[self.model.input, action_gdts],
                          outputs=[K.constant(1)],
                          updates=[train_op])

    def save(self, path):
        """Save the online network weights to ``path + '_actor.h5'``."""
        self.model.save_weights(path + '_actor.h5')

    def load_weights(self, path):
        """Load online weights from ``path`` and copy them to the target.

        Only the online weights are stored by ``save``, so the target network
        is re-synchronised here instead of keeping its unrelated weights.
        """
        self.model.load_weights(path)
        self.target_model.set_weights(self.model.get_weights())
# Build an actor for the CarRacing observation shape and show its layers.
actor = Actor(
    state_size=env.observation_space.shape,
    action_size=3,
    learning_rate=0.001,
    tau=0.1,
)
actor.model.summary()

Instructions for updating:
Colocations handled automatically by placer.




__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 96, 96, 3)    0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 88, 88, 64)   15616       input_1[0][0]                    
__________________________________________________________________________________________________
max_pooling2d_1 (MaxPooling2D)  (None, 44, 44, 64)   0           conv2d_1[0][0]                   
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 36, 36, 128)  663680      max_pooling2d_1[0][0]            
__________________________________________________________________________________________________
max_poolin

In [5]:
class Critic:
    """Critic (Q-value) network of the DDPG agent.

    Holds an online model, a target model that follows it through soft
    updates, and a backend function returning the gradient of the Q-value
    with respect to the action input.
    """

    def __init__(self, state_size, action_size, learning_rate, tau):
        """Store the hyper-parameters and build both networks.

        Args:
            state_size: shape of a single observation, passed to ``Input``.
            action_size: number of action dimensions (size of the action input).
            learning_rate: learning rate of the Adam optimizers.
            tau: soft-update factor used by ``transfer_weights``.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.tau = tau
        # Online network (the one that is trained)
        self.model = self._build_model()
        self.model.compile(Adam(self.learning_rate), 'mse')
        # Target net for stability; starts as an exact copy of the online
        # network so the first targets are not produced by unrelated weights.
        self.target_model = self._build_model()
        self.target_model.compile(Adam(self.learning_rate), 'mse')
        self.target_model.set_weights(self.model.get_weights())
        # Function to compute Q-value gradients w.r.t. the action input
        # (needed for the actor optimization)
        self.action_grads = K.function([self.model.input[0], self.model.input[1]], K.gradients(self.model.output, [self.model.input[1]]))

    def _build_model(self):
        """Build the convolutional Q-network.

        Returns:
            An uncompiled Keras ``Model`` with inputs ``[state, action]`` and a
            single linear output (the Q-value).
        """
        state = Input((self.state_size))
        # Same convolutional stack as the actor: 9x9 kernels + 2x2 max pooling
        x = Conv2D(64, kernel_size=9, activation='relu', input_shape=self.state_size)(state)
        x = MaxPool2D(pool_size=(2, 2))(x)
        x = Conv2D(128, kernel_size=9, activation='relu')(x)
        x = MaxPool2D(pool_size=(2, 2))(x)
        x = Conv2D(256, kernel_size=9, activation='relu')(x)
        x = MaxPool2D(pool_size=(2, 2))(x)

        # Actions enter the network as a second input
        action_shape = (self.action_size,)
        action_layer = Input(shape=action_shape)

        # TODO: In the original paper the actions are merged in the second hidden layer
        x = concatenate([Flatten()(x), action_layer])
        x = Dense(512, activation='relu')(x)
        x = Dense(512, activation='relu')(x)
        out = Dense(1, activation='linear', kernel_initializer=RandomUniform())(x)
        return Model([state, action_layer], out)

    def gradients(self, states, actions):
        """Compute the gradients of the Q-value w.r.t. the given actions.

        Returns the output of the backend function built in ``__init__``
        (a list holding one gradient array).
        """
        return self.action_grads([states, actions])

    def target_predict(self, inp):
        """Predict Q-values with the target network.

        Args:
            inp: ``[states, actions]`` as expected by the two-input model.
        """
        return self.target_model.predict(inp)
    # NOTE: there is no predict() for the online model; the DDPG training loop
    # in this notebook only uses target_predict() and gradients().

    def train_on_batch(self, states, actions, critic_target):
        """Train the online network on one batch of sampled experience
        using the Keras function ``train_on_batch``.
        """
        return self.model.train_on_batch([states, actions], critic_target)

    def transfer_weights(self):
        """Soft-update the target network.

        Every target weight becomes ``tau * online + (1 - tau) * target``.
        """
        online_weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        blended = [
            self.tau * online + (1 - self.tau) * target
            for online, target in zip(online_weights, target_weights)
        ]
        self.target_model.set_weights(blended)

    def save(self, path):
        """Save the online network weights to ``path + '_critic.h5'``."""
        self.model.save_weights(path + '_critic.h5')

    def load_weights(self, path):
        """Load online weights from ``path`` and copy them to the target.

        Only the online weights are stored by ``save``, so the target network
        is re-synchronised here instead of keeping its unrelated weights.
        """
        self.model.load_weights(path)
        self.target_model.set_weights(self.model.get_weights())
         
# Build a critic for the CarRacing observation shape and show its layers.
critic = Critic(
    state_size=env.observation_space.shape,
    action_size=3,
    learning_rate=0.001,
    tau=0.1,
)
critic.model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 96, 96, 3)    0                                            
__________________________________________________________________________________________________
conv2d_7 (Conv2D)               (None, 88, 88, 64)   15616       input_3[0][0]                    
__________________________________________________________________________________________________
max_pooling2d_7 (MaxPooling2D)  (None, 44, 44, 64)   0           conv2d_7[0][0]                   
__________________________________________________________________________________________________
conv2d_8 (Conv2D)               (None, 36, 36, 128)  663680      max_pooling2d_7[0][0]            
__________________________________________________________________________________________________
max_poolin

In [6]:
############## Utils
import numpy

""" Original Code by @jaara: https://github.com/jaara/AI-blog/blob/master/SumTree.py
"""

class SumTree:
    """Binary sum tree stored in a flat array, used for prioritized sampling.

    The last ``capacity`` entries of ``tree`` are the leaves (one priority per
    stored item); every inner node holds the sum of its two children, so
    ``tree[0]`` is the total priority. ``data`` is a ring buffer holding the
    items that belong to the leaves.
    """
    # Kept as a class attribute for backward compatibility; each instance
    # gets its own write cursor in __init__.
    write = 0

    def __init__(self, capacity):
        """Create an empty tree for ``capacity`` items.

        Raises:
            ValueError: if ``capacity`` is smaller than 1.
        """
        if capacity < 1:
            raise ValueError("capacity must be at least 1")
        self.capacity = capacity
        self.tree = numpy.zeros(2 * capacity - 1)
        self.data = numpy.zeros(capacity, dtype=object)
        self.write = 0

    def _propagate(self, idx, change):
        """Add ``change`` to every ancestor of node ``idx`` up to the root.

        Written as a loop so that the root itself (``idx == 0``, which is the
        only leaf of a capacity-1 tree) has no ancestors to update; the
        recursive form computed parent index -1 in that case and never
        terminated.
        """
        while idx != 0:
            idx = (idx - 1) // 2
            self.tree[idx] += change

    def _retrieve(self, idx, s):
        """Descend from node ``idx`` to the leaf whose cumulative range contains ``s``."""
        while True:
            left = 2 * idx + 1
            if left >= len(self.tree):
                # No children: idx is a leaf
                return idx
            if s <= self.tree[left]:
                idx = left
            else:
                # Skip the mass of the left subtree and continue on the right
                s -= self.tree[left]
                idx = left + 1

    def total(self):
        """Return the sum of all priorities (the value of the root)."""
        return self.tree[0]

    def add(self, p, data):
        """Store ``data`` with priority ``p``, overwriting the oldest slot when full."""
        idx = self.write + self.capacity - 1

        self.data[self.write] = data
        self.update(idx, p)

        # Advance the ring-buffer cursor
        self.write += 1
        if self.write >= self.capacity:
            self.write = 0

    def update(self, idx, p):
        """Set the priority of tree node ``idx`` to ``p`` and fix the sums above it."""
        change = p - self.tree[idx]

        self.tree[idx] = p
        self._propagate(idx, change)

    def get(self, s):
        """Look up the item whose cumulative priority range contains ``s``.

        Returns:
            ``(tree_index, priority, data)``.
        """
        idx = self._retrieve(0, s)
        dataIdx = idx - self.capacity + 1

        return (idx, self.tree[idx], self.data[dataIdx])

In [7]:
############## Utils
from collections import deque

class MemoryBuffer(object):
    """ Memory Buffer Helper class for Experience Replay
    using a double-ended queue or a Sum Tree (for PER)
    """
    def __init__(self, buffer_size, with_per = False):
        """ Initialization

        Args:
            buffer_size: maximum number of stored experiences.
            with_per: use Prioritized Experience Replay (sum tree) instead of
                uniform sampling from a deque.
        """
        if with_per:
            # Prioritized Experience Replay
            self.alpha = 0.5
            self.epsilon = 0.01
            self.buffer = SumTree(buffer_size)
        else:
            # Standard buffer; maxlen makes the deque drop the oldest entry itself
            self.buffer = deque(maxlen=buffer_size)
        self.count = 0
        self.with_per = with_per
        self.buffer_size = buffer_size

    def memorize(self, state, action, reward, done, new_state, error=None):
        """ Save an experience to memory, optionally with its TD-Error

        In PER mode ``error`` must be indexable; its first element is used as
        the TD error of the experience.
        """
        experience = (state, action, reward, done, new_state)
        if self.with_per:
            priority = self.priority(error[0])
            self.buffer.add(priority, experience)
            # The tree overwrites old entries once full, so the occupation
            # can never exceed the capacity.
            self.count = min(self.count + 1, self.buffer_size)
        else:
            self.buffer.append(experience)
            self.count = len(self.buffer)

    def priority(self, error):
        """ Compute an experience priority, as per Schaul et al.

        Uses ``(|error| + epsilon) ** alpha``; the absolute value keeps the
        base positive for negative TD errors.
        """
        return (abs(error) + self.epsilon) ** self.alpha

    def size(self):
        """ Current Buffer Occupation
        """
        return self.count

    def sample_batch(self, batch_size):
        """ Sample a batch, optionally with (PER)

        Returns:
            ``(states, actions, rewards, dones, new_states, idx)`` as numpy
            arrays; ``idx`` holds the tree indices in PER mode and is ``None``
            otherwise. Without PER at most ``size()`` experiences are returned.
        """
        batch = []

        if self.with_per:
            # Split the total priority mass into batch_size equal segments and
            # draw one sample per segment. True division is required here:
            # floor division collapses the segment width to 0 whenever the
            # total priority is smaller than batch_size.
            segment = self.buffer.total() / batch_size
            for i in range(batch_size):
                s = random.uniform(segment * i, segment * (i + 1))
                idx, _, data = self.buffer.get(s)
                batch.append((*data, idx))
            idx = np.array([i[5] for i in batch])
        else:
            # Sample uniformly, never asking for more than is stored
            idx = None
            batch = random.sample(self.buffer, min(self.count, batch_size))

        # Return a batch of experience
        s_batch = np.array([i[0] for i in batch])
        a_batch = np.array([i[1] for i in batch])
        r_batch = np.array([i[2] for i in batch])
        d_batch = np.array([i[3] for i in batch])
        new_s_batch = np.array([i[4] for i in batch])
        return s_batch, a_batch, r_batch, d_batch, new_s_batch, idx

    def update(self, idx, new_error):
        """ Update priority for idx (PER)
        """
        self.buffer.update(idx, self.priority(new_error))

    def clear(self):
        """ Clear buffer / Sum Tree
        """
        if self.with_per:
            # Use the stored capacity; a bare `buffer_size` is not defined here
            self.buffer = SumTree(self.buffer_size)
        else:
            self.buffer = deque(maxlen=self.buffer_size)
        self.count = 0

In [8]:
class DDPG:
    """ Deep Deterministic Policy Gradient (DDPG) Helper Class
    """

    def __init__(self, state_size, action_size, batch_no):
        """ Initialization

        Args:
            state_size: shape of a single observation.
            action_size: number of action dimensions.
            batch_no: only used to build ``self.state_size`` as
                ``(batch_no,) + state_size``.
        """
        # Environment and DDPG parameters
        self.state_size = (batch_no,) + state_size
        self.action_size = action_size
        self.gamma = 0.99
        self.learning_rate = 0.00005
        # Create actor and critic networks (the actor learns 10x slower)
        self.actor = Actor(state_size, self.action_size, 0.1 * self.learning_rate, 0.001)
        self.critic = Critic(state_size, self.action_size, self.learning_rate, 0.001)
        self.buffer = MemoryBuffer(20000)

    def policy_action(self, s):
        """ Use the actor to predict the action for a single state
        """
        return self.actor.predict(s)[0]

    def bellman(self, rewards, q_values, dones):
        """ Use the Bellman Equation to compute the critic target

        Terminal transitions get the bare reward, all others
        ``reward + gamma * q_value``. The result is a new array with the shape
        and dtype of ``q_values``; the input is left untouched.
        """
        q_values = np.asarray(q_values)
        # Work on a copy so the caller's Q-values are not overwritten
        critic_target = np.array(q_values, copy=True)
        for i in range(q_values.shape[0]):
            if dones[i]:
                critic_target[i] = rewards[i]
            else:
                critic_target[i] = rewards[i] + self.gamma * q_values[i]
        return critic_target

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer
        """
        self.buffer.memorize(state, action, reward, done, new_state)

    def sample_batch(self, batch_size):
        """ Sample a batch of experience from the memory buffer
        """
        return self.buffer.sample_batch(batch_size)

    def update_models(self, states, actions, critic_target):
        """ Update actor and critic networks from sampled experience
        """
        # Train critic
        self.critic.train_on_batch(states, actions, critic_target)
        # Q-Value Gradients under Current Policy
        actions = self.actor.model.predict(states)
        grads = self.critic.gradients(states, actions)
        # Train actor
        self.actor.train(states, actions, np.array(grads).reshape((-1, self.action_size)))
        # Transfer weights to target networks at rate Tau
        self.actor.transfer_weights()
        self.critic.transfer_weights()

    def train(self, env, render, batch_size, nb_episodes):
        """ Run the DDPG training loop.

        Progress is logged to ``results.txt`` (reset at the start of every
        call) and the weights are saved after each episode.

        Args:
            env: gym environment with a ``Box`` action space.
            render: render the environment on every step.
            batch_size: number of experiences sampled per update.
            nb_episodes: number of episodes to run.

        Returns:
            List with the cumulative reward of every episode.
        """
        results = []

        # Start from an empty log file. Mode 'w' creates the file when it is
        # missing, whereas 'r+' fails on the first run.
        with open('results.txt', 'w'):
            pass

        # Valid range of every action dimension, taken from the environment
        act_low, act_high = env.action_space.low, env.action_space.high

        for e in range(nb_episodes):
            with open("results.txt", "a") as myfile:
                # Newline added so the header does not run into the first log line
                myfile.write("#################### Episode: " + str(e) + "\n")
            # Reset episode
            step, cumul_reward, done = 0, 0, False
            old_state = env.reset()
            noise = OrnsteinUhlenbeckProcess(size=self.action_size, theta=.15, mu=0., sigma=.3)

            while not done:
                if render:
                    env.render()
                # Actor picks an action (following the deterministic policy)
                a = self.policy_action(old_state)
                # Add exploration noise, then clip every dimension to the
                # range the environment accepts
                a = np.clip(a + noise.sample(), act_low, act_high)
                # Retrieve new state, reward, and whether the state is terminal
                new_state, r, done, _ = env.step(a)
                # Add outputs to memory buffer
                self.memorize(old_state, a, r, done, new_state)
                # Sample experience from buffer
                states, actions, rewards, dones, new_states, _ = self.sample_batch(batch_size)
                # Predict target q-values using target networks
                q_values = self.critic.target_predict([new_states, self.actor.target_predict(new_states)])
                # Compute critic target
                critic_target = self.bellman(rewards, q_values, dones)
                # Train both networks on sampled batch, update target networks
                self.update_models(states, actions, critic_target)
                # Update current state
                old_state = new_state
                cumul_reward += r
                if step % 100 == 0:
                    with open("results.txt", "a") as myfile:
                        myfile.write("{} | Action: {}, Reward: {}".format(step, a, cumul_reward))
                        myfile.write("\n")
                step += 1
            self.save_weights('')
            # Display and record the episode score
            print("Score: " + str(cumul_reward))
            results.append(cumul_reward)

        return results

    def save_weights(self, path):
        """ Save actor and critic weights; the learning rate is appended to ``path``
        """
        path += '_LR_{}'.format(self.learning_rate)
        self.actor.save(path)
        self.critic.save(path)

    def load_weights(self, path_actor, path_critic):
        """ Load previously saved critic and actor weights
        """
        self.critic.load_weights(path_critic)
        self.actor.load_weights(path_actor)

In [None]:
############## Training #################
# Hyper-parameters of this training run
BATCH_SIZE = 32
NB_EPISODES = 2000

# Agent for the CarRacing observation shape with 3 action dimensions
ddpg = DDPG(env.observation_space.shape, 3, 10)

# To continue from previously saved weights, uncomment:
#ddpg.load_weights('_LR_5e-05_actor.h5', '_LR_5e-05_critic.h5')
try:
    ddpg.train(env, render=False, batch_size=BATCH_SIZE, nb_episodes=NB_EPISODES)
finally:
    # Release the environment even when training is interrupted
    env.close()



Track generation: 1380..1729 -> 349-tiles track
Instructions for updating:
Use tf.cast instead.
Score: -39.65517241379405
Track generation: 1027..1288 -> 261-tiles track
Score: -57.69230769230816
Track generation: 1073..1345 -> 272-tiles track
Score: -70.47970479704817
Track generation: 1186..1486 -> 300-tiles track
Score: -63.210702341137996
Track generation: 1043..1308 -> 265-tiles track
Score: -28.030303030302942
Track generation: 1123..1408 -> 285-tiles track
Score: -61.2676056338034
Track generation: 1303..1633 -> 330-tiles track
Score: -36.170212765958055
Track generation: 1188..1489 -> 301-tiles track
Score: -10.000000000000771
Track generation: 1192..1494 -> 302-tiles track
Score: -23.58803986711032
Track generation: 1131..1418 -> 287-tiles track
Score: -58.041958041958836
Track generation: 1124..1412 -> 288-tiles track
retry to generate track (normal if there are not many of this messages)
Track generation: 1200..1504 -> 304-tiles track
Score: -80.19801980197987
Track generati

Based on: https://towardsdatascience.com/deep-deterministic-policy-gradients-explained-2d94655a9b7b

### Aktivierungsfunktionen:
softmax = summe der outputs wird 1 (w'keit)

relu = größer null bis unendlich (max)

linear = einfach linear, also Outputs zwischen -/+ unendlich

sigmoid = jeder output zwischen 0 und 1 --> wkeit

tanh = zwischen -1 und 1 = mit vorzeichen

### Model-based / model-free:
--> Two approaches to learn the transition function & reward function

Model based: Agent exploits the environment to learn. Model = the environment learned from observations.

Model free: Agent relies on trial-and-error experience and doesn't learn the environment. E.g. estimating the optimal values of each action / state. (Q-Learning, actor critic) --> Here it cannot make predictions on the next state before taking an action!

### Value / policy learning:
_Policy based_: Give an output given a particular input. (Actor part of actor critic)

_Value based_: Assigns a score to the state by calculating the cumulative score for a state. The reward is maximized through a Markov decision process. Actions that result in a greater reward are better. (Q-Learning)


-> Advantage of policy based: (for us)
- continuous action space
- stochastic policies (action X for 30%...)

-> Advantage of value based: (for us)
- simpler
- faster


--> We use both approaches together in the actor critic

### Sonstiges:
- Discount factor wird benutzt um zu verhindern dass der Reward unendlich wird


### Erklärung Actor-Critic / DDPG :
Deep-Q-Nets können nur für diskreten Input/Output verwendet werden, da ein Input auf die Q-Werte gemappt wird. Bei einem kontinuierlichen Space kann aber solches Mapping nicht stattfinden. 
Im ersten Schritt, um dieses Problem zu lösen, kann der Actor-Critic Ansatz verwendet werden. Hierbei wird das DQN aufgeteilt in einen actor, der die Aktionen ausführt und einen critic der die Aktionen + Zustände mit Q-Werten bewertet. Damit wird die Q-Wert-Logik vom kontinuierlichen Raum "abgekoppelt". Actor-Critic ist model-free und value based und repräsentiert die policy als parametrische Wahrscheinlichkeitsverteilung. (Stochastisch)
Eine weitere Verbesserung ist der Deep Deterministic Policy Gradient (DDPG), da dieser eine bessere Performance als der stochastic actor critic Ansatz (SAC) hat. Allerdings hemmt dieser Ansatz die Exploration und daher muss ein off-policy Algorithmus (und noise) implementiert werden, der den state-action space exploriert. 


### Paper Zusammenfassung (DDPG)

#### General
- policy gradient funktioniert im kontinuierlichen weil er quasi eine Funktion ist (damit kontinuierlich)
- Das paper basiert auf diesem kontinuierlichen Modell und erweitert es durch ein DQN 
- DQN heißt so, weil es statt einer Tabelle (Actions/States) ein Neuronales Netz verwendet, welches fähig ist hochdimensionalen Input wie z.B. Bilder zu mappen
- Hauptproblem: Nur diskrete Action-Spaces bei DQN
- Eine Möglichkeit ist die Diskretisierung des Action Spaces, dies führt aber zu sehr vielen möglichen Aktionen > schlecht, weil schwer zu explorieren
- Warum wird ein DQN verwendet? Warum ist es ein Sinnvoller Approximator?
1. Off policy (nach dem Training) with replay buffer (gegen correlations in den aktionen)
2. Network is trained with target Q network + batch normalization
- Für die implementierung braucht man: actor-critic architektur, lernalgorithmus mit so wenig Anpassungsmöglichkeiten wie möglich

#### Background
- Der agent führt eine policy aus, welche Zustände zu einer Wahrscheinlichkeitsverteilung für Aktionen mappt
- Das Ganze ist als Markov Decision Process modelliert (aktueller Zustand hängt nur vom vorherigen Zustand ab) und es gibt Übergangswahrscheinlichkeiten, die aber bei model-free nicht bekannt sind. 
- return of a state = sum of discounted future reward
- goal = learn policy with maximizing expected reward
- Dafür wird die Bellman equation verwendet
- Der Agent muss sich nicht im Environment befinden um zu lernen, sondern es kann nach der Interaktion gelernt werden (es reicht die Reaktionen des Environments zu kennen) --> Daher Q-Learning (off-policy, greedy -> maximaler Q-Wert)
- Q-Learning hat ein separates target network um die Zielwerte zu berechnen --> Dieses verbessert die Stabilität enorm!
ggf.: https://medium.com/@jonathan_hui/rl-dqn-deep-q-network-e207751f7ae4
- Loss minimiert den Fehler aus tatsächlichem Q-Wert und vorhergesagtem Q Wert durch die Anpassung des Q-Netzes bzw. dessen Gewichte

#### Algorithmus
- Actor netz (function), dass die aktuelle policy beinhaltet
- $\mu(s \mid \theta^{\mu})$ --> gegeben dem State und der Policy function (= Actor Netz mit den Gewichten $\theta^{\mu}$) wird die policy ausgeführt
- Critic netz (DQN), verwendet Bellmann Gleichung
- WICHTIG: Actor wird über Kettenregel upgedated. --> Gradient ascent --> Der Actor will "aufsteigen" / zum maximum der Q-Werte durch anpassen seiner policy d.h. seines Netzes / gewichte 
= Entspricht dem Policy Gradient
- Die Umformungen sind vermutlich wegen der Kettenregel
- Batch learning wird verwendet um die Stabilität zu erhöhen (replay buffer) --> Minibatches resetten die policy nicht und ermöglichen zusätzlich independent & identical distributed samples
- Replay buffer:
1. Es ist ein "Speicher" von vorherigen State-Action-Reward-pairs
2. Sample alle tuples (s, a, r, s+1)
3. Alte Samples werden "discarded" --> Queue in Python
4. Nach jeder "Episode" wird actor und critic geupdated durch einen minibatch aus random tuples

- Target net wird benutzt, da das Critic-Net anfällig ist unstabil zu sein (weil die ziel-Q-werte ebenfalls auf der bellmann gleichung basieren)
- Es verwendet daher ein target network mit "soften" updates (statt die Gewichte zu kopieren) --> Mehr Literatur nachlesen
- Dafür wird eine Kopie der beiden Netze (actor, critic) erstellt und verwendet um die target-werte zu berechnen. Diese Werden geupdated (langsam) 
- Das "tau" bestimmt die Stärke der updates --> In die target netze werden die neuen Netze zu einem bestimmten Prozentsatz (tau) miteinbezogen: Target_netz = Tatsächliches_Netz * Tau + Target_netz * (1 - Tau), d.h. $\theta' \leftarrow \tau\,\theta + (1-\tau)\,\theta'$
- Dadurch wird das ganze quasi supervised, da man target-werte als Zielwerte hat und die eigenen werte als predictions
- Die target-netze hinken dann zwar hinterher, aber da das Problem dadurch stabil wird lohnt es sich
- EVENTUELL FÜR CARLA RELEVANT --> Batch normalization um die features zu skalieren, sodass es generisch für die Inputs ist
- Normalisiert jede Dimension in einem Minibatch mit mean/variance
- Verwendet einen running average of the mean and variance
- GGF. später ansehen??? --> Braucht man nur um in verschiedenen Environments zu lernen
- Wichtig: exploration bei Kontinuierlichen Action Spaces ist schwierig, da es unendlich viele möglichkeiten gibt...Hierbei wird bei DDPG unabhängig vom lern-algorithmus exploriert. Exploration is on-policy und learning off-policy
- Hierzu wird Noise (Ornstein-Uhlenbeck) hinzugefügt um den kontinuierlichen Raum zu erkunden.
- 


In [None]:
############ Model #############
class Actor:
    """Early Sequential-API draft of the actor network.

    NOTE(review): this definition reuses the name ``Actor`` of the class with
    target network and optimizer defined earlier in this notebook; running
    this cell replaces that class for every cell executed afterwards.
    """

    def __init__(self, state_size, action_size, learning_rate=0.001):
        """Store the basic parameters and build the model.

        Args:
            state_size: shape of a single observation (Conv2D ``input_shape``).
            action_size: number of output neurons.
            learning_rate: Adam learning rate. ``_build_model`` reads it, so
                it has to be set before the model is built; it was missing
                before, which made the constructor fail with an
                AttributeError.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.model = self._build_model()

    def _build_model(self):
        """Build a convolutional neural net with ``action_size`` output neurons."""
        # Build the model layer by layer
        model = Sequential()

        # Convolutions
        model.add(Conv2D(64, kernel_size=9, activation='relu', input_shape=self.state_size))
        model.add(MaxPool2D(pool_size=(2, 2)))
        model.add(Conv2D(128, kernel_size=9, activation='relu'))
        model.add(MaxPool2D(pool_size=(2, 2)))
        # At this point the output consists of 256 feature maps that are much
        # smaller than the input picture (5x5 for a 96x96 input)
        model.add(Conv2D(256, kernel_size=9, activation='relu'))
        model.add(MaxPool2D(pool_size=(2, 2)))

        # Connect convolution and dense layers
        # feature maps -> 1D vector
        model.add(Flatten())

        # Two fully connected hidden layers
        # This part is where the actual learning happens
        # Creates 6400 x 512 weights for a 96x96x3 input (5 * 5 * 256 features)
        model.add(Dense(512, activation='relu'))
        # Creates 512 x 512 weights
        model.add(Dense(512, activation='relu'))

        # Output neurons (number of actions) (512 x action_size)
        model.add(Dense(self.action_size, activation='sigmoid'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        model.summary()
        return model
    
class Critic:
    """Early Sequential-API draft of the critic network.

    NOTE(review): this definition reuses the name ``Critic`` of the two-input
    Q-network defined earlier in this notebook; running this cell replaces
    that class for every cell executed afterwards. This draft takes only the
    state as input and has ``action_size`` sigmoid outputs — presumably a
    DQN-style experiment; confirm before reusing it.
    """

    def __init__(self, state_size, action_size, learning_rate=0.001):
        """Store the basic parameters and build the model.

        Args:
            state_size: shape of a single observation (Conv2D ``input_shape``).
            action_size: number of output neurons.
            learning_rate: Adam learning rate. ``_build_model`` reads it, so
                it has to be set before the model is built; it was missing
                before, which made the constructor fail with an
                AttributeError.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.model = self._build_model()

    def _build_model(self):
        """Build a convolutional neural net with ``action_size`` output neurons."""
        # Build the model layer by layer
        model = Sequential()

        # Convolutions
        model.add(Conv2D(64, kernel_size=9, activation='relu', input_shape=self.state_size))
        model.add(MaxPool2D(pool_size=(2, 2)))
        model.add(Conv2D(128, kernel_size=9, activation='relu'))
        model.add(MaxPool2D(pool_size=(2, 2)))
        # At this point the output consists of 256 feature maps that are much
        # smaller than the input picture (5x5 for a 96x96 input)
        model.add(Conv2D(256, kernel_size=9, activation='relu'))
        model.add(MaxPool2D(pool_size=(2, 2)))

        # Connect convolution and dense layers
        # 3D -> 1D (Linearization)
        model.add(Flatten())

        # Two fully connected hidden layers
        # This part is where the actual learning happens
        # Creates 6400 x 512 weights for a 96x96x3 input (5 * 5 * 256 features)
        model.add(Dense(512, activation='relu'))
        # Creates 512 x 512 weights
        model.add(Dense(512, activation='relu'))

        # Output neurons (number of actions) (512 x action_size)
        model.add(Dense(self.action_size, activation='sigmoid'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        model.summary()
        return model
    
        
# Instantiate the draft networks for the CarRacing observation shape.
# Both constructors print a model summary while building.
observation_shape = env.observation_space.shape
actor = Actor(observation_shape, action_size=3)
critic = Critic(observation_shape, action_size=3)


In [None]:
########### Params ###########
# Number of episodes run by the draft loop below.
episodes=2

In [None]:
# TODO:
# FIRST: understand continuous control! How is it done?
# first, look at the argmax again... those are the Q-values! pick the highest of them
# The mapping to -1..1 makes no sense here --> that has to come later
# To start with, all actions can simply be set to 0 or 1
#
# NOTE(review): `agent` (with act / remember / replay / epsilon) is not defined
# in any visible cell of this notebook, so this cell fails with a NameError on
# a fresh kernel — presumably a leftover from an earlier DQN experiment; confirm.

for episode in range(episodes):
    score=0
    done=False
    state = env.reset()
    while not done:
        # Add a leading batch axis of size 1 to the 96x96x3 frame
        state = state.reshape(1,96,96,3)
        action = agent.act(state)
        observation, reward, done, info = env.step(action)
        observation = observation.reshape(1,96,96,3)
        # Store the transition in the agent
        agent.remember(state, action, reward, observation, done)
        state = observation
        env.render()
        score+=reward
    # Called once per episode with the argument 32 — presumably the batch size; confirm
    agent.replay(32)
    print("episode {} score {} exploration {}".format(episode, score, agent.epsilon))
env.close()

In [None]:
########## Kill ###########
# Close the environment manually, e.g. after interrupting one of the loops above.
env.close()