In [13]:
import gym.envs.box2d
import numpy as np
from time import sleep
import random
from scipy import misc
import tensorflow as tf
from keras.layers import Dense, Conv2D, Flatten, MaxPool2D, TimeDistributed
from keras.models import Sequential
from keras.optimizers import Adam
from collections import deque

env = gym.make('CarRacing-v0')
env.reset()
print("Import location: " + gym.__file__)

Track generation: 984..1236 -> 252-tiles track
retry to generate track (normal if there are not many of this messages)
Track generation: 1202..1506 -> 304-tiles track
Import location: /Users/i502911/Desktop/Uni/SS 2019/Projektpraktikum ML/RL/gym/gym/__init__.py


In [14]:
########### Game info ###########
states = env.observation_space.shape
actions = env.action_space
random_action = env.action_space.sample()
print('No of params affecting the environment:', states)
print('No of possible actions:', actions)
print('Example for random action', random_action)
# [1, 0, 0] = Right % (steering)
# [-1, 0, 0] = Left % (steering)
# [0, 1, 0] = Straight % speed
# [0, 0, 1] = Brake % speed (calculated against speed)

No of params affecting the environment: (96, 96, 3)
No of possible actions: Box(3,)
Example for random action [-0.04641947  0.51251465  0.6927021 ]


In [18]:
############ Model #############
class DQNAgent:
    '''
    Set basic parameters for the model such as learning rate or decay
    '''
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate (will be discounted)
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()
    
    '''
    Build a convolutional neural net with 3 output neurons
    '''
    def _build_model(self):
        # --> Aktivierungsfunktionen:
        # softmax = summe der outputs wird 1
        # relu = größer null bis unendlich (max)
        # linear = linear einfach, also outputs zwischen -/+ undendlich
        # sigmoid = jeder output zwischen 0 und 1 --> wkeit
        # tanh = zwischen -1 und 1 = mit vorzeichen
        
        
        # Build the model layer by layer
        model = Sequential()
        
        # Convolutions
        model.add(Conv2D(64, kernel_size=9, activation='relu', input_shape=self.state_size))
        model.add(MaxPool2D(pool_size=(2, 2)))
        model.add(Conv2D(128, kernel_size=9, activation='relu'))
        model.add(MaxPool2D(pool_size=(2, 2)))
        # Output an der stelle sind 256 features mit jeweils 1/16 der ursprünglichen Bildgröße
        model.add(Conv2D(256, kernel_size=9, activation='relu'))
        model.add(MaxPool2D(pool_size=(2, 2)))
        
        # Connect convolution and dense layers
        # 3D -> 1D (Linearization)
        model.add(Flatten())
    
        
        # 3 hidden layers
        # This part is where the actual learning happens
        # 2 layers are sufficient to learn everything
        # Creates 9216 x 512 weights
        model.add(Dense(512, activation='relu'))
        # Creates 512 x 512 weights
        model.add(Dense(512, activation='relu'))
        
        # Output neurons (number of actions) (512 x 3)
        model.add(Dense(self.action_size, activation='sigmoid'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        model.summary()
        return model
    
    '''
    Keep previous experiences and store them in a double ended queue
    '''
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
    
    
    '''
    Make the agent perform an action through the predicted q value
    '''
    def act(self, state):
        if np.random.rand() <= self.epsilon:
            exploration = env.action_space.sample()
            return exploration
        act_values = self.model.predict(state) 
        
        # pick best action for now
        best = np.argmax(act_values[0])
        for i in range(len(act_values[0])):
            if act_values[0][i] != best:
                act_values[0][i] = 0
        print(act_values[0])
        return act_values[0]
    
    '''
    Take in account the long term rewards through the discount rate
    --> experience replay
    '''
    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            q_val = reward
            if not done:
                # Calculate discounted q value as reward + next best action
                q_val = reward + self.gamma * np.amax(self.model.predict(next_state)[0])
            # calculate predicted q value from nn
            q_val_pred = self.model.predict(state)
            print(q_val_pred)
            # Set the value for the action actually performed, leave the rest
            
            
            
            q_val_pred[0][] = q_val
            
            # train the model to map
            self.model.fit(state, target, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        
        
agent = DQNAgent(env.observation_space.shape, 3)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_10 (Conv2D)           (None, 88, 88, 64)        15616     
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 44, 44, 64)        0         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 36, 36, 128)       663680    
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 18, 18, 128)       0         
_________________________________________________________________
conv2d_12 (Conv2D)           (None, 10, 10, 256)       2654464   
_________________________________________________________________
max_pooling2d_12 (MaxPooling (None, 5, 5, 256)         0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 6400)              0         
__________

In [19]:
########### Params ###########
episodes=2

In [20]:
# TODO:
# FIRST: Continous control verstehen!!!!!!!!!! How is it done?
# erstmal das argmax nochmal ansehen... das sind die q werte! davon den höchsten
# Das mapping mit -1 bis 1 macht hier gar keinen sinn --> das muss später kommen
# Zum anfang kann man mal alles mit 0 oder 1 machen für die aktionen

for episode in range(episodes):
    score=0
    done=False
    state = env.reset()
    while not done:
        state = state.reshape(1,96,96,3)
        action = agent.act(state)
        observation, reward, done, info = env.step(action)
        observation = observation.reshape(1,96,96,3)
        agent.remember(state, action, reward, observation, done)
        state = observation
        env.render()
        score+=reward
    agent.replay(32)
    print("episode {} score {} exploration {}".format(episode, score, agent.epsilon))
env.close()

Track generation: 1203..1507 -> 304-tiles track
[-0.73639625  0.23152992  0.97178525]
[0.92509824 0.97623897 0.8261929 ]
[0.59548634 0.5484567  0.6968144 ]
[-0.21165805  0.42263645  0.7414501 ]
[-0.38628843  0.5315383   0.1947295 ]
[0.417094  0.5879197 0.8537963]
[0.22793052 0.36189893 0.2296378 ]
[0.73431253 0.9213298  0.5087916 ]
[-0.21147874  0.6772813   0.35253558]
[0.03894546 0.7559322  0.39993423]
[0.582524   0.6209645  0.15483877]
[-0.69339496  0.10909067  0.15784943]
[-0.2611022   0.24199358  0.41496536]
[-0.43552327  0.2754071   0.19145933]
[-0.54629713  0.03334794  0.97754973]
[0.6637595  0.20076947 0.8162376 ]
[0.39584738 0.79663044 0.18084604]
[0.90726626 0.5042841  0.48823252]
[0.329029  0.4088353 0.9671902]
[0.9266055  0.28195074 0.343051  ]
[-0.5022488   0.2726911   0.66500753]
[-0.21651164  0.7868959   0.19800021]
[-0.29798192  0.76106995  0.6968166 ]
[0.65943944 0.9864257  0.7149669 ]
[-0.21404104  0.2269348   0.6909141 ]
[-0.07021361  0.7679364   0.67737335]
[0.953352

[-0.6566732   0.45688826  0.3660679 ]
[-0.08062606  0.6903902   0.7821452 ]
[-0.28102478  0.109687    0.82530487]
[-0.48256972  0.44881406  0.32660276]
[0.10214187 0.6769764  0.80588496]
[-0.24062644  0.6537292   0.23646766]
[0.03048726 0.55997914 0.2503248 ]
[0.62517685 0.5089933  0.599822  ]
[-0.48411766  0.1173012   0.47175065]
[-0.11726054  0.47157708  0.18545929]
[0.43849415 0.47314197 0.10438096]
[-0.88298416  0.5163117   0.9727035 ]
[-0.28156134  0.39519477  0.2428201 ]
[0.9901244  0.877067   0.07369371]
[-0.09232562  0.7701189   0.5533648 ]
[0.7229948  0.00631993 0.4635965 ]
[-0.29240003  0.7837798   0.5760305 ]
[0.04608488 0.93041885 0.3197504 ]
[-0.07343214  0.12139596  0.9078526 ]
[-0.20594501  0.512622    0.00458734]
[-0.04046427  0.9936675   0.96436775]
[-0.8881941   0.24772894  0.6982556 ]
[-0.9437567   0.13232042  0.33568227]
[-0.9260109   0.50897384  0.016449  ]
[0.19936846 0.05650457 0.06188256]
[0.7995943  0.9518724  0.16124712]
[0.10968433 0.53448635 0.10977993]
[0.4

[-0.20536438  0.7832754   0.8059679 ]
[0.58632195 0.02862901 0.39326838]
[-0.77536494  0.502992    0.14771055]
[0.22061023 0.47801092 0.9058731 ]
[0.08662434 0.7924586  0.48498002]
[-0.38107312  0.40734854  0.21815012]
[0.47331595 0.17146276 0.16570595]
[0.15967195 0.97684056 0.5524065 ]
[0.13113986 0.49955526 0.37868735]
[-0.05039524  0.9743351   0.5975761 ]
[-0.8739768   0.719818    0.38480338]
[0.9029211 0.7411313 0.9220542]
[-0.19590522  0.5963565   0.42591783]
[-0.32153672  0.1118336   0.98615104]
[0.07546976 0.8901532  0.90534604]
[0.882238   0.63350487 0.81516385]
[0.00817705 0.24049285 0.5508408 ]
[-0.46495613  0.33407795  0.08665243]
[0.81565136 0.14540805 0.66688627]
[0.48482513 0.21020436 0.01429745]
[-0.89996326  0.25238886  0.51154035]
[-0.9044828   0.21255317  0.73396325]
[-0.37725484  0.00202367  0.91454387]
[-0.8339326   0.27490714  0.27369162]
[0.0051064  0.83222795 0.879757  ]
[0.00444305 0.8969093  0.7994726 ]
[0.83884656 0.52212375 0.04585272]
[-0.5657241   0.843282

[-0.9421008   0.41260862  0.0866695 ]
[0.5653621  0.29126516 0.34624913]
[-0.11144616  0.28989476  0.05272267]
[-0.95480764  0.8100265   0.51511335]
[0.06272356 0.5067271  0.39896944]
[0.7693162 0.9483982 0.9165289]
[-0.1947701  0.4775656  0.8223848]
[-0.76199275  0.54269636  0.20697655]
[-0.9939942   0.02780006  0.7283923 ]
[0.9342885  0.01229877 0.35725892]
[0.8069457  0.6259103  0.52326244]
[0.34569344 0.17857467 0.43664294]
[0.5494836  0.78477216 0.8736145 ]
[-0.21044168  0.06392827  0.26800954]
[-0.58479846  0.78828263  0.3874825 ]
[0.40260354 0.6407896  0.5095436 ]
[-0.81126934  0.7047999   0.08453303]
[0.83275944 0.94393176 0.11149073]
[0.13297902 0.49958417 0.7730929 ]
[0.03453239 0.17969017 0.47015646]
[-0.7950707  0.5303843  0.5668923]
[-0.3811474  0.9239244  0.6686392]
[-0.96248454  0.14167048  0.5486489 ]
[-0.19129978  0.48071867  0.04808436]
[-0.20646942  0.96606934  0.91967726]
[0.59040374 0.36814436 0.6014789 ]
[0.58971304 0.01551026 0.720219  ]
[-0.4157891   0.5459864  

[-0.8735503   0.02410835  0.4607649 ]
[-0.74215794  0.7771806   0.05429   ]
[-0.98369926  0.66236305  0.49723586]
[-0.37054995  0.53675395  0.29224592]
[0.8792274  0.92184824 0.16948359]
[-0.64340603  0.06969953  0.14349478]
[-0.15353744  0.49369434  0.59439474]
[0.45494908 0.63464975 0.98570734]
[0.02472196 0.4548795  0.23933204]
[0.90361995 0.48696756 0.82294315]
[-0.46546766  0.10109786  0.9969936 ]
[-0.01426805  0.38227448  0.01220594]
[-0.66052794  0.7796489   0.36848754]
[-0.17429897  0.20115493  0.06659408]
[0.10131372 0.6742514  0.3080211 ]
[0.29477203 0.39952102 0.35607526]
[-0.08993468  0.824385    0.9000714 ]
[0.4783493  0.435772   0.26891094]
[-0.5053564   0.45669544  0.6939193 ]
[-0.8399376  0.5553782  0.5725859]
[0.70793587 0.74340945 0.7124729 ]
[-0.07201163  0.58167875  0.48688945]
[-0.82240146  0.48903623  0.9750067 ]
[0.35489604 0.85230196 0.5335889 ]
[0.6468045  0.26277757 0.39581722]
[-0.03560633  0.87953424  0.11419737]
[-0.53181434  0.34242368  0.05703264]
[0.4500

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [12]:
########## Kill ###########
env.close()