In [1]:
from keras.utils import plot_model
import threading
import time
# INITIALIZATION: libraries, parameters, network...

from keras.models import Sequential      # One layer after the other
from keras.layers import Dense, Flatten  # Dense layers are fully connected layers, Flatten layers flatten out multidimensional inputs
from collections import deque            # For storing moves 
import World
import numpy as np
import gym                                # To train our network

import tensorflow as tf
import random

Using TensorFlow backend.


In [2]:
# For sampling batches from the observations


    # Parameters
D = deque()                                # Register where the actions will be stored

observetime = 500                          # Number of timesteps we will be acting on the game and observing results
epsilon = 0.9                              # Probability of doing a random move
gamma = 0.9                                # Discounted future reward. How much we care about steps further in time
mb_size = 60     

class Q_Network():
    def __init__(self,scope):
        with tf.variable_scope(scope):
                    self.input_p = tf.placeholder(tf.float32, [None,5])
                    self.q_target = tf.placeholder(tf.float32, [None ,4], name='Q_target')
                    self.w1 = tf.get_variable('w1', [5,15], initializer=tf.random_uniform_initializer())
                    self.b1 = tf.get_variable('b1', [1,15], initializer=tf.zeros_initializer())
                    self.l1 = tf.nn.relu(tf.matmul(self.input_p,self.w1 ) + self.b1)

                    self.w2 = tf.get_variable('w2', [15, 15], initializer=tf.random_uniform_initializer())
                    self.b2 = tf.get_variable('b2', [1, 15], initializer=tf.zeros_initializer())
                    self.l2 = tf.nn.relu(tf.matmul(self.l1,self.w2) + self.b2)

                    self.w3 = tf.get_variable('w3', [15,4], initializer=tf.random_uniform_initializer())
                    self.b3 = tf.get_variable('b3', [1,4], initializer=tf.zeros_initializer())
                    self.Q = (tf.matmul(self.l2,self.w3) + self.b3)
                    
                    self.sqd=tf.squared_difference(self.q_target, self.Q)
                    self.loss = tf.reduce_mean(self.sqd)       
                    self.trainer = tf.train.RMSPropOptimizer(learning_rate=0.01)
                    self.updateModel = self.trainer.minimize(self.loss)        
# model = Sequential()
# model.add(Dense(10, input_shape=(5,) , init='uniform', activation='relu'))
#     #model.add(Flatten())       # Flatten input so as to have no problems with processing
# model.add(Dense(10, init='uniform', activation='relu'))
# model.add(Dense(4, init='uniform', activation='linear'))    # Same number of outputs as possible actions

# model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
# plot_model(model, to_file='model.png',show_shapes=True)
                          # Learning minibatch size

In [3]:
def do_action(action):
    #r = -World.score
    if action == 0:
        rew,s,s1,res=World.try_move(0, -1)
    elif action == 1:
        rew,s,s1,res=World.try_move(0, 1)
    elif action == 2:
        rew,s,s1,res=World.try_move(-1, 0)
    elif action == 3:
        rew,s,s1,res=World.try_move(1, 0)
    #r += World.score
    return rew,s,s1,res

In [4]:
def updateNetwork(target,main):
    a1=tf.assign(target.w1,main.w1.value())
    a2=tf.assign(target.w2,main.w2.value())
    a3=tf.assign(target.w3,main.w3.value())
    a4=tf.assign(target.b1,main.b1.value())
    a5=tf.assign(target.b2,main.b2.value())
    a6=tf.assign(target.b3,main.b3.value())
    return a1,a2,a3,a4,a5,a6
     
                
            

In [5]:
# FIRST STEP: Knowing what each action does (Observing)
def observe(sess,main):
    done = False
    state=World.findState()
    for t in range(observetime):
        if np.random.rand() <= epsilon:
            action = np.random.randint(0, 4)
        else:
            Qs = sess.run(main.Q,feed_dict={main.input_p:state})          # Q-values predictions
            action = np.argmax(Qs,axis=1)[0]             # Move with highest Q-value is the chosen one
        reward,state,state_new,res = do_action(action)     # See state of the game, reward... after performing the action
                 # Update the input with the new state of the game
        D.append((state, action, reward, state_new, res))
            
    print('Observing Finished')

In [6]:
# SECOND STEP: Learning from the observations (Experience replay)

                            # Sample some moves

def learn(sess,main,target):
    inputs = np.zeros((mb_size,5))
    targets = np.zeros((mb_size, 4))
    for j in range(5):
        minibatch = random.sample(D, mb_size) 
        for i in range(mb_size):
            state = minibatch[i][0]
            action = minibatch[i][1]
            reward = minibatch[i][2]
            state_new = minibatch[i][3]
            done = minibatch[i][4]

    # Build Bellman equation for the Q function
            inputs[i:i+1] = state   # Q-values predictions

            targets[i] = sess.run(target.Q,feed_dict={target.input_p:state})
            Q_sa = sess.run(main.Q,feed_dict={main.input_p:state_new})
            if done:
                targets[i, action] = reward
            else:
                targets[i, action] = reward + gamma * np.max(Q_sa,axis=1)
                
        a,b=sess.run([main.updateModel,main.loss],feed_dict={main.input_p:inputs,main.q_target:targets})
        print("Cost",b)
    sess.run(updateNetwork(target,main))    
    # Train network to output the Q function
    D.clear()
       
    print('Learning Finished')
    #print(inputs,targets)

In [7]:
# THIRD STEP: Play!
def test(sess,main):
    print("Evaluating")
    World.restart=True
    s1 = World.findState()
    done = False
    tot_reward = 0.0
    i=0
    while i<50:                 # Uncomment to see game running
        Qs = sess.run(main.Q,feed_dict={main.input_p:s1})
        action = np.argmax(Qs,axis=1)   
        #print(Q,action)
        rew,s,s1,res = do_action(action[0]) 
        tot_reward += rew
        print('Game : ',action,"   reward ",rew,"state",s,res)
        i+=1
        time.sleep(0.1)
        if res:
            break
    print('Game ended! Total reward: {}'.format(tot_reward))
    

In [8]:

class MyThread(threading.Thread):
    def run(self):
        process()   



In [None]:
f=False

def process():
    global epsilon,f
    main=Q_Network('main')
    target=Q_Network('target')
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for i in range(200):
            print('iteration ',i)
            World.restart=True
            observe(sess,main)
            learn(sess,main,target)
            if(i>150):
                World.start()
                test(sess,main)
                World.pause()
            epsilon -=0.03                              
t = MyThread()
t.daemon = True
t.start() 
World.start_game()

iteration  0
Observing Finished
Cost 407599.0
Cost 219143.0
Cost 137906.0
Cost 111686.0
Cost 143702.0
Learning Finished
iteration  1
Observing Finished
Cost 99055.0
Cost 116457.0
Cost 107038.0
Cost 92052.3
Cost 88811.2
Learning Finished
iteration  2
Observing Finished
Cost 66272.7
Cost 61279.2
Cost 84009.4
Cost 60226.7
Cost 54187.9
Learning Finished
iteration  3
Observing Finished
Cost 66751.4
Cost 45115.5
Cost 41751.8
Cost 44139.9
Cost 47581.3
Learning Finished
iteration  4
Observing Finished
Cost 14162.9
Cost 18517.6
Cost 9760.52
Cost 12850.8
Cost 12962.5
Learning Finished
iteration  5
Observing Finished
Cost 11533.7
Cost 10038.2
Cost 14393.2
Cost 21502.9
Cost 4903.22
Learning Finished
iteration  6
Observing Finished
Cost 63255.3
Cost 45288.6
Cost 57191.0
Cost 47831.8
Cost 41647.1
Learning Finished
iteration  7
Observing Finished
Cost 3822.04
Cost 3654.53
Cost 7883.15
Cost 9150.72
Cost 10758.6
Learning Finished
iteration  8
Observing Finished
Cost 5751.94
Cost 8972.07
Cost 7096.93
Co

Cost 0.161771
Cost 0.156204
Cost 0.258248
Cost 0.201683
Learning Finished
iteration  71
Observing Finished
Cost 0.245521
Cost 1.98394
Cost 1.42755
Cost 0.623491
Cost 0.14991
Learning Finished
iteration  72
Observing Finished
Cost 0.716162
Cost 0.347286
Cost 0.558043
Cost 0.402241
Cost 0.312993
Learning Finished
iteration  73
Observing Finished
Cost 0.0593878
Cost 0.0452531
Cost 0.0462186
Cost 0.103009
Cost 0.0763346
Learning Finished
iteration  74
Observing Finished
Cost 0.145296
Cost 0.133501
Cost 0.147513
Cost 0.215885
Cost 0.0601748
Learning Finished
iteration  75
Observing Finished
Cost 0.0383993
Cost 0.0612665
Cost 0.0498725
Cost 0.046352
Cost 0.041644
Learning Finished
iteration  76
Observing Finished
Cost 0.49547
Cost 0.330442
Cost 0.220879
Cost 0.476109
Cost 0.280521
Learning Finished
iteration  77
Observing Finished
Cost 0.0475894
Cost 0.177196
Cost 0.705874
Cost 0.555167
Cost 0.0711306
Learning Finished
iteration  78
Observing Finished
Cost 0.248173
Cost 0.285051
Cost 0.38840

Observing Finished
Cost 0.00568861
Cost 0.00175428
Cost 0.00461938
Cost 0.00208364
Cost 0.00432722
Learning Finished
iteration  136
Observing Finished
Cost 0.00239628
Cost 0.00220492
Cost 0.00114745
Cost 0.00341174
Cost 0.00160894
Learning Finished
iteration  137
Observing Finished
Cost 0.00198261
Cost 0.00347403
Cost 0.00125597
Cost 0.00155201
Cost 0.000932391
Learning Finished
iteration  138
Observing Finished
Cost 0.00652445
Cost 0.00836217
Cost 0.0122734
Cost 0.00582136
Cost 0.00438099
Learning Finished
iteration  139
Observing Finished
Cost 0.000985911
Cost 0.00276409
Cost 0.00197578
Cost 0.00318425
Cost 0.0011563
Learning Finished
iteration  140
Observing Finished
Cost 0.00161045
Cost 0.00285883
Cost 0.00114683
Cost 0.00118638
Cost 0.000598911
Learning Finished
iteration  141
Observing Finished
Cost 0.00105761
Cost 0.00194293
Cost 0.00260661
Cost 0.00218008
Cost 0.00436027
Learning Finished
iteration  142
Observing Finished
Cost 0.00477716
Cost 0.0035369
Cost 0.0695761
Cost 0.006

Game :  [0]    reward  0.02 state [[  0.   3.   2.  11.   3.]] False
Game :  [0]    reward  0.02 state [[  0.   2.   0.  12.   2.]] False
Game :  [0]    reward  0.02 state [[  0.   1.   0.  13.   1.]] False
Game :  [3]    reward  0.02 state [[  0.   0.  14.  14.   0.]] False
Game :  [3]    reward  0.02 state [[  1.  -3.  13.  14.   1.]] False
Game :  [3]    reward  0.02 state [[  2.  -3.  12.  14.   2.]] False
Game :  [3]    reward  0.02 state [[  3.  -4.  11.  14.   3.]] False
Game :  [3]    reward  0.02 state [[  4.  -7.  10.  14.   4.]] False
Game :  [3]    reward  0.02 state [[  5.   0.   9.  14.   5.]] False
Game :  [3]    reward  0.02 state [[  6.  -4.   8.  14.   6.]] False
Game :  [3]    reward  0.02 state [[  7.  -8.   7.  14.   7.]] False
Game :  [3]    reward  0.02 state [[  8.  -9.   6.  14.   8.]] False
Game :  [3]    reward  0.02 state [[  9.   0.   5.  14.   9.]] False
Game :  [3]    reward  0.02 state [[ 10.   0.   4.  14.  10.]] False
Game :  [3]    reward  0.02 state 

Game :  [1]    reward  -0.2 state [[  0.   1.   0.  13.   1.]] False
Game :  [0]    reward  -0.2 state [[  0.   2.   0.  12.   2.]] False
Game :  [1]    reward  -0.2 state [[  0.   1.   0.  13.   1.]] False
Game :  [0]    reward  -0.2 state [[  0.   2.   0.  12.   2.]] False
Game :  [1]    reward  -0.2 state [[  0.   1.   0.  13.   1.]] False
Game :  [0]    reward  -0.2 state [[  0.   2.   0.  12.   2.]] False
Game :  [1]    reward  -0.2 state [[  0.   1.   0.  13.   1.]] False
Game :  [0]    reward  -0.2 state [[  0.   2.   0.  12.   2.]] False
Game :  [1]    reward  -0.2 state [[  0.   1.   0.  13.   1.]] False
Game :  [0]    reward  -0.2 state [[  0.   2.   0.  12.   2.]] False
Game :  [1]    reward  -0.2 state [[  0.   1.   0.  13.   1.]] False
Game :  [0]    reward  -0.2 state [[  0.   2.   0.  12.   2.]] False
Game :  [1]    reward  -0.2 state [[  0.   1.   0.  13.   1.]] False
Game :  [0]    reward  -0.2 state [[  0.   2.   0.  12.   2.]] False
Game :  [1]    reward  -0.2 state 

Game :  [1]    reward  -0.2 state [[  0.   1.   0.  13.   1.]] False
Game :  [0]    reward  -0.2 state [[  0.   2.   0.  12.   2.]] False
Game :  [1]    reward  -0.2 state [[  0.   1.   0.  13.   1.]] False
Game :  [0]    reward  -0.2 state [[  0.   2.   0.  12.   2.]] False
Game :  [1]    reward  -0.2 state [[  0.   1.   0.  13.   1.]] False
Game :  [0]    reward  -0.2 state [[  0.   2.   0.  12.   2.]] False
Game :  [1]    reward  -0.2 state [[  0.   1.   0.  13.   1.]] False
Game :  [0]    reward  -0.2 state [[  0.   2.   0.  12.   2.]] False
Game :  [1]    reward  -0.2 state [[  0.   1.   0.  13.   1.]] False
Game :  [0]    reward  -0.2 state [[  0.   2.   0.  12.   2.]] False
Game :  [1]    reward  -0.2 state [[  0.   1.   0.  13.   1.]] False
Game :  [0]    reward  -0.2 state [[  0.   2.   0.  12.   2.]] False
Game :  [1]    reward  -0.2 state [[  0.   1.   0.  13.   1.]] False
Game :  [0]    reward  -0.2 state [[  0.   2.   0.  12.   2.]] False
Game :  [1]    reward  -0.2 state 

Game :  [2]    reward  -0.2 state [[  4.          -2.          10.           9.           6.40312433]] False
Game ended! Total reward: -7.140000000000004
iteration  166
Observing Finished
Cost 0.00500366
Cost 0.00548376
Cost 0.00540324
Cost 0.00915789
Cost 0.0079665
Learning Finished
Evaluating
Game :  [1]    reward  -0.5 state [[  0.  14.  14.   0.  14.]] True
Game ended! Total reward: -0.5
iteration  167
Observing Finished
Cost 0.00552004
Cost 0.00775737
Cost 0.00788034
Cost 0.0044261
Cost 0.00325415
Learning Finished
Evaluating
Game :  [3]    reward  0.02 state [[  0.  14.  14.   0.  14.]] False
Game :  [0]    reward  0.02 state [[  1.          11.          13.           0.          14.03566837]] False
Game :  [0]    reward  0.02 state [[  1.          10.          13.           1.          13.03840446]] False
Game :  [0]    reward  0.02 state [[  1.           9.          13.           2.          12.04159451]] False
Game :  [0]    reward  0.02 state [[  1.           8.          13. 

Game :  [3]    reward  -0.2 state [[  4.           4.          10.           3.          11.70469952]] False
Game :  [2]    reward  -0.2 state [[  5.          11.           9.           3.          12.08304596]] False
Game :  [3]    reward  -0.2 state [[  4.           4.          10.           3.          11.70469952]] False
Game :  [2]    reward  -0.2 state [[  5.          11.           9.           3.          12.08304596]] False
Game :  [3]    reward  -0.2 state [[  4.           4.          10.           3.          11.70469952]] False
Game :  [2]    reward  -0.2 state [[  5.          11.           9.           3.          12.08304596]] False
Game :  [3]    reward  -0.2 state [[  4.           4.          10.           3.          11.70469952]] False
Game :  [2]    reward  -0.2 state [[  5.          11.           9.           3.          12.08304596]] False
Game :  [3]    reward  -0.2 state [[  4.           4.          10.           3.          11.70469952]] False
Game :  [2]    rewa

Game :  [3]    reward  0.02 state [[  5.  12.   9.   2.  13.]] False
Game :  [3]    reward  0.02 state [[  6.           8.           8.           2.          13.41640759]] False
Game :  [3]    reward  0.02 state [[  7.           4.           7.           2.          13.89244366]] False
Game :  [1]    reward  0.02 state [[  8.           3.           6.           2.          14.42220497]] False
Game :  [3]    reward  0.02 state [[  8.           4.           6.           1.          15.26433754]] False
Game :  [1]    reward  0.02 state [[  9.          13.           5.           1.          15.81138802]] False
Game :  [3]    reward  0.02 state [[  9.          14.           5.           0.          16.64331627]] False
Game :  [3]    reward  0.02 state [[ 10.          14.           4.           0.          17.20465088]] False
Game :  [3]    reward  0.02 state [[ 11.          14.           3.           0.          17.80449295]] False
Game :  [1]    reward  -0.5 state [[ 12.          14.      

Game :  [3]    reward  0.02 state [[  8.           4.           6.           1.          15.26433754]] False
Game :  [3]    reward  0.02 state [[  9.          13.           5.           1.          15.81138802]] False
Game :  [3]    reward  0.02 state [[ 10.          13.           4.           1.          16.40122032]] False
Game :  [3]    reward  0.02 state [[ 11.          13.           3.           1.          17.02938652]] False
Game :  [3]    reward  0.02 state [[ 12.          13.           2.           1.          17.69180679]] False
Game :  [3]    reward  0.02 state [[ 13.          13.           1.           1.          18.38477707]] False
Game :  [3]    reward  -0.5 state [[ 14.          13.           0.           1.          19.10497284]] True
Game ended! Total reward: -0.2
iteration  178
Observing Finished
Cost 0.00151327
Cost 0.00213129
Cost 0.00114687
Cost 0.000967368
Cost 0.00208707
Learning Finished
Evaluating
Game :  [3]    reward  0.02 state [[  0.  14.  14.   0.  14.]] 

Game :  [3]    reward  -0.5 state [[ 14.           9.           0.           5.          16.64331627]] True
Game ended! Total reward: -0.11999999999999994
iteration  182
Observing Finished
Cost 0.000901372
Cost 0.000541675
Cost 0.00120113
Cost 0.000872807
Cost 0.00109617
Learning Finished
Evaluating
Game :  [3]    reward  0.02 state [[  0.  14.  14.   0.  14.]] False
Game :  [3]    reward  0.02 state [[  1.          11.          13.           0.          14.03566837]] False
Game :  [3]    reward  0.02 state [[  2.          11.          12.           0.          14.14213562]] False
Game :  [3]    reward  0.02 state [[  3.         10.         11.          0.         14.3178215]] False
Game :  [3]    reward  0.02 state [[  4.           7.          10.           0.          14.56021976]] False
Game :  [3]    reward  0.02 state [[  5.          14.           9.           0.          14.86606884]] False
Game :  [3]    reward  0.02 state [[  6.         10.          8.          0.         15.23

Game :  [3]    reward  0.02 state [[  9.          14.           5.           0.          16.64331627]] False
Game :  [3]    reward  0.02 state [[ 10.          14.           4.           0.          17.20465088]] False
Game :  [3]    reward  0.02 state [[ 11.          14.           3.           0.          17.80449295]] False
Game :  [3]    reward  0.02 state [[ 12.          14.           2.           0.          18.43908882]] False
Game :  [0]    reward  0.02 state [[ 13.          14.           1.           0.          19.10497284]] False
Game :  [0]    reward  0.02 state [[ 13.          13.           1.           1.          18.38477707]] False
Game :  [0]    reward  0.02 state [[ 13.          12.           1.           2.          17.69180679]] False
Game :  [1]    reward  -0.2 state [[ 13.          11.           1.           3.          17.02938652]] False
Game :  [0]    reward  -0.2 state [[ 13.          12.           1.           2.          17.69180679]] False
Game :  [1]    rewa

Game ended! Total reward: -0.22000000000000003
iteration  190
Observing Finished
Cost 0.000199361
Cost 0.00028427
Cost 0.000275129
Cost 0.000399083
Cost 0.000224732
Learning Finished
Evaluating
Game :  [3]    reward  0.02 state [[  0.  14.  14.   0.  14.]] False
Game :  [3]    reward  0.02 state [[  1.          11.          13.           0.          14.03566837]] False
Game :  [3]    reward  0.02 state [[  2.          11.          12.           0.          14.14213562]] False
Game :  [3]    reward  0.02 state [[  3.         10.         11.          0.         14.3178215]] False
Game :  [3]    reward  0.02 state [[  4.           7.          10.           0.          14.56021976]] False
Game :  [3]    reward  0.02 state [[  5.          14.           9.           0.          14.86606884]] False
Game :  [3]    reward  0.02 state [[  6.         10.          8.          0.         15.2315464]] False
Game :  [3]    reward  0.02 state [[  7.           6.           7.           0.          15.6

Game :  [3]    reward  -0.2 state [[  8.           0.           6.           5.          12.04159451]] False
Game :  [2]    reward  -0.2 state [[  9.           9.           5.           5.          12.72792244]] False
Game :  [3]    reward  -0.2 state [[  8.           0.           6.           5.          12.04159451]] False
Game :  [2]    reward  -0.2 state [[  9.           9.           5.           5.          12.72792244]] False
Game :  [3]    reward  -0.2 state [[  8.           0.           6.           5.          12.04159451]] False
Game :  [2]    reward  -0.2 state [[  9.           9.           5.           5.          12.72792244]] False
Game :  [3]    reward  -0.2 state [[  8.           0.           6.           5.          12.04159451]] False
Game :  [2]    reward  -0.2 state [[  9.           9.           5.           5.          12.72792244]] False
Game :  [3]    reward  -0.2 state [[  8.           0.           6.           5.          12.04159451]] False
Game :  [2]    rewa

Game :  [3]    reward  0.02 state [[  0.  13.  14.   1.  13.]] False
Game :  [3]    reward  0.02 state [[  1.          10.          13.           1.          13.03840446]] False
Game :  [3]    reward  0.02 state [[  2.          10.          12.           1.          13.15294647]] False
Game :  [3]    reward  0.02 state [[  3.           9.          11.           1.          13.34166431]] False
Game :  [3]    reward  0.02 state [[  4.           6.          10.           1.          13.60147095]] False
Game :  [3]    reward  0.02 state [[  5.         13.          9.          1.         13.9283886]] False
Game :  [0]    reward  0.02 state [[  6.          9.          8.          1.         14.3178215]] False
Game :  [0]    reward  0.02 state [[  6.           8.           8.           2.          13.41640759]] False
Game :  [0]    reward  0.02 state [[  6.           7.           8.           3.          12.52996445]] False
Game :  [0]    reward  0.02 state [[  6.           6.           8.   

Game :  [3]    reward  0.02 state [[ 11.          14.           3.           0.          17.80449295]] False
Game :  [3]    reward  0.02 state [[ 12.          14.           2.           0.          18.43908882]] False
Game :  [3]    reward  0.02 state [[ 13.          14.           1.           0.          19.10497284]] False
Game :  [3]    reward  -0.5 state [[ 14.          14.           0.           0.          19.79899025]] True
Game ended! Total reward: -0.22000000000000003
iteration  199
Observing Finished
Cost 4.70252e-05
Cost 4.02989e-05
Cost 6.27112e-05
Cost 3.38383e-05
Cost 5.55564e-05
Learning Finished
Evaluating
Game :  [3]    reward  0.02 state [[  0.  14.  14.   0.  14.]] False
Game :  [3]    reward  0.02 state [[  1.          11.          13.           0.          14.03566837]] False
Game :  [3]    reward  0.02 state [[  2.          11.          12.           0.          14.14213562]] False
Game :  [3]    reward  0.02 state [[  3.         10.         11.          0.       