In [1]:
import numpy as np
import tensorflow as tf

Packman Deep Q network

In [2]:
class PackmanDQN:
    def __init__(self, session, inputShape, outputShape, name):
        self._sess = session
        self._inputShape = list(inputShape)
        self._outputShape = outputShape
        self._name = name
        self.BuildNetwork()
    
    def BuildNetwork(self):
        with tf.variable_scope(self._name, reuse=tf.AUTO_REUSE):
            self._X = tf.placeholder(tf.float32, shape=[None, 210,160,3])
            
            #X shape = [None, 210, 160, 3]
            W1 = tf.get_variable(name="_Weight1", shape=[3, 3, 3, 64], initializer=tf.glorot_normal_initializer())
            L1 = tf.nn.conv2d(self._X, W1, strides=[1, 1, 1, 1], padding="SAME")
            L1 = tf.nn.relu(L1)
            L1 = tf.nn.max_pool(L1, ksize=[1, 2, 2, 1], strides=[1,4,4,1], padding="SAME")
            #shape reduces to [105, 80, 96]
            
            W2 = tf.get_variable(name="_Weight2", shape=[3, 3, 64, 128], initializer=tf.glorot_normal_initializer())
            L2 = tf.nn.conv2d(L1, W2, strides=[1, 1, 1, 1], padding="SAME")
            L2 = tf.nn.relu(L2)
            L2 = tf.nn.max_pool(L2, ksize=[1, 2, 2, 1], strides=[1,2,2,1], padding="SAME")
            #L2 = tf.layers.batch_normalization(L2)
            #shape reduces to [53, 40, 192]
            
            #W3 = tf.get_variable(name="filter3", shape=[3, 3, 192, 384], initializer=tf.glorot_normal_initializer())
            #L3 = tf.nn.conv2d(L2, W3, strides=[1,1,1,1], padding="SAME")
            #L3 = tf.nn.relu(L3)
            #L3 = tf.nn.max_pool(L3, ksize=[1, 2, 2, 1], strides=[1,2,2,1], padding="SAME")
            #L3 = tf.nn.dropout(L3, keep_prob=self._keep_prob)
            #L3 = tf.layers.batch_normalization(L3)
            #shape reduces to [27, 20, 256]
            
            W4 = tf.get_variable(name="_Weight4", shape=[3, 3, 128, 256], initializer=tf.glorot_normal_initializer())
            L4 = tf.nn.conv2d(L2, W4, strides=[1,1,1,1], padding="SAME")
            L4 = tf.nn.relu(L4)
            L4 = tf.nn.max_pool(L4, ksize=[1, 2, 2, 1], strides=[1,2,2,1], padding="SAME")
            #L4 = tf.layers.batch_normalization(L4)
            #shape reduces to [14, 10, 384]
            
            W5 = tf.get_variable(name="_Weight5", shape=[3, 3, 256, 256], initializer=tf.glorot_normal_initializer())
            L5 = tf.nn.conv2d(L4, W5, strides=[1,1,1,1], padding="SAME")
            L5 = tf.nn.relu(L5)
            L5 = tf.nn.max_pool(L5, ksize=[1, 2, 2, 1], strides=[1,2,2,1], padding="SAME")
            #L5 = tf.layers.batch_normalization(L5)
            #shape reduces to [7, 5, 384]
            L5 = tf.reshape(L5, [-1, 7*5*256])
            
            W6 = tf.get_variable(name="fc1", shape=[7*5*256, 256], initializer=tf.glorot_normal_initializer())
            b6 = tf.Variable(tf.random_normal(shape=[256], stddev=0.005), name="b6")
            H6 = tf.nn.relu(tf.matmul(L5, W6) + b6)
            
            #W7 = tf.get_variable(name="fc2", shape=[1280, 128], initializer=tf.glorot_normal_initializer())
            #b7 = tf.Variable(tf.random_normal(shape=[128], stddev=0.005), name="b7")
            #H7 = tf.nn.relu(tf.matmul(H6, W7) + b7)
            
            W8 = tf.get_variable(name="fc3", shape=[256, outputShape], initializer=tf.glorot_normal_initializer())
            b8 = tf.Variable(tf.random_normal(shape=[outputShape], stddev=0.005), name="b8")
            self._hypothesis = tf.matmul(H6, W8) + b8
            
            self._Y = tf.placeholder(tf.float32, shape=[None, outputShape])
            self._loss = tf.reduce_mean(tf.square(self._hypothesis - self._Y))
            self._train = tf.train.AdamOptimizer(0.001).minimize(self._loss)
        
    def predict(self, state, keep_prob):
        x = np.reshape(state, [1, 210, 160, 3])
        return self._sess.run(self._hypothesis, feed_dict={self._X: x})
    
    def update(self, x_stack, y_stack, keep_prob):
        return self._sess.run([self._loss, self._train], feed_dict={self._X: x_stack, self._Y: y_stack})
            

In [3]:
import gym
import matplotlib.pyplot as plt
import random
import tensorflow as tf
import time
from collections import deque

env = gym.make('MsPacman-v0')
inputShape = env.observation_space.shape
outputShape = env.action_space.n

maxEpisode= 100
dis = 0.9
replayList = deque()
replayMaxSize = 20000

def GetCopyVarOperation(predNetworkName="pred", targetNetworkName="target"):
    copyOperation =[]
    predVars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "pred")
    targetVars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "target")
    
    for predvar, targetvar in zip(predVars, targetVars):
        copyOperation.append(targetvar.assign(predvar))
    return copyOperation

def Replay(predNet, targetNet, batch):
    x_stack = np.empty(0).reshape(0, 210, 160, 3)
    y_stack = np.empty(0).reshape(0, outputShape)
    
    for state, action, reward, done, nextState in batch:
        #Q is [[4]] shaped array. Q[0, action] ==> nextState
        Q = predNet.predict(state)

        if done: Q[0, action] = reward
        else: Q[0, action] = reward + dis * np.max(targetNet.predict(nextState))

        x_stack = np.vstack([x_stack, [state]])
        y_stack = np.vstack([y_stack, Q])

    return predNet.update(x_stack, y_stack)

def BotPlay(predNet):
    state = env.reset()
    reward_sum = 0
    while True:
        env.render()
        action = np.argmax(predNet.predict(state))
        state, reward, done, _ = env.step(action)
        reward_sum += reward
        if done:
            print("Total Score: {}".format(reward_sum))
            break
    env.close()
    return 0

In [4]:
with tf.Session() as sess:
    predNet = PackmanDQN(sess, inputShape, outputShape, "pred")
    targetNet = PackmanDQN(sess, inputShape, outputShape, "target")

    save_file = './train_model.ckpt'
    saver = tf.train.Saver()
    saver.restore(sess, save_file)
    BotPlay(predNet)

INFO:tensorflow:Restoring parameters from ./train_model.ckpt


TypeError: predict() missing 1 required positional argument: 'keep_prob'

with tf.Session() as sess:
    predNet = PackmanDQN(sess, inputShape, outputShape, "pred")
    targetNet = PackmanDQN(sess, in putShape, outputShape, "target")
    copyOperation = GetCopyVarOperation("pred", "target")
    random.seed(time.time())
    
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    
    save_file = './train_model.ckpt'
    saver = tf.train.Saver()
    saver.restore(sess, save_file)
    reward_avg=0
    for episode in range(1, maxEpisode):
        
        e = 1. / ((episode / 10) + 1)
        state = env.reset()
        done = False
        rewardSum = 0
        
        while not done:
            if random.random() < e:
                action = random.randint(0, 8)
            else:
                action = np.argmax(predNet.predict(state, 0.5))
                
            nextState, reward, done, info = env.step(action)
            if done: reward = -100
                
            replayList.append((state, action, reward, done, nextState))
            if len(replayList) > replayMaxSize:
                replayList.popleft()
            
            rewardSum += reward
            reward_avg += reward/10
            state = nextState
            if rewardSum>5000: break
        
        print('Episode: {}, Reward: {}'.format(episode, rewardSum))
        
        if episode % 10 == 0:
            loss_avg=0
            batch = random.sample(replayList, 10)
            for i in range(50):
                loss, _ = Replay(predNet, targetNet, batch)
                loss_avg += loss / 50
            sess.run(copyOperation)
            print('Loss: {}, Average Reward: {}'.format(loss_avg, reward_avg))
            reward_avg=0
    
    saver.save(sess, save_file)
    BotPlay(predNet)