In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import os, sys
import random
import pickle
import argparse
import logging
import tensorflow as tf

# Seed the three RNGs this notebook uses (NumPy's global generator,
# TensorFlow, and the stdlib `random` module) with the same value.
np.random.seed(15)
tf.random.set_seed(15)
random.seed(15)

# Print arrays with 3 decimals and never abbreviate long arrays with "...".
np.set_printoptions(precision=3, threshold=np.inf)


In [None]:
class Maze(object):
    WALL = 2
    EMPTY = 8
    LEFT = 0
    RIGHT = 1 # right or forward
    BONUS = 1000
    def __init__(self, width, length): 
        self.length = length
        self.width = width
        self.maze = np.ones((self.width, self.length)) * Maze.WALL

        self.generate_maze()
        
        #set self.maze_mask
        #self.shortest_solutions
        self.get_shortest_solutions()
        
        #self.longest_shortest, used to calculate objective value
        self.get_longest_shortest_solutions()
        
        # used to normalize objective value
        self.best_score = self.get_attainable_score()

        #initialize the agent position in the maze
        self.reset()
        
        
        
    
    def generate_maze(self):
        # generate walls, doors
        
        spaces = np.random.randint(low=1, high=4, size=self.length)
        cum_spaces = np.cumsum(spaces) # leave the first col empty
 
        for ind, val in enumerate(cum_spaces):
            if val >= self.length-1:
                self.wall_position = cum_spaces[:ind]
                break
        if self.wall_position[0] > 1:
            self.wall_position[0] = 1
        if self.wall_position[-1] < self.length-1:
            self.wall_position = np.append(self.wall_position, self.length-1)
                
        self.road_position = np.array([]).astype(np.int)
        for ind in np.arange(self.length-1):
            if ind not in self.wall_position:
                self.road_position = np.append(self.road_position, ind)
        
        for i in self.road_position:
            self.maze[1:-1,i]=Maze.EMPTY
        
        self.door_position = np.random.randint(low=1, high=self.width-1, size=len(self.wall_position))
        #print(self.door_position)
    
        # get door position
        self.door_position = np.zeros(len(self.wall_position), dtype = np.int)
        self.door_position[-1] = np.random.randint(low=1, high=self.width-1) #1~self.width-2 available door position
        for ind in np.arange(len(self.wall_position)-2, -1, -1):
            if self.wall_position[ind] == self.wall_position[ind+1] -1: # two walls together
                self.door_position[ind] = self.door_position[ind+1]
                
            else:
                self.door_position[ind] = np.random.randint(low=1, high=self.width-1)
        
        # Fill door cue
        self.maze[ self.door_position[-1], self.wall_position[-1] ] = Maze.RIGHT # default last door due
        for i in np.arange(len(self.wall_position)-2, -1, -1):
            if self.door_position[i+1] < self.door_position[i]:
                self.maze[self.door_position[i], self.wall_position[i]] = Maze.LEFT
            else: 
                self.maze[self.door_position[i], self.wall_position[i]] = Maze.RIGHT
                
                
                
       
                
    def print_maze(self, x=-1, y=-1):
        if x>=0 and y>=0:
            tmp = self.maze[x,y]
            self.maze[x,y] = -1 # position of the agent
            
        print("  ", end="")    
        #for i in np.arange(self.length):
        #    print('%d ' % i, end='')
        print("\n")
        
        for j in np.arange(self.width):
            print('%d ' % j, end='')
            for i in np.arange(self.length):
            
                if self.maze[j,i]==Maze.WALL: # wall position
                    print('H ',end='')
                elif self.maze[j,i]==Maze.EMPTY:
                    print('  ',end='')# road
                elif self.maze[j,i]==-1:
                    print('T ',end='')
                    self.maze[x,y]= tmp
                else:
                    print('%d ' % self.maze[j,i], end='')
            print('\n')

        
    def get_shortest_solutions(self):
        # get the shortest length to the end of maze from each layer
        
        self.maze_mask = np.zeros(self.length, dtype=np.int)
        for ind, val in enumerate(self.wall_position):
            self.maze_mask[val] = self.door_position[ind]
       
        self.shortest_solutions = np.zeros(self.length, dtype=np.int)
        step = 0
        next_wall = self.length-1
        for ind in np.arange(self.length-2, -1, -1):
            if self.maze_mask[ind] == 0: # road
                step += 1
                self.shortest_solutions[ind] = step
            else: # wall
                step += np.abs(self.maze_mask[next_wall] - self.maze_mask[ind])+1 #1 out the door, +diff for vert.
                self.shortest_solutions[ind] = step
                next_wall = ind
        

    
    def get_distance_escape(self, x, y):
        # get the shortest distance to escape from the current position
        vertical_distance = 0
        if y in self.road_position:
            for next_wall_ind in np.arange(y+1, y+4, 1):
                if next_wall_ind in self.wall_position: break
            vertical_distance = np.abs(self.maze_mask[next_wall_ind] - x)
        return self.shortest_solutions[y]+vertical_distance
                

        
    def get_longest_shortest_solutions(self):
        # get the shortest length from corner of starting to the end out maze
        left = self.get_distance_escape(1,0)
        right = self.get_distance_escape(self.width-2,0)
        
        self.longest_shortest = np.maximum(left, right)+5 # higher than true value
    
    
    def get_attainable_score(self):
        position = []
        x = self.door_position[0] # in front of the first door
        y = 0
        position.append([x,y])
        
        score = np.float32(0)
        door_signal=self.maze[self.door_position[0], 1]
        r=[]
        self.steps = 0
        
        while True:
            pass_wall = False
            self.steps += 1
            if self.maze[x, y+1]!=Maze.WALL: # road
                y += 1
                pass_wall=True
                if y in self.wall_position:
                    door_signal = self.maze[x,y]
            else: # wall
                if door_signal == 0 and self.maze[x-1,y]==Maze.WALL: # init location make door signal no more signal
                    door_signal = 1
                if door_signal == 1 and self.maze[x+1,y]==Maze.WALL:
                    door_signal = 0
                x += int(door_signal*2-1)
                
            position.append([x,y])
            r.append((self.longest_shortest - self.get_distance_escape(x,y) )/self.longest_shortest-1+int(pass_wall))
            score += (self.longest_shortest - self.get_distance_escape(x,y) )/self.longest_shortest-1+int(pass_wall)
            if y == self.length-1:
                r[-1] += Maze.BONUS
                score += Maze.BONUS
                break
        
        #print(position)
        self.average_reward = np.mean(r)
     
        return score
        
    """
    def get_attainable_score(self):
        position = []
        x = self.door_position[0] # in front of the first door
        y = 0
        score = np.float32(0)
        pass_maze = 0
        door_signal=self.maze[self.door_position[0], 1]
        r=[]
        for _ in np.arange(300, -1, -1):
            position.append([x,y])
            if y != self.length-1:
                r.append((self.longest_shortest - self.get_distance_escape(x,y) )/self.longest_shortest + pass_maze)
                score += (self.longest_shortest - self.get_distance_escape(x,y) )/self.longest_shortest + pass_maze
            if self.maze[x, y+1]!=Maze.WALL: # road
                y += 1
                if y in self.wall_position:
                    door_signal = self.maze[x,y]
                if y == self.length-1:
                    pass_maze += 1
                    y=0
            else: # wall
                if door_signal == 0 and self.maze[x-1,y]==Maze.WALL: # init location make door signal no more signal
                    door_signal = 1
                if door_signal == 1 and self.maze[x+1,y]==Maze.WALL:
                    door_signal = 0
                x += int(door_signal*2-1)
        
        #print(position)
        self.average_reward = np.mean(r)
     
        return score
    """
    
    def reset(self):
        self.score = 0 
        
        self.position = np.array([self.door_position[-1], 0]) # in front of the last door
        self.trajectory = []
        self.trajectory.append(self.position)
        
        
        x, y = self.position
        observation = self.perception()
        
        return observation
        
    def perception(self):
        x, y = self.position
        observation = np.zeros(6)
        
        if self.maze[x,y+1] == Maze.WALL:
            observation[0]=1
        else: observation[0]=0
        
        if self.maze[x-1,y+1] == Maze.WALL:
            observation[1]=1
        else: observation[1]=0
        
        if self.maze[x+1,y+1] == Maze.WALL:
            observation[2] = 1
        else: observation[2]=0
        
        if self.maze[x-1,y] == Maze.WALL:
            observation[4]=1
        else: observation[4]=0
        
        if self.maze[x+1,y] == Maze.WALL:
            observation[5]=1
        else: observation[5]=0
        
        if y in self.wall_position:
            observation[3] = self.maze[x, y]
            
        return observation
            
    def step(self, action):
        
        x, y = self.position

        crash_wall = False
        pass_wall = False
        if action == 0: #down == 1 and up == 0:
            if self.maze[x+1,y]==Maze.WALL:
                crash_wall = True
 
            if  self.maze[x+1,y] != Maze.WALL:
                self.position = x+1, y
                self.trajectory.append(self.position)

        elif action == 1: #down == 0 and up == 1:
            if self.maze[x-1,y] == Maze.WALL:
                crash_wall = True
            
            if  self.maze[x-1,y] != Maze.WALL:
                self.position = x-1, y
                self.trajectory.append(self.position)


        elif action == 2: #down == 1 and up == 1 or down == 0 and up == 0:
        
            if self.maze[x,y+1] != Maze.WALL:
                pass_wall = True
                self.position = x,y+1
                self.trajectory.append(self.position)
            else:
                crash_wall = True
                
        #elif down == 0 and up == 0:
        #    self.position = x, y
        #    self.trajectory.append(self.position)
            
        
        x,y = self.position
        reward = (self.longest_shortest - self.get_distance_escape(x,y))/self.longest_shortest -1 

        reward += int(pass_wall) - int(crash_wall)

        self.score += reward    
        fitness = self.get_fitness()
        
  
        
        if y == self.length-1:# at the end of the maze 
            done = True
            observation_ = np.ones(6)
            reward += Maze.BONUS # the final reward should be larger than sum of small reward on the way
            self.score += Maze.BONUS
            fitness = self.get_fitness()
        else:
            done = False
            observation_ = self.perception()


        return observation_, reward, done
    
    
    
    def get_fitness(self):
        
        return self.score#/self.best_score 
    


In [None]:
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 14 10:38:25 2018

@author: lenovo
"""
import numpy as np

def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1. / denominator
    
def stable_sigmoid1(x):
    """Logistic function for a scalar, computed as exp(-log(1 + exp(-x)))."""
    import math
    log_denominator = np.logaddexp(0, -x)
    return math.exp(-log_denominator)

def stable_sigmoid2(x):
    """Logistic function for a scalar that only exponentiates non-positive values."""
    if not x >= 0:
        # Negative input: exp(x) is small, so use exp(x) / (1 + exp(x)).
        z = np.exp(x)
        return z /(1+z)
    return 1. /(1 + np.exp(-x))

def sigmoid_derivative (value):
    """Derivative of the logistic function, given its *output* ``value``."""
    complement = 1-value
    return value * complement

def tanh_derivative(value):
    """Derivative of tanh, given its *output* ``value``."""
    squared = value ** 2
    return 1. - squared

def random_init(a, b, *args):
    """Return an array of shape ``args`` drawn uniformly from ``[a, b)``.

    Values come from NumPy's global RNG.  The function no longer reseeds that
    generator on every call: doing so made every array of the same shape
    identical (all LSTM gate weights started out equal) and reset the random
    stream for the rest of the program.  Seed once, outside, for
    reproducibility.

    Args:
        a: lower bound (inclusive).
        b: upper bound (exclusive).
        *args: dimensions of the result, passed to ``np.random.rand``.
    """
    return np.random.rand(*args) * (b - a) + a

class LSTMParam:
    """Weights, biases and accumulated gradients of one LSTM layer.

    For each gate ``g``, ``i``, ``f`` and ``o`` there is a weight matrix
    ``W?`` of shape ``(hidden_size, hidden_size + x_dim)``, a bias ``b?`` of
    shape ``(hidden_size,)`` and a same-shaped gradient accumulator
    ``W?_diff`` / ``b?_diff``.
    """

    _WEIGHT_NAMES = ('Wg', 'Wi', 'Wf', 'Wo')
    _BIAS_NAMES = ('bg', 'bi', 'bf', 'bo')

    def __init__(self, hidden_size, x_dim):
        self.hidden_size = hidden_size
        self.x_dim = x_dim
        concate_dim = hidden_size + x_dim

        # Network parameters, drawn from [-0.1, 0.1).
        for name in self._WEIGHT_NAMES:
            setattr(self, name, random_init(-0.1, 0.1, hidden_size, concate_dim))
        for name in self._BIAS_NAMES:
            setattr(self, name, random_init(-0.1, 0.1, hidden_size))

        # Gradient accumulators, filled by LSTMnode.BPTT.
        for name in self._WEIGHT_NAMES:
            setattr(self, name + '_diff', np.zeros((hidden_size, concate_dim)))
        for name in self._BIAS_NAMES:
            setattr(self, name + '_diff', np.zeros(hidden_size))

    def apply_diff(self, sample_cnt, lr=1):
        """Take one gradient-descent step and clear the accumulators.

        Args:
            sample_cnt: number of samples the gradients were summed over; the
                step is divided by it.
            lr: learning rate.
        """
        for name in self._WEIGHT_NAMES + self._BIAS_NAMES:
            param = getattr(self, name)
            param -= getattr(self, name + '_diff') * lr / sample_cnt
            # Reset the derivative for the next accumulation round.
            setattr(self, name + '_diff', np.zeros_like(param))
        

        
        
class LSTMstate:
    """Per-time-step activations of an LSTM cell plus the gradients passed back.

    ``g``, ``i``, ``f``, ``o`` are the gate activations, ``s`` the cell state
    and ``h`` the hidden output; ``diff_h`` / ``diff_s`` hold the gradients
    handed to the previous time step.  All start as zero vectors of length
    ``hidden_size``.
    """

    def __init__(self, hidden_size):
        for name in ('g', 'i', 'f', 'o', 's', 'h'):
            setattr(self, name, np.zeros(hidden_size))

        self.diff_h = np.zeros_like(self.h)
        self.diff_s = np.zeros_like(self.s)
        
class LSTMnode:
    """One unrolled time step of the LSTM: shared parameters plus its own state."""

    def __init__(self, LSTMParam, LSTMstate):
        self.Param = LSTMParam
        self.state = LSTMstate
        self.concate_x = None

    def forward(self, x, h_prev = None, s_prev = None):
        """Run the cell on input ``x``, given the previous hidden and cell state.

        Missing previous states default to zero vectors.  The results are
        written into ``self.state``; the concatenated input ``[x, h_prev]`` is
        kept for the backward pass.
        """
        if s_prev is None:
            s_prev = np.zeros_like(self.state.s)
        if h_prev is None:
            h_prev = np.zeros_like(self.state.h)
        self.s_prev, self.h_prev = s_prev, h_prev

        params, cell = self.Param, self.state
        # The input comes first, then the previous hidden state.
        stacked = np.hstack((x, h_prev))

        def pre_activation(weight, bias):
            return np.dot(weight, stacked) + bias

        # Candidate uses tanh; the three gates use the sigmoid.
        cell.g = np.tanh(pre_activation(params.Wg, params.bg))
        cell.i = sigmoid(pre_activation(params.Wi, params.bi))
        cell.f = sigmoid(pre_activation(params.Wf, params.bf))
        cell.o = sigmoid(pre_activation(params.Wo, params.bo))
        cell.s = cell.f * s_prev + cell.i * cell.g
        cell.h = cell.s * cell.o
        self.concate_x = stacked

    def BPTT(self, diff_h, diff_s):
        """Back-propagate through this time step.

        Args:
            diff_h: gradient of the loss w.r.t. this step's hidden output.
            diff_s: gradient w.r.t. this step's cell state coming from the
                following time step.

        Adds the parameter gradients to the ``*_diff`` accumulators of
        ``self.Param`` and stores the gradients for the previous step in
        ``self.state.diff_h`` / ``self.state.diff_s``.
        """
        params, cell = self.Param, self.state
        ds = cell.o * diff_h + diff_s

        # Gradient w.r.t. each gate's pre-activation (activation derivative
        # times the gradient of the gate output).
        gate_inputs = (
            ('g', tanh_derivative(cell.g) * (cell.i * ds)),
            ('i', sigmoid_derivative(cell.i) * (cell.g * ds)),
            ('f', sigmoid_derivative(cell.f) * (self.s_prev * ds)),
            ('o', sigmoid_derivative(cell.o) * (cell.s * diff_h)),
        )

        diff_concate_x = np.zeros_like(self.concate_x)
        for gate, delta in gate_inputs:
            weight_grad = getattr(params, 'W%s_diff' % gate)
            weight_grad += np.outer(delta, self.concate_x)
            bias_grad = getattr(params, 'b%s_diff' % gate)
            bias_grad += delta
            diff_concate_x += np.dot(getattr(params, 'W' + gate).T, delta)

        # Only the hidden-state part of the concatenated input flows backwards.
        cell.diff_h = diff_concate_x[params.x_dim:]
        cell.diff_s = ds * cell.f
         
        
class LSTMnetwork:
    """An LSTM unrolled over the inputs fed so far, sharing one parameter set.

    ``predict`` appends one time step per call; ``get_loss`` then runs
    back-propagation through time over the whole stored sequence.
    """

    def __init__(self, Param):
        self.n_output = 3
        self.Param = Param
        self.x_list = []
        self.node_list = []

    def x_list_clear(self):
        """Forget the stored inputs and their unrolled nodes."""
        self.x_list = []
        self.node_list = []

    def predict(self, x):
        """Feed the next input ``x`` and run the forward pass of its time step."""
        self.x_list.append(x)
        if len(self.x_list) > len(self.node_list):
            self.node_list.append(LSTMnode(self.Param, LSTMstate(self.Param.hidden_size)))

        idx = len(self.x_list)-1
        node = self.node_list[idx]
        if idx == 0:
            node.forward(self.x_list[idx])
        else:
            previous = self.node_list[idx-1].state
            # Pass by keyword: forward() takes (x, h_prev, s_prev), and the
            # former positional call handed the two states over swapped.
            node.forward(self.x_list[idx], h_prev=previous.h, s_prev=previous.s)

    def get_loss(self, y_list, loss_layer):
        """Accumulate gradients for the stored sequence and return the summed loss.

        Args:
            y_list: one target per stored input.
            loss_layer: object providing ``loss(pred, label)`` and
                ``diff(pred, label)``; ``pred`` is a node's hidden state ``h``.

        Returns:
            The sum of ``loss_layer.loss`` over all time steps.
        """
        assert len(self.x_list) == len(y_list)
        idx = len(self.x_list)-1
        pred = self.node_list[idx].state.h
        loss = loss_layer.loss(pred, y_list[idx])
        diff_h = loss_layer.diff(pred, y_list[idx])

        # The last step receives no cell-state gradient from the future.
        diff_s = np.zeros(self.Param.hidden_size)
        self.node_list[idx].BPTT(diff_h, diff_s)

        idx -= 1

        while idx >= 0:
            loss += loss_layer.loss(self.node_list[idx].state.h, y_list[idx])
            diff_h = loss_layer.diff(self.node_list[idx].state.h, y_list[idx])
            # Add the gradient flowing back from the following time step.
            diff_h += self.node_list[idx+1].state.diff_h
            diff_s = self.node_list[idx+1].state.diff_s
            self.node_list[idx].BPTT(diff_h, diff_s)
            idx -= 1

        return loss


In [None]:
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 14 22:25:14 2018

@author: Zhichen
2018/3/16 linear regression layer parameters are too large, solved by dividing the number of time points.
RNN don't need to devide sample number, but linear regression must.
"""

import numpy as np
import matplotlib.pyplot as plt

#from mylstm import LSTMParam, LSTMnetwork



class linear_regression_layer:
    """Linear read-out ``W . h + b`` with a squared-error loss.

    Every method, including ``__init__``, is a classmethod, so all state
    (``W``, ``b``, the gradient accumulators and the last ``output``) lives on
    the class itself.  "Instantiating" the class only (re)initialises that
    shared state, and callers pass the class object as the loss layer.
    """

    @classmethod
    def random_init(cls, a, b, *args):
        """Return an array of shape ``args`` drawn uniformly from ``[a, b)``.

        Uses NumPy's global RNG without reseeding it, so a seed set by the
        caller stays in effect.
        """
        return np.random.rand(*args) * (b - a) + a

    @classmethod
    def __init__(cls, out_dim, in_dim):
        cls.W = cls.random_init(-0.1, 0.1, in_dim)
        cls.b = cls.random_init(-0.1, 0.1, out_dim)

        cls.diff_W = np.zeros_like(cls.W)
        cls.diff_b = np.zeros_like(cls.b)

    @classmethod
    def loss(cls, inputs, label):
        """Compute the squared error of the read-out and remember the output."""
        cls.output = np.dot(cls.W, inputs) + cls.b
        return (cls.output - label) ** 2

    @classmethod
    def diff(cls, pred, label):
        """Accumulate the layer's gradients and return the gradient w.r.t. ``pred``.

        Relies on ``cls.output`` set by the preceding ``loss`` call.
        """
        d_input = 2 * (cls.output - label)

        cls.diff_W += d_input * pred
        cls.diff_b += np.sum(d_input)

        return cls.W * d_input

    @classmethod
    def output_layer_diff(cls, sample_cnt, lr = 1):
        """Take one gradient step averaged over ``sample_cnt`` and clear the accumulators."""
        cls.W -= lr * cls.diff_W / sample_cnt
        cls.b -= lr * cls.diff_b / sample_cnt

        cls.diff_W = np.zeros_like(cls.W)
        cls.diff_b = np.zeros_like(cls.b)

        
        
def example_LR():
    """Train the LSTM with the linear read-out on a 4-step toy sequence and plot the loss.

    Runs 100 epochs on the same random inputs, printing the first hidden unit
    of every time step and the epoch loss, then shows the loss curve.
    """
    np.random.seed(0)

    x_dim = 50
    y_dim = 1
    hidden_size = 50
    num_epoch = 100

    Lstmparam = LSTMParam(hidden_size, x_dim)
    Lstmnet = LSTMnetwork(Lstmparam)
    # "Instantiating" the layer initialises its class-level weights; the class
    # object itself is what gets passed around as the loss layer.
    linear_regression_layer(y_dim, hidden_size)

    # Generate a dataset (a single sequence).
    y_list = [-0.5, 0.2, 0.1, -0.5]
    x_list = [np.random.random(x_dim) for _ in y_list]

    # The data is the same every epoch, which makes the derivatives easy to check.
    epoch_loss = []
    for iter_epoch in range(num_epoch):
        print("iter %2s" % iter_epoch, end=": ")
        for ind in range(len(y_list)):
            Lstmnet.predict(x_list[ind])

        # NOTE: this prints the first hidden unit, not the read-out's output.
        print("y_pred = [" +
              ", ".join(["% 2.5f" % Lstmnet.node_list[ind].state.h[0] for ind in range(len(y_list))]) +
              "]", end=", ")

        # The layer returns a length-1 array; reduce it to a plain float so it
        # can be formatted and plotted.
        loss = float(np.sum(Lstmnet.get_loss(y_list, linear_regression_layer)))
        print("loss:", "%.3e" % loss)
        epoch_loss.append(loss)

        Lstmparam.apply_diff(len(y_list), lr = 0.1)
        linear_regression_layer.output_layer_diff(len(y_list), lr = 0.1)
        Lstmnet.x_list_clear()

    plt.figure()
    plt.plot(range(num_epoch), epoch_loss)
    plt.xlabel('Training Epoches')
    plt.ylabel('Loss')
    plt.title('LSTM Training Loss')

    plt.show()

    
    
    
class Euclidean_loss_layer(object):
    """Squared-error loss on the first ``n_output`` entries of a prediction vector."""

    def __init__(self, n_output):
        self.n_output = n_output

    def _residual(self, pred, label):
        # Only the leading n_output entries take part in the loss.
        return pred[:self.n_output] - label

    def loss(self, pred, label):
        """Return the sum of squared differences between the outputs and ``label``."""
        residual = self._residual(pred, label)
        return np.sum(residual ** 2)

    def diff(self, pred, label):
        """Return the loss gradient w.r.t. ``pred``; entries past ``n_output`` are zero."""
        gradient = np.zeros_like(pred)
        gradient[:self.n_output] = 2 * self._residual(pred, label)
        return gradient


def example():
    """Train the LSTM on a 4-step toy sequence with the Euclidean loss and plot the loss.

    The first hidden unit is the prediction.  The same random inputs are used
    in all 100 epochs, which makes the derivatives easy to check.
    """
    np.random.seed(0)

    x_dim, hidden_size = 10, 5

    Lstmparam = LSTMParam(hidden_size, x_dim)
    Lstmnet = LSTMnetwork(Lstmparam)
    loss_layer = Euclidean_loss_layer(n_output = 1)

    targets = [-0.5, 0.2, 0.1, -0.5]
    inputs = [np.random.random(x_dim) for _ in targets]
    seq_len = len(targets)

    num_epoch = 100
    eps_loss = []
    for iter_epoch in range(num_epoch):
        print("iter", "%2s" % str(iter_epoch), end=": ")
        for sample in inputs:
            Lstmnet.predict(sample)

        formatted = ["% 2.5f" % node.state.h[0] for node in Lstmnet.node_list[:seq_len]]
        print("y_pred = [" + ", ".join(formatted) + "]", end=", ")

        loss = Lstmnet.get_loss(targets, loss_layer)
        print("loss:", "%.3e" % loss)
        eps_loss.append(loss)

        Lstmparam.apply_diff(seq_len, lr = 0.1)
        Lstmnet.x_list_clear()

    plt.figure()
    plt.plot(range(num_epoch), eps_loss)
    plt.xlabel('Training Epoches')
    plt.ylabel('Loss')
    plt.title('LSTM Training Loss')
    plt.show()
    
#if __name__ == "__main__":
    #example()
    

Begin DQN here

In [None]:
import numpy as np
import tensorflow as tf
import random

class CalculateEpsilon:
    """Exponentially decaying exploration rate.

    Each call returns the current epsilon and then multiplies the stored
    value by ``decay_rate``.
    """

    def __init__(self, init_eps, decay_rate=0.99999):
        self.init_eps = init_eps
        self.decay_rate = decay_rate

    def __call__(self):
        current = self.init_eps
        self.init_eps = current * self.decay_rate
        return current
    
    
"""======================================================================="""


def prepare_batch(experience_replay, batch_size):
    """Return ``batch_size`` distinct transitions drawn at random from the buffer.

    The buffer itself is left untouched.  The previous version shuffled it in
    place, which scrambled the age order a bounded replay buffer relies on to
    evict its oldest entries.

    Args:
        experience_replay: sequence (list, deque, ...) of stored transitions.
        batch_size: number of transitions to draw; a non-positive value gives
            an empty list.

    Raises:
        IndexError: if ``batch_size`` exceeds the number of stored transitions.
    """
    if batch_size <= 0:
        return []
    if batch_size > len(experience_replay):
        raise IndexError("batch_size exceeds the number of stored transitions")
    return random.sample(list(experience_replay), batch_size)

def prepare_batch_seq(experience_replay):
    """Return the replay data unchanged (sequence batches need no sampling here)."""
    return experience_replay

def preprocess_observation(observation):
    """Placeholder for observation preprocessing; currently returns the input unchanged."""
    return observation


"""======================================================================="""

def run_agent_greedy_eps(agent, latest_obs, epsilon, environment, debug=False):
    """
    Pick an action for the DQN agent with epsilon-greedy exploration.

    The observation is fed to the agent directly.  With probability
    ``epsilon`` a uniformly random action is chosen; otherwise the action with
    the highest predicted Q-value, with ties broken at random.

    Args:
        agent -- object with ``call(obs)`` and ``n_output``; only the first
            ``n_output`` entries of the returned vector are used as Q-values.
        latest_obs -- latest observation.
        epsilon -- current exploration probability.
        environment -- the current environment (not used by this function).
        debug -- if True, print the predicted Q-values.

    Returns:
        action_int -- action as an integer, for executing in the environment
        action -- one-hot representation of the action, shape (1, n_actions)
        Q_value -- the predicted Q-values
    """
    Q_value = agent.call(latest_obs)[:agent.n_output]
    if debug:
        print(Q_value)

    n_actions = len(Q_value)
    explore = random.random() < epsilon
    if explore:
        action_int = np.random.choice(n_actions)
    else:
        best_actions = np.argwhere(Q_value == np.max(Q_value)).flatten().tolist()
        action_int = np.random.choice(best_actions)

    action = np.zeros((1, n_actions))
    action[0, action_int] = 1
    return action_int, action, Q_value


def calculate_training_loss(agent_mirror, target_mirror, current_obs, actions, next_obs, rewards,
                            is_done_mask, gamma, n_updates=5, lr=0.01):
    """
    Fit the agent to the one-step Q-learning targets of a single sequence.

    Works on deep copies of both networks.  Each of the ``n_updates`` passes
    replays the sequence, builds the targets, back-propagates and applies one
    gradient step.  Afterwards the trained parameters are copied back into
    ``agent_mirror``; its stored inputs and nodes are left untouched.

    Args:
        agent_mirror -- the agent to train; only its ``model.Param`` is replaced.
        target_mirror -- the target Q-network (never modified).
        current_obs -- array of observations, one row per time step
        actions -- chosen action index per time step
        next_obs -- array of observations after taking each action
        rewards -- reward received per time step
        is_done_mask -- multiplier on the bootstrap term; the code expects 0
            where the transition ended the episode and 1 otherwise
        gamma -- discount factor
        n_updates -- number of gradient passes over the sequence
        lr -- learning rate of each pass

    Returns:
        loss -- training loss of the last pass (None if ``n_updates`` is 0)
    """
    agent = copy.deepcopy(agent_mirror)
    agent.model.x_list_clear()
    target_agent = copy.deepcopy(target_mirror)
    target_agent.model.x_list_clear()

    T = len(current_obs)
    loss = None
    for _ in range(n_updates):
        # Forward pass of the trained network over the whole sequence.
        for ind in range(T):
            agent.model.predict(current_obs[ind, :])

        # Build the label sequence: keep the network's own Q-values everywhere
        # except at the action taken, which receives the bootstrapped target.
        target = np.zeros((T, agent.n_output))
        for ind in range(T):
            next_Q_value = target_agent.call(next_obs[ind, :])[:agent.n_output]
            next_Q = np.max(next_Q_value)

            Q_target = rewards[ind] + gamma * next_Q * is_done_mask[ind]

            target[ind, :] = agent.model.node_list[ind].state.h[:agent.n_output]
            target[ind, actions[ind]] = Q_target

        loss = agent.model.get_loss(target, agent.loss_layer)

        agent.model.Param.apply_diff(len(target), lr = lr)
        agent.model.x_list_clear()
        target_agent.model.x_list_clear()

    agent_mirror.model.Param = copy.deepcopy(agent.model.Param)

    return loss


In [None]:
class LSTMQModel(object):
    """Q-network built on the hand-written LSTM.

    The LSTM takes 6-dimensional observations and has 64 hidden units; the
    first ``n_output`` (3) hidden units are treated as the Q-values by the
    Euclidean loss layer.
    """

    def __init__(self):
        super().__init__()
        self.n_output = 3
        hidden_size, x_dim = 64, 6

        self.model = LSTMnetwork(LSTMParam(hidden_size, x_dim))
        self.loss_layer = Euclidean_loss_layer(n_output = self.n_output)

    def call(self, obs):
        """Feed ``obs`` as the next time step and return that step's full hidden state."""
        self.model.predict(obs)
        latest_node = self.model.node_list[-1]
        return latest_node.state.h
                
        #loss = Lstmnet.get_loss(y_list, loss_layer)
        
        #Lstmparam.apply_diff(len(y_list),lr = 0.1)
        #Lstmnet.x_list_clear()
        
class Memory():
    """Bounded replay buffer of whole episodes; the oldest episode is dropped when full."""

    def __init__(self,memsize):
        self.memsize = memsize
        self.memory = collections.deque(maxlen=self.memsize)

    def add_episode(self,epsiode):
        """Append one episode (a sequence of transitions) to the buffer."""
        self.memory.append(epsiode)

    def get_batch(self, batch_size, time_step):
        """Sample ``batch_size`` episodes and cut a random window of ``time_step`` steps from each.

        Returns the windows stacked along a new first axis.
        """
        windows = []
        for episode in random.sample(self.memory, batch_size):
            # Random start such that the window still fits inside the episode.
            start = np.random.randint(0, len(episode)+1-time_step)
            windows.append(episode[start:start+time_step])

        return np.stack(windows)

def save_obj(obj, name ):
    """Pickle ``obj`` to the file ``name`` with ``.pkl`` appended."""
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle)

In [None]:
import tensorflow as tf

import itertools
import collections
import copy
import random

import argparse
import logging
import os, sys

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
"""
from RL.models.cartpole import CartpoleModel
from RL.DeepQ.Exploration import CalculateEpsilon
from RL.DeepQ.Agent import run_agent_greedy_eps, calculate_training_loss
from RL.DeepQ.utils import prepare_batch, preprocess_cartpole
"""
np.random.seed(48)
tf.random.set_seed(48)
random.seed(48)
#logger = logging.getLogger(os.path.basename(sys.argv[0]))

import gym

def _soft_update_target(target_agent, agent, rate):
    """Blend the online LSTM parameters into the target network, in place.

    Every weight/bias becomes ``(1 - rate) * target + rate * online``.
    """
    target_param = target_agent.model.Param
    online_param = agent.model.Param
    for param_name in ('Wg', 'Wi', 'Wf', 'Wo', 'bg', 'bi', 'bf', 'bo'):
        blended = (1 - rate) * getattr(target_param, param_name) \
            + rate * getattr(online_param, param_name)
        setattr(target_param, param_name, blended)


def _collect_random_episode(n_actions, max_step, maze_width, maze_length):
    """Roll out one episode in a fresh maze using uniformly random actions.

    Returns a list of ``(observation, action, reward, next_observation, is_done)``
    tuples containing at most ``max_step`` transitions.
    """
    env = Maze(maze_width, maze_length)
    observation = preprocess_observation(env.reset())
    episode = []
    step = 0
    while True:
        action_int = np.random.randint(0, n_actions)

        next_observation, reward, is_done = env.step(action_int)
        next_observation = preprocess_observation(next_observation)
        step += 1

        episode.append((observation, action_int, reward, next_observation, is_done))
        observation = next_observation

        # Same cut-off rule as the training loop, so no episode exceeds max_step.
        if is_done == True or step >= max_step:
            break
    return episode


def _plot_reward_curve(reward_curve):
    """Draw the per-episode performance curve on a new figure and return it."""
    fig, ax = plt.subplots()
    ax.plot(np.arange(len(reward_curve)), reward_curve, 'r-')
    ax.set_xlabel("Number of Episodes")
    ax.set_ylabel("Performance")
    return fig


def play(replay_size=1000, gamma=0.99, eps_train=30000, batch_size=64,
         learning_rate=1e-4, output_save_img='./Fig/'+'LstmDQN.pdf',
         target_rate=0.01, max_step=300, update_target_freq=10000,
         update_freq=16, save_freq=100, seq_len=32):
    """Train the LSTM Q-network on randomly generated mazes and plot its reward curve.

    The run has two phases:

    1. Warm-up: ``replay_size`` episodes of uniformly random actions fill the
       episode replay buffer.
    2. Training: ``eps_train`` episodes are played with ``run_agent_greedy_eps``.
       Every ``update_freq`` environment steps a batch of ``batch_size``
       sub-sequences of length ``seq_len`` is replayed on a copy of the agent,
       regressing the taken action's Q-value towards
       ``reward + gamma * max(target Q(next_obs)) * not_done``; the learned
       parameters are then copied back. Every ``update_target_freq`` steps the
       target network moves towards the online one by ``target_rate``.

    Every ``save_freq`` episodes the statistics, replay buffer and agent are
    pickled under ``./Fig/`` and the reward curve is saved.

    Args:
        replay_size: capacity of the replay buffer, in episodes.
        gamma: discount factor.
        eps_train: number of training episodes.
        batch_size: sequences replayed per learning step.
        learning_rate: step size passed to ``Param.apply_diff``.
        output_save_img: path of the reward-curve image, or ``None`` to skip saving it.
        target_rate: interpolation weight of the soft target update.
        max_step: maximum number of steps per episode.
        update_target_freq: environment steps between target updates.
        update_freq: environment steps between learning steps.
        save_freq: episodes between checkpoints.
        seq_len: length of each replayed sub-sequence.
    """
    maze_width, maze_length = 10, 50

    # Create the output folders up front so a missing directory fails fast
    # instead of at the first checkpoint, after training has already run.
    os.makedirs('./Fig', exist_ok=True)
    if output_save_img is not None:
        image_dir = os.path.dirname(output_save_img)
        if image_dir:
            os.makedirs(image_dir, exist_ok=True)

    # -------------------------------------------
    agent = LSTMQModel()
    target_agent = copy.deepcopy(agent)
    experience_replay = Memory(replay_size)

    # NOTE(review): only the initial value is passed here; any decay schedule is
    # whatever CalculateEpsilon implements internally -- confirm.
    epsilon = CalculateEpsilon(0.5)

    reward_curve = np.zeros(eps_train)
    loss_stat = []

    # ------------------Fill memory-------------------------
    for _ in range(replay_size):
        experience_replay.add_episode(
            _collect_random_episode(agent.n_output, max_step, maze_width, maze_length)
        )
    print('Populated with %d Episodes' % len(experience_replay.memory))

    # -------------------Training------------------------
    total_step = 0

    for current_eps_num in range(eps_train):
        # Progress: one dot per episode, line break every 50 episodes.
        print('.',end='')
        if current_eps_num > 0 and current_eps_num % 50 == 0:
            print(' ')

        env = Maze(maze_width, maze_length)
        observation = preprocess_observation(env.reset())
        eps_reward = []
        step = 0
        experience_replay_eps = []

        while True:
            # Only the chosen action index is needed; the one-hot encoding and
            # Q-values returned alongside it are not used here.
            action_int, _, _ = run_agent_greedy_eps(agent, observation, epsilon(), env)

            next_observation, reward, is_done = env.step(action_int)
            next_observation = preprocess_observation(next_observation)

            eps_reward.append(reward)
            step += 1
            total_step += 1

            experience_replay_eps.append(
                (observation, action_int, reward, next_observation, is_done)
            )
            observation = next_observation

            # ---------------- target network update ----------------
            if total_step % update_target_freq == 0:
                _soft_update_target(target_agent, agent, target_rate)

            # ---------------------- learning -----------------------
            if total_step % update_freq == 0:
                # Train on a copy so the acting agent's recurrent state for the
                # ongoing episode is left untouched; only parameters are copied back.
                agent_tmp = copy.deepcopy(agent)
                agent_tmp.model.x_list_clear()

                batch = experience_replay.get_batch(batch_size=batch_size, time_step=seq_len)
                for sequence in batch:
                    all_observation, all_actions, all_reward, \
                        all_next_observation, all_is_done = zip(*prepare_batch_seq(sequence))

                    current_obs = np.stack(np.squeeze(all_observation))
                    actions = np.stack(np.squeeze(all_actions))
                    next_obs = np.stack(np.squeeze(all_next_observation))
                    rewards = np.array(np.squeeze(all_reward))
                    # 1 while the episode continues, 0 on terminal transitions.
                    not_done_mask = np.array(
                        [not done for done in np.squeeze(all_is_done)]
                    ).astype(np.int32)

                    T = len(current_obs)

                    # Forward pass of the online copy over the whole sequence.
                    if len(agent_tmp.model.node_list) == 0:
                        for ind in range(T):
                            agent_tmp.model.predict(current_obs[ind, :])

                    # Build the regression targets.
                    # NOTE(review): `target` is only rebuilt when the target network
                    # holds no cached nodes; this relies on x_list_clear() resetting
                    # node_list -- confirm against LSTMnetwork.
                    if len(target_agent.model.node_list) == 0:
                        target = np.zeros((T, agent_tmp.n_output))
                        for ind in range(T):
                            next_Q_value = target_agent.call(next_obs[ind, :])[:agent.n_output]
                            next_Q = np.max(next_Q_value)

                            Q_target = rewards[ind] + gamma * next_Q * not_done_mask[ind]

                            # Start from the current prediction and overwrite only
                            # the entry of the action that was actually taken.
                            target[ind, :] = agent_tmp.model.node_list[ind].state.h[:agent.n_output]
                            target[ind, actions[ind]] = Q_target

                    loss = agent_tmp.model.get_loss(target, agent_tmp.loss_layer)
                    loss_stat.append(loss)

                    agent_tmp.model.Param.apply_diff(len(target), lr = learning_rate)
                    agent_tmp.model.x_list_clear()

                    target_agent.model.x_list_clear()

                agent.model.Param = copy.deepcopy(agent_tmp.model.Param)

            if is_done == True or step >= max_step:
                agent.model.x_list_clear()
                target_agent.model.x_list_clear()
                break

        experience_replay.add_episode(experience_replay_eps)

        reward_curve[current_eps_num] = np.mean(eps_reward)/env.average_reward

        # --------------------- checkpoint ----------------------
        if current_eps_num > 0 and current_eps_num % save_freq == 0:
            perf = {}
            perf['loss_stat'] = loss_stat
            perf['reward_curve'] = reward_curve
            perf['experience_replay'] = experience_replay
            save_obj(name='./Fig/LSTM_POMDP',obj=perf)
            save_obj(name='./Fig/agent0', obj =agent)

            # Include the episode that has just been recorded.
            fig = _plot_reward_curve(reward_curve[:current_eps_num + 1])
            if output_save_img is not None:
                fig.savefig(output_save_img)
            # Close the figure so checkpoints do not pile up open figures.
            plt.close(fig)

    fig = _plot_reward_curve(reward_curve)
    if output_save_img is not None:
        fig.savefig(output_save_img)
    plt.show()
    
        
        

if __name__ == '__main__':
    # Start the full training run when this cell/script is executed directly.
    #logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    #print(' '.join(sys.argv))
    play()
