In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import os, sys
import random
import pickle
import argparse
import logging
import tensorflow as tf
# Switch TensorFlow to eager execution.
# NOTE(review): tf.enable_eager_execution and tf.random.set_random_seed look
# like TF 1.x-only names -- confirm the pinned TensorFlow version.
tf.enable_eager_execution()

# Fix the NumPy, TensorFlow and stdlib random seeds.
np.random.seed(15)
tf.random.set_random_seed(15)
random.seed(15)

# Print arrays with two decimals and without truncation.
np.set_printoptions(precision=2, threshold=np.inf)

class Maze(object):
    WALL = 2
    EMPTY = 8
    LEFT = 0
    RIGHT = 1 # right or forward
    def __init__(self, width, length):
        """Build a random maze and precompute the tables used for scoring.

        width  -- number of rows of the grid (first array axis).
        length -- number of columns of the grid (second array axis).
        """
        self.width, self.length = width, length

        # Start from a grid made only of wall cells; generate_maze() carves
        # the road columns and the doors into it.
        self.maze = Maze.WALL * np.ones((self.width, self.length))
        self.generate_maze()

        # Fills self.maze_mask and self.shortest_solutions.
        self.get_shortest_solutions()

        # Fills self.longest_shortest, the scale of the per-step reward.
        self.get_longest_shortest_solutions()

        # Reference score used to normalise an agent's fitness.
        self.best_score = self.get_attainable_score()

        
        
        
    
    def generate_maze(self):
        # generate walls, doors
        
        spaces = np.random.randint(low=1, high=4, size=self.length)
        cum_spaces = np.cumsum(spaces) # leave the first col empty
 
        for ind, val in enumerate(cum_spaces):
            if val >= self.length-1:
                self.wall_position = cum_spaces[:ind]
                break
        if self.wall_position[0] > 1:
            self.wall_position[0] = 1
        if self.wall_position[-1] < self.length-1:
            self.wall_position = np.append(self.wall_position, self.length-1)
                
        self.road_position = np.array([]).astype(np.int)
        for ind in np.arange(self.length-1):
            if ind not in self.wall_position:
                self.road_position = np.append(self.road_position, ind)
        
        for i in self.road_position:
            self.maze[1:-1,i]=Maze.EMPTY
        
        self.door_position = np.random.randint(low=1, high=self.width-1, size=len(self.wall_position))
        #print(self.door_position)
    
        # get door position
        self.door_position = np.zeros(len(self.wall_position), dtype = np.int)
        self.door_position[-1] = np.random.randint(low=1, high=self.width-1) #1~self.width-2 available door position
        for ind in np.arange(len(self.wall_position)-2, -1, -1):
            if self.wall_position[ind] == self.wall_position[ind+1] -1: # two walls together
                self.door_position[ind] = self.door_position[ind+1]
                
            else:
                self.door_position[ind] = np.random.randint(low=1, high=self.width-1)
        
        # Fill door cue
        self.maze[ self.door_position[-1], self.wall_position[-1] ] = Maze.RIGHT # default last door due
        for i in np.arange(len(self.wall_position)-2, -1, -1):
            if self.door_position[i+1] < self.door_position[i]:
                self.maze[self.door_position[i], self.wall_position[i]] = Maze.LEFT
            else: 
                self.maze[self.door_position[i], self.wall_position[i]] = Maze.RIGHT
                
                
                
       
                
    def print_maze(self, x=-1, y=-1):
        if x>=0 and y>=0:
            tmp = self.maze[x,y]
            self.maze[x,y] = -1 # position of the agent
            
        print("  ", end="")    
        #for i in np.arange(self.length):
        #    print('%d ' % i, end='')
        print("\n")
        
        for j in np.arange(self.width):
            print('%d ' % j, end='')
            for i in np.arange(self.length):
            
                if self.maze[j,i]==Maze.WALL: # wall position
                    print('H ',end='')
                elif self.maze[j,i]==Maze.EMPTY:
                    print('  ',end='')# road
                elif self.maze[j,i]==-1:
                    print('T ',end='')
                    self.maze[x,y]= tmp
                else:
                    print('%d ' % self.maze[j,i], end='')
            print('\n')

        
    def get_shortest_solutions(self):
        # get the shortest length to the end of maze from each layer
        
        self.maze_mask = np.zeros(self.length, dtype=np.int)
        for ind, val in enumerate(self.wall_position):
            self.maze_mask[val] = self.door_position[ind]
       
        self.shortest_solutions = np.zeros(self.length, dtype=np.int)
        step = 0
        next_wall = self.length-1
        for ind in np.arange(self.length-2, -1, -1):
            if self.maze_mask[ind] == 0: # road
                step += 1
                self.shortest_solutions[ind] = step
            else: # wall
                step += np.abs(self.maze_mask[next_wall] - self.maze_mask[ind])+1 #1 out the door, +diff for vert.
                self.shortest_solutions[ind] = step
                next_wall = ind
        

    
    def get_distance_escape(self, x, y):
        # get the shortest distance to escape from the current position
        vertical_distance = 0
        if y in self.road_position:
            for next_wall_ind in np.arange(y+1, y+4, 1):
                if next_wall_ind in self.wall_position: break
            vertical_distance = np.abs(self.maze_mask[next_wall_ind] - x)
        return self.shortest_solutions[y]+vertical_distance
                

        
    def get_longest_shortest_solutions(self):
        # get the shortest length from corner of starting to the end out maze
        left = self.get_distance_escape(1,0)
        right = self.get_distance_escape(self.width-2,0)
        
        self.longest_shortest = np.maximum(left, right)+5 # higher than true value
    
    
    def get_attainable_score(self):
        """Simulate a cue-following walker and return its accumulated reward.

        The walker starts in column 0 on the row of the first door and runs
        for Agent.LIFE + 1 steps. It moves forward whenever the cell ahead is
        not a wall; otherwise it slides one row in the direction of the last
        door cue it read. Each step earns the same reward formula as
        Agent.step. Also sets self.average_reward to the mean step reward.
        """
        row = self.door_position[0]  # in front of the first door
        col = 0
        last_col = self.length - 1
        total = np.float32(0)
        laps = 0
        cue = self.maze[self.door_position[0], 1]
        rewards = []

        for _ in range(Agent.LIFE + 1):
            if col != last_col:
                gain = (self.longest_shortest - self.get_distance_escape(row, col)) / self.longest_shortest + laps
                rewards.append(gain)
                total += gain

            if self.maze[row, col+1] != Maze.WALL:
                # free cell ahead: advance and read a new cue when entering a door
                col += 1
                if col in self.wall_position:
                    cue = self.maze[row, col]
                if col == last_col:
                    # maze finished: count the lap and restart from column 0
                    laps += 1
                    col = 0
            else:
                # blocked: flip the cue when the cued side is itself a wall
                if cue == 0 and self.maze[row-1, col] == Maze.WALL:
                    cue = 1
                if cue == 1 and self.maze[row+1, col] == Maze.WALL:
                    cue = 0
                row += int(cue*2 - 1)  # cue 0 -> row-1, cue 1 -> row+1

        self.average_reward = np.mean(rewards)

        return total


In [None]:
class Agent:
    LIFE = 300
    num_inputs = 6
    num_memory = 1
    num_outputs = 2
    brain_size = num_inputs + num_memory + num_outputs
    
    def __init__(self, maze):
        """Create an agent bound to `maze` and place it at its start cell.

        maze -- the Maze instance the agent walks in.
        """
        self.maze = maze
        self.brain_size = Agent.brain_size

        # Gate description of the evaluated brain; filled in by the caller.
        self.input_ids = []
        self.output_ids = []
        self.gates = []

        # Hand-written reference gate: reads nodes 0,3,4,5,6, writes 6,7,8.
        self.best_input_ids = [[0, 3, 4, 5, 6]]
        self.best_output_ids = [[6, 7, 8]]
        # Deterministic 32x8 table: row i (the packed input pattern) holds a
        # single 1 in the column of the output pattern it always produces.
        chosen_output = [
            6, 4, 6, 4, 6, 3, 6, 3,
            6, 4, 6, 4, 6, 0, 7, 0,
            7, 3, 7, 3, 7, 3, 7, 3,
            7, 4, 7, 4, 6, 0, 7, 0,
        ]
        self.best_gates = [np.eye(8, dtype=int)[chosen_output]]

        # Brain, score, counters, start position, trajectory and the first
        # perception are set up exactly as on a re-initialisation.
        self.simple_reinit()
  
        
    def human_brain_update(self):
        x,y = self.position
        if y == self.maze.length-1: # reach the end of the maze
            self.pass_maze = self.pass_maze + 1
            self.init_locate()
       
        if self.brain[4]==1 and self.brain[5]==1:
            self.brain[6] = self.brain[3] # sure it is a due
             
            
        if self.brain[0] == 0:
            self.brain[7], self.brain[8] = 1,1
        else:
            if self.brain[6] == 0: # turn left
                if self.brain[4] == 0: # no wall
                    self.brain[7], self.brain[8] = 0,1
                else: #wall
                    self.brain[7], self.brain[8] = 1,0
                    self.brain[6]=1
   
                
            else: # turn right
                if self.brain[5] == 0:
                    self.brain[7], self.brain[8] = 1,0
                else:
                    self.brain[7], self.brain[8] = 0,1
                    self.brain[6]=0

    
    def brain_update(self):
        # differ with gate
        
        
        next_brain = np.copy(self.brain)
        #next_brain[6:] = 0
        
        all_outputs_idx = np.array([])
        for gate_output in self.output_ids:
            all_outputs_idx = np.concatenate((all_outputs_idx, gate_output))
        all_outputs_idx = np.unique(all_outputs_idx).astype(int)
        next_brain[all_outputs_idx] = 0
            

        if np.random.rand()>0.0:
            
            for gate, input_ids, output_ids in zip(self.gates, self.input_ids, self.output_ids):

                mg_input_index, marker = 0, 1
                # Create an integer from bytes representation (loop is faster than previous implementation)
                for mg_input_id in input_ids:
                    if self.brain[mg_input_id]:
                        mg_input_index += marker
                    marker *= 2
                
                # Determine the corresponding output values for this Markov Gate
                markov_gate_subarray = gate[mg_input_index,:]  # selects a Markov Gate subarray

                mg_output_index = np.random.choice(len(markov_gate_subarray),p = markov_gate_subarray)
                # Converts the index into a string of '1's and '0's (binary representation)
                mg_output_values = np.binary_repr(mg_output_index, width=len(output_ids))  # bin() is much faster than np.binaryrepr()

                # Loops through 'mg_output_values' and alter 'self.states'
                for i, mg_output_value in enumerate(mg_output_values[:]):
                    if mg_output_value == '1':
                        next_brain[output_ids[len(output_ids)-1 -i]] = 1   #.astype(np.int32)

            
        else:
            next_brain[6:] = np.random.randint(2,size=3)

        # Replace original input values
        self.brain = np.copy(next_brain)


    # reinit when the genome has no changes, used in fitness evaluation
    def simple_reinit(self):
        """Reset all per-episode state; gates and gate ids are left untouched.

        Clears the brain, score and counters, puts the agent in column 0 on
        the row of the last door, starts a fresh trajectory and takes a first
        perception.
        """
        self.life = Agent.LIFE
        self.brain = np.zeros(self.brain_size)
        self.score = np.float32(0)

        self.end = False          # set once the life budget is used up
        self.time_step = 0        # +1 for every actual move
        self.thinking_times = 0   # +1 for every call to step()
        self.pass_maze = 0

        start = np.array([self.maze.door_position[-1], 0])  # row of the last door
        self.position = start
        # Unvisited trajectory slots are marked with -1.
        self.trajectory = -np.ones((self.life, 2))
        self.trajectory[self.time_step, :] = start

        self.door_direction()
        self.perception()

          
        
        

        
        
    def init_locate(self):
        """Put the agent back in column 0 after it reached the end of the maze."""
        self.end = False
        self.position = np.array([self.maze.door_position[-1], 0])  # row of the last door

        # Wipe the sensor and output nodes; the memory node(s) in between
        # keep their state.
        first_output = Agent.num_inputs + Agent.num_memory
        self.brain[:Agent.num_inputs] = 0
        self.brain[first_output:] = 0

        self.door_direction()
        self.perception()
    

        
    def door_direction(self):
        """Hook called whenever the agent is (re)positioned; currently a no-op."""
        # let the agent know the first door's position
        # NOTE: the implementation below is disabled -- it lives inside a bare
        # string literal placed after `pass`, so it is never executed.
        pass
        """
        next_wall = self.maze.wall_position[0] # the first wall
        left = self.maze.maze[1:self.position[0], next_wall]
        right = self.maze.maze[self.position[0]:self.maze.width-1, next_wall]
        
        for land in left:
            if land != Maze.WALL: 
                self.brain[3] = 0
                break
        for land in right:
            if land != Maze.WALL: 
                self.brain[3] = 1
                break
        """
                
    def perception(self):
        """Refresh the sensor nodes (brain[0..5]) from the agent's position.

        With the agent at row x, column y:
          brain[0] -- wall at (x,   y+1)
          brain[1] -- wall at (x-1, y+1)
          brain[2] -- wall at (x+1, y+1)
          brain[4] -- wall at (x-1, y)
          brain[5] -- wall at (x+1, y)
          brain[3] -- value of the current cell when y is a wall column
                      (the door cue), else 0.
        """
        row, col = self.position
        grid = self.maze.maze

        # reset the agent's inputs before setting new values
        self.brain[:Agent.num_inputs] = 0

        probes = (
            (0, row, col + 1),
            (1, row - 1, col + 1),
            (2, row + 1, col + 1),
            (4, row - 1, col),
            (5, row + 1, col),
        )
        for node, r, c in probes:
            self.brain[node] = 1 if grid[r, c] == Maze.WALL else 0

        if col in self.maze.wall_position:
            self.brain[3] = grid[row, col]
        
    def random_walk(self, x, y):
        """Return a random non-wall neighbour of (x, y) as [row, col].

        Candidates are the cells at row-1, row+1 and column+1; when all three
        are walls the current cell [x, y] is returned.
        """
        grid = self.maze.maze
        neighbours = ([x-1, y], [x+1, y], [x, y+1])
        feasible = [cell for cell in neighbours if grid[cell[0], cell[1]] != Maze.WALL]

        if not feasible:
            return [x, y]
        return feasible[np.random.randint(len(feasible))]
        
    
            
    
    def step(self):
        x,y = self.position
        r = (self.maze.longest_shortest - self.maze.get_distance_escape(x,y))/self.maze.longest_shortest + self.pass_maze
        self.score +=  r
        #print("x=%d, y=%d, escape_distance=%d, score=%f " % (x,y,agent.maze.get_distance_escape(x,y), agent.score))
        #print("value=%f ", (agent.maze.longest_shortest - agent.maze.get_distance_escape(x,y))/agent.maze.longest_shortest)
        
        
        fitness = 0
        time_step_shot = self.time_step
        self.thinking_times = self.thinking_times + 1
  
        
        if self.thinking_times>self.life-1:# or self.thinking_times >= 3000: 
            self.end = True
            fitness = self.get_fitness()
            self.fitness = fitness
     
        #if np.random.rand()>0.5:

        elif self.brain[Agent.num_inputs+Agent.num_memory] == 1 and self.brain[Agent.num_inputs+Agent.num_memory+1] == 0:
            if self.maze.maze[x+1,y]==Maze.WALL:
                r-= 1
            #    self.brain[Agent.num_inputs+Agent.num_memory] = 0
            #    self.brain[Agent.num_inputs+Agent.num_memory+1] = 1
            #else:

            if  self.maze.maze[x+1,y] != Maze.WALL:
                self.position = x+1, y
                self.time_step = self.time_step+1

        elif self.brain[Agent.num_inputs+Agent.num_memory] == 0 and self.brain[Agent.num_inputs+Agent.num_memory+1] == 1:
            if self.maze.maze[x-1,y] == Maze.WALL:
                r-= 1
            #    self.brain[Agent.num_inputs+Agent.num_memory] = 1
            #    self.brain[Agent.num_inputs+Agent.num_memory+1] = 0
            #else:

            if  self.maze.maze[x-1,y] != Maze.WALL:
                self.position = x-1, y
                self.time_step = self.time_step+1


        elif self.brain[Agent.num_inputs+Agent.num_memory] == 1 and self.brain[Agent.num_inputs+Agent.num_memory+1] == 1 \
        or self.brain[Agent.num_inputs+Agent.num_memory] == 0 and self.brain[Agent.num_inputs+Agent.num_memory+1] == 0:
        

            if self.maze.maze[x,y+1] != Maze.WALL:
                #r+=20
                self.position = x,y+1
                self.time_step = self.time_step+1
            else:
                r-= 1

            """
            elif y in self.maze.wall_position: # in a door
                self.position = x,y+1
                self.time_step = self.time_step+1
            elif y+1 in self.maze.wall_position and self.maze.maze[x,y+1]!=2: # before a door
                #print('before a door >;<')
                self.position = x,y+1
                self.time_step = self.time_step+1
            """
            x,y = self.position
            if y == self.maze.length-1: # reach the end of the maze
                self.pass_maze = self.pass_maze + 1
                self.init_locate()

        #elif self.brain[Agent.num_inputs+Agent.num_memory] == 0 and self.brain[Agent.num_inputs+Agent.num_memory+1] == 0:
        #    if self.maze.maze[x,y+1] != Maze.WALL:
        #        self.position = x,y+1
        #        self.time_step = self.time_step+1
        #    else:
        #        r-= 15

            """elif y in self.maze.wall_position: # in a door
                self.position = x,y+1
                self.time_step = self.time_step+1
            elif y+1 in self.maze.wall_position and self.maze.maze[x,y+1]!=2: # before a door
                #print('before a door >;<')
                self.position = x,y+1
                self.time_step = self.time_step+1
            """

          #  x,y = self.position
          #  if y == self.maze.length-1: # reach the end of the maze
          #      self.pass_maze = self.pass_maze + 1
          #      self.init_locate()
                
            #self.position = x,y
            #self.time_step = self.time_step+1
            '''else:
                # shouldn't have this
                self.brain[10] = 1
                self.brain[11] = 0
            '''    
        '''elif self.brain[10] == 0 and self.brain[11] == 0:
            self.brain[10] = 0
            self.brain[11] = 1
        ''' 
        """else:
            xx, yy = self.random_walk(x,y)
            self.position = xx, yy
            self.time_step = self.time_step+1
            
            x,y = self.position
            if y == self.maze.length-1: # reach the end of the maze
                self.pass_maze = self.pass_maze + 1
                self.init_locate()
"""
            
        # if the brain's order is legal, keep it
        # illegal order is omitted
        if self.time_step > time_step_shot:    
            self.trajectory[self.time_step,:] = self.position
        
        return fitness, r
    
    def get_fitness(self):
        
        return self.score/self.maze.best_score 
    
    def best_brain_update(self):
    
        next_brain = np.copy(self.brain)
        
        all_outputs_idx = np.array([])
        for gate_output in self.best_output_ids:
            all_outputs_idx = np.concatenate((all_outputs_idx, gate_output))
        all_outputs_idx = np.unique(all_outputs_idx).astype(int)
        next_brain[all_outputs_idx] = 0
        
        for gate, input_ids, output_ids in zip(self.best_gates, self.best_input_ids, self.best_output_ids):

            mg_input_index, marker = 0, 1
            # Create an integer from bytes representation (loop is faster than previous implementation)
 
            for mg_input_id in input_ids:
                if self.brain[mg_input_id]:
                    mg_input_index += marker
                marker *= 2
           
            # Determine the corresponding output values for this Markov Gate
            markov_gate_subarray = gate[mg_input_index,:]  # selects a Markov Gate subarray
            
            #print(self.brain[input_ids], mg_input_index, markov_gate_subarray)

            mg_output_index = np.random.choice(len(markov_gate_subarray),p = markov_gate_subarray)
            # Converts the index into a string of '1's and '0's (binary representation)
            mg_output_values = np.binary_repr(mg_output_index, width=len(output_ids)) #bin(mg_output_index)  # bin() is much faster than np.binaryrepr()
            #print(mg_output_values)
            # Loops through 'mg_output_values' and alter 'self.states'
           
            for i, mg_output_value in enumerate(mg_output_values[:]):
                if mg_output_value == '1':
                    next_brain[output_ids[len(output_ids)-1 -i]] = 1   #.astype(np.int32)
        self.brain = np.copy(next_brain)
    

    def num2action(self, num):
        numbers = {
            0 : '011',
            1 : '100',
            2 : '110',
            3 : '111'
        }
        return numbers.get(num, None)
    
    def brain_update_condense(self):
        # differ with gate
        next_brain = np.copy(self.brain)
        #next_brain[6:] = 0
        
        all_outputs_idx = np.array([])
        for gate_output in self.output_ids:
            all_outputs_idx = np.concatenate((all_outputs_idx, gate_output))
        all_outputs_idx = np.unique(all_outputs_idx).astype(int)
        next_brain[all_outputs_idx] = 0
            


        for gate, input_ids, output_ids in zip(self.gates, self.input_ids, self.output_ids):

            mg_input_index, marker = 0, 1
            # Create an integer from bytes representation (loop is faster than previous implementation)
            for mg_input_id in input_ids:
                if self.brain[mg_input_id]:
                    mg_input_index += marker
                marker *= 2

            # Determine the corresponding output values for this Markov Gate
            markov_gate_subarray = gate[mg_input_index,:]  # selects a Markov Gate subarray

            mg_output_index = np.random.choice(len(markov_gate_subarray),p = markov_gate_subarray)
            
            mg_output_values = self.num2action(mg_output_index)

            # Loops through 'mg_output_values' and alter 'self.states'
            for i, mg_output_value in enumerate(mg_output_values[:]):
                if mg_output_value == '1':
                    next_brain[output_ids[len(output_ids)-1 -i]] = 1   #.astype(np.int32)


        # Replace original input values
        self.brain = np.copy(next_brain)



        
def test():
    """Smoke test: score 100 random single-gate brains on one random maze.

    Prints the maze, its reference score and the fitness of every brain.
    """
    maze = Maze(10, 50)
    maze.print_maze()
    print(maze.best_score)

    agent = Agent(maze)
    agent.input_ids = agent.best_input_ids
    agent.output_ids = agent.best_output_ids

    for _ in range(100):
        # Start from an empty gate list every round. The brain update zips
        # the gates with the one-entry id lists, so gates appended behind the
        # first one would never be evaluated.
        agent.gates = []
        for input_ids, _output_ids in zip(agent.input_ids, agent.output_ids):
            # One row per input pattern, one column per condensed action;
            # each row is normalised into a probability distribution.
            gate = np.random.rand(2**len(input_ids), 2**2)
            gate = gate.astype(np.float64) / np.sum(gate, axis=1, dtype=np.float64)[:, None]
            agent.gates.append(gate)

        agent.simple_reinit()
        fitness = 0
        while not agent.end:
            agent.perception()
            agent.brain_update_condense()
            fitness, _reward = agent.step()

        print(fitness)

    
# Re-seed NumPy's global generator (the test() call below is currently disabled).
np.random.seed(1)       
#test()
        
                
    
#np.random.seed(9)
#import cProfile
#cProfile.run('test()')

In [None]:

# Print six decimals from here on (replaces the precision=2 set at the top of the file).
np.set_printoptions(precision=6)

def Draw(fitness):
    """Plot one value per episode as a green line and show the figure.

    fitness -- sequence of per-episode values; the x axis is the episode index.
    """
    episodes = np.arange(len(fitness))
    plt.plot(episodes, fitness, color='green', label='fitness trend', linestyle='-')
    plt.xlabel("Episodes")
    plt.ylabel("Fitness")
    plt.legend()  # show the legend
    plt.show()

def num2action(num):
    """Map an action index (0..3) to the bit-string to write on the output nodes.

    Returns None for any other index.
    """
    table = dict(zip(range(4), ('011', '100', '110', '111')))
    return table.get(num)
    
def action2num(action):
    """Map an output-node bit-string back to its action index (Q-table column).

    Returns None when the string is not one of the four known actions.
    """
    codes = ('011', '100', '110', '111')
    lookup = {code: index for index, code in enumerate(codes)}
    return lookup.get(action)
    
    
def observation2index(observation):
    """Pack a binary observation into its Q-table row index.

    observation -- iterable of node values; the first element is the lowest
                   bit. Only entries equal to 1 contribute.
    """
    packed = sum(1 << bit for bit, val in enumerate(observation) if val == 1)
    return int(packed)
    
    
class QLearningTable:
    """Tabular Q-learning over packed binary observations and four actions.

    Rows of the table are observation indices (see observation2index) and
    columns are action indices (see num2action / action2num).
    """

    # Action bit-strings in Q-table column order. Kept on the class so the
    # table does not depend on a module-level global being defined first.
    ACTIONS = ('011', '100', '110', '111')

    def __init__(self, N_input_ids, N_output_ids, learning_rate=0.01, reward_decay=0.98, e_greedy=0.5):
        """Create an all-zero Q-table.

        N_input_ids   -- number of observed binary nodes; the table gets
                         2**N_input_ids rows.
        N_output_ids  -- accepted for interface compatibility but unused:
                         the action set is fixed to the four ACTIONS.
        learning_rate -- step size of the update.
        reward_decay  -- discount factor applied to the next state's value.
        e_greedy      -- probability of taking the greedy action (the
                         remaining probability picks a uniformly random one).
        """
        self.actions = list(self.ACTIONS)
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = np.zeros((2**N_input_ids, len(self.actions)))

    def choose_action(self, observation):
        """Return the action bit-string chosen for `observation`."""
        if np.random.uniform() < self.epsilon:
            # greedy choice; ties between equally valued actions are broken at random
            state_action = self.q_table[observation2index(observation), :]
            max_index = np.argwhere(state_action == np.max(state_action)).flatten().tolist()
            output_val = np.random.choice(max_index)
        else:
            # exploratory choice
            output_val = np.random.choice(len(self.actions))

        return num2action(output_val)

    def learn(self, s, a, r, s_):
        """Apply one Q-learning update for the transition (s, a, r, s_).

        s, s_ -- observations before and after the action.
        a     -- action bit-string as returned by choose_action().
        r     -- reward received.

        Raises ValueError when `a` is not a known action. Without this check
        the lookup yields None, which NumPy reads as a new axis, so the whole
        row would be updated silently.
        """
        state = observation2index(s)
        action = action2num(str(a))
        if action is None:
            raise ValueError("unknown action: %r" % (a,))
        next_state = observation2index(s_)

        q_predict = self.q_table[state, action]
        # Every transition is treated as non-terminal.
        q_target = r + self.gamma * self.q_table[next_state, :].max()
        self.q_table[state, action] += self.lr * (q_target - q_predict)




    
# Action bit-strings in Q-table column order (same order as num2action's mapping).
actions = ['011','100','110','111']
def update():
    """Train a tabular Q-learner on freshly generated mazes and plot the result.

    Runs 3000 episodes, each on a new 10x50 maze. Per step the learner picks
    an action from the observed nodes, the action is written on the agent's
    output nodes, the agent moves, and the Q-table is updated. Afterwards the
    last episode's rewards, the fitness curve, the mean-reward curve and the
    Q-table are shown.
    """
    maze = Maze(10,50)
    maze.print_maze()
    agent = Agent(maze)
    observation_index = agent.best_input_ids[0]
    action_index = agent.best_output_ids[0]

    RL = QLearningTable(len(observation_index), len(action_index))

    Num_episode = 3000
    fitness = np.zeros(Num_episode)
    average_reward = np.zeros(Num_episode)
    rewards = []

    for episode in range(Num_episode):
        agent.maze = Maze(10,50)
        agent.simple_reinit()
        rewards = []
        while not agent.end:
            agent.perception()
            observation = agent.brain[observation_index]
            if episode == Num_episode-1:
                print("step :", agent.position, agent.brain)
            action = RL.choose_action(observation)

            # Write the action on the output nodes: the first character of
            # the bit-string drives the last output node.
            agent.brain[action_index] = 0
            last = len(action_index) - 1
            for ind, bit in enumerate(action):
                if bit == '1':
                    agent.brain[action_index[last - ind]] = 1

            # step() returns fitness 0 until the final step of the episode.
            fitness[episode], reward = agent.step()
            rewards.append(reward)

            # Refresh the sensors first, so the next state reflects the cell
            # the agent actually moved to instead of the previous readings.
            agent.perception()
            observation_ = agent.brain[observation_index]

            # RL learns from this transition
            RL.learn(observation, action, reward, observation_)
        average_reward[episode] = np.mean(rewards)

    print(np.stack(rewards))
    Draw(fitness)
    Draw(average_reward)
    print(RL.q_table)
    # end of game
    print('game over')

# Re-seed NumPy, then run the Q-learning experiment (executes as soon as this cell/module runs).
np.random.seed(0)
update()


