In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import os, sys
import random
import pickle
import argparse
import logging
import tensorflow as tf
# NOTE(review): `tf.enable_eager_execution` and `tf.random.set_random_seed`
# look like TensorFlow 1.x top-level names -- confirm the TF version this
# notebook is expected to run on.
tf.enable_eager_execution()

# Seed all three random sources used in this file with the same value.
np.random.seed(15)
tf.random.set_random_seed(15)
random.seed(15)

# Print arrays with two decimals and never abbreviate them with "...".
np.set_printoptions(precision=2, threshold=np.inf)

class Maze(object):
    """Corridor maze crossed by vertical walls that each contain one door.

    The grid ``self.maze`` has ``width`` rows and ``length`` columns and is
    indexed as ``maze[x, y]`` (``x`` = row, ``y`` = column).  Cells hold
    ``WALL``, ``EMPTY`` or, for a door cell, a cue (``LEFT`` / ``RIGHT``)
    telling on which side the door of the *next* wall lies.

    Attributes filled in by the constructor:
        wall_position: sorted column indices of the walls.
        road_position: sorted column indices of the open columns.
        door_position: row index of the door of each wall.
        maze_mask: per column, the door row (0 for road columns).
        shortest_solutions: per column, shortest step count to the last column.
        longest_shortest: normalisation constant for the per-step reward.
        best_score: score collected by the scripted walker in
            ``get_attainable_score``; ``average_reward`` is its mean reward.
    """

    WALL = 2
    EMPTY = 8
    LEFT = 0
    RIGHT = 1 # right or forward

    def __init__(self, width, length):
        """Build a random maze of ``width`` rows and ``length`` columns."""
        self.length = length
        self.width = width
        self.maze = np.ones((self.width, self.length)) * Maze.WALL

        self.generate_maze()

        # sets self.maze_mask and self.shortest_solutions
        self.get_shortest_solutions()

        # sets self.longest_shortest, used to calculate the objective value
        self.get_longest_shortest_solutions()

        # used to normalize the objective value
        self.best_score = self.get_attainable_score()

    def generate_maze(self):
        """Randomly place walls, doors and door cues in ``self.maze``.

        Walls are spaced 1-3 columns apart; the first wall is always moved to
        column 1 and the last column is always a wall.  Two directly adjacent
        walls share the same door row.
        """
        spaces = np.random.randint(low=1, high=4, size=self.length)
        cum_spaces = np.cumsum(spaces)

        # cum_spaces is strictly increasing, so this is the prefix that lies
        # before the last column.
        walls = cum_spaces[cum_spaces < self.length - 1].copy()
        if walls.size == 0:
            # Very short maze: the original code failed with an IndexError
            # here; fall back to the single mandatory wall in column 1.
            walls = np.array([1], dtype=cum_spaces.dtype)
        else:
            walls[0] = 1
        if walls[-1] < self.length - 1:
            walls = np.append(walls, self.length - 1)
        self.wall_position = walls

        wall_columns = set(walls.tolist())
        self.road_position = np.array(
            [col for col in range(self.length - 1) if col not in wall_columns],
            dtype=int,
        )

        # Open every road column, leaving the top and bottom border rows walled.
        self.maze[1:-1, self.road_position] = Maze.EMPTY

        # The result of this draw was always overwritten below; the call is
        # kept only so that seeded runs keep producing the same mazes.
        np.random.randint(low=1, high=self.width - 1, size=len(walls))

        # Door rows, assigned from the last wall backwards (valid rows are
        # 1 .. width-2).
        doors = np.zeros(len(walls), dtype=int)
        doors[-1] = np.random.randint(low=1, high=self.width - 1)
        for ind in range(len(walls) - 2, -1, -1):
            if walls[ind] == walls[ind + 1] - 1:  # two walls together
                doors[ind] = doors[ind + 1]
            else:
                doors[ind] = np.random.randint(low=1, high=self.width - 1)
        self.door_position = doors

        # Door cues: each door tells whether the next door has a smaller row
        # index (LEFT) or not (RIGHT); the last door defaults to RIGHT.
        self.maze[doors[-1], walls[-1]] = Maze.RIGHT
        for ind in range(len(walls) - 2, -1, -1):
            cue = Maze.LEFT if doors[ind + 1] < doors[ind] else Maze.RIGHT
            self.maze[doors[ind], walls[ind]] = cue

    def print_maze(self, x=-1, y=-1):
        """Print the maze; if ``x`` and ``y`` are both >= 0 mark that cell 'T'.

        Walls are printed as 'H', roads as blanks and door cells as their
        cue value.  The grid itself is not modified.
        """
        show_agent = x >= 0 and y >= 0

        print("  ", end="")
        print("\n")

        for row in range(self.width):
            print('%d ' % row, end='')
            for col in range(self.length):
                cell = self.maze[row, col]
                if show_agent and row == x and col == y:
                    print('T ', end='')
                elif cell == Maze.WALL:
                    print('H ', end='')
                elif cell == Maze.EMPTY:
                    print('  ', end='')
                else:
                    print('%d ' % cell, end='')
            print('\n')

    def get_shortest_solutions(self):
        """Compute ``maze_mask`` and ``shortest_solutions``.

        ``shortest_solutions[y]`` is the number of steps from column ``y`` to
        the last column, assuming the walker already stands on the row of the
        next door (for a wall column: on its own door).
        """
        self.maze_mask = np.zeros(self.length, dtype=int)
        self.maze_mask[self.wall_position] = self.door_position

        self.shortest_solutions = np.zeros(self.length, dtype=int)
        step = 0
        next_wall = self.length - 1
        for ind in range(self.length - 2, -1, -1):
            if self.maze_mask[ind] == 0:  # road
                step += 1
            else:  # wall: 1 step through the door + vertical move to the next door
                step += np.abs(self.maze_mask[next_wall] - self.maze_mask[ind]) + 1
                next_wall = ind
            self.shortest_solutions[ind] = step

    def get_distance_escape(self, x, y):
        """Return the shortest number of steps from cell ``(x, y)`` to the exit.

        For a road column the vertical distance to the door of the next wall
        is added to ``shortest_solutions[y]``.  The next wall is looked up in
        ``wall_position`` rather than assumed to be at most 3 columns away:
        because the first wall is forced to column 1, the gap behind it can
        be wider than 3, and the old fixed-range scan then used a road column
        (mask 0) as if it were the door row.
        """
        vertical_distance = 0
        if y in self.road_position:
            nxt = np.searchsorted(self.wall_position, y, side='right')
            next_wall = self.wall_position[nxt]
            vertical_distance = np.abs(self.maze_mask[next_wall] - x)
        return self.shortest_solutions[y] + vertical_distance

    def get_longest_shortest_solutions(self):
        """Set ``longest_shortest`` from the two start-column corners.

        It is the larger escape distance of the cells ``(1, 0)`` and
        ``(width-2, 0)`` plus a margin of 5, i.e. deliberately higher than
        the true value.
        """
        left = self.get_distance_escape(1, 0)
        right = self.get_distance_escape(self.width - 2, 0)

        self.longest_shortest = np.maximum(left, right) + 5

    def get_attainable_score(self):
        """Score of a scripted walker that follows the door cues.

        The walker starts in column 0 on the row of the first door and runs
        for ``Agent.LIFE + 1`` iterations.  Each iteration adds the same
        reward an agent would get in ``Agent.step`` and then either moves
        forward or, if blocked, moves one row in the direction of the last
        door cue (bouncing off the border).  Also stores the mean per-step
        reward in ``self.average_reward``.
        """
        x = self.door_position[0]  # in front of the first door
        y = 0
        score = np.float32(0)
        pass_maze = 0
        door_signal = self.maze[self.door_position[0], 1]
        rewards = []
        for _ in range(Agent.LIFE + 1):
            if y != self.length - 1:
                reward = (
                    (self.longest_shortest - self.get_distance_escape(x, y))
                    / self.longest_shortest + pass_maze
                )
                rewards.append(reward)
                score += reward
            if self.maze[x, y + 1] != Maze.WALL:  # road
                y += 1
                if y in self.wall_position:
                    door_signal = self.maze[x, y]
                if y == self.length - 1:
                    pass_maze += 1
                    y = 0
            else:  # wall
                # At the border the cue cannot be followed: turn around.
                if door_signal == 0 and self.maze[x - 1, y] == Maze.WALL:
                    door_signal = 1
                if door_signal == 1 and self.maze[x + 1, y] == Maze.WALL:
                    door_signal = 0
                x += int(door_signal * 2 - 1)

        self.average_reward = np.mean(rewards)

        return score


In [None]:
class Agent:
    """Maze walker whose "brain" is a small binary state vector.

    Brain layout (``brain_size`` = 9 nodes):
        0-5  inputs written by ``perception``:
             0 wall ahead ``(x, y+1)``, 1 wall at ``(x-1, y+1)``,
             2 wall at ``(x+1, y+1)``, 3 door cue of the current cell,
             4 wall at ``(x-1, y)``, 5 wall at ``(x+1, y)``
        6    memory node
        7-8  outputs read by ``step``: (1, 0) moves to ``x+1``, (0, 1) moves
             to ``x-1``, (1, 1) and (0, 0) move forward to ``y+1``.

    The brain is updated either by hand-written rules
    (``human_brain_update``) or by probabilistic gates: each gate reads some
    nodes, uses them as a binary row index into its table and samples a
    column whose binary digits are written to the gate's output nodes.
    """

    LIFE = 300
    num_inputs = 6
    num_memory = 1
    num_outputs = 2
    brain_size = num_inputs + num_memory + num_outputs

    def __init__(self, maze):
        """Create an agent for ``maze`` and place it at the start position."""
        self.maze = maze
        self.brain_size = Agent.brain_size

        # Gates under evolution (filled in by the caller).
        self.input_ids = []
        self.output_ids = []
        self.gates = []

        # Hand-designed reference gate: 5 inputs -> 32 rows, 3 outputs -> 8 columns.
        self.best_input_ids = [[0, 3, 4, 5, 6]]
        self.best_output_ids = [[6, 7, 8]]
        self.best_gates = []
        self.best_gates.append(np.array([[0,0,0,0,0,0,1,0],
                            [0,0,0,0,1,0,0,0],
                            [0,0,0,0,0,0,1,0],
                            [0,0,0,0,1,0,0,0],
                            [0,0,0,0,0,0,1,0],
                            [0,0,0,1,0,0,0,0],
                            [0,0,0,0,0,0,1,0],
                            [0,0,0,1,0,0,0,0],
                            [0,0,0,0,0,0,1,0],
                            [0,0,0,0,1,0,0,0],
                            [0,0,0,0,0,0,1,0],
                            [0,0,0,0,1,0,0,0],
                            [0,0,0,0,0,0,1,0],
                            [1,0,0,0,0,0,0,0],
                            [0,0,0,0,0,0,0,1],
                            [1,0,0,0,0,0,0,0],
                            [0,0,0,0,0,0,0,1],
                            [0,0,0,1,0,0,0,0],
                            [0,0,0,0,0,0,0,1],
                            [0,0,0,1,0,0,0,0],
                            [0,0,0,0,0,0,0,1],
                            [0,0,0,1,0,0,0,0],
                            [0,0,0,0,0,0,0,1],
                            [0,0,0,1,0,0,0,0],
                            [0,0,0,0,0,0,0,1],
                            [0,0,0,0,1,0,0,0],
                            [0,0,0,0,0,0,0,1],
                            [0,0,0,0,1,0,0,0],
                            [0,0,0,0,0,0,1,0],
                            [1,0,0,0,0,0,0,0],
                            [0,0,0,0,0,0,0,1],
                            [1,0,0,0,0,0,0,0]]))

        # All per-episode state is initialised in one place.
        self.simple_reinit()

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def _restart_if_finished(self):
        """If the agent stands in the last column, count a pass and restart."""
        _, y = self.position
        if y == self.maze.length - 1:  # reach the end of the maze
            self.pass_maze = self.pass_maze + 1
            self.init_locate()

    def _brain_with_cleared_outputs(self, output_ids):
        """Return a copy of the brain with every gate-output node set to 0."""
        next_brain = np.copy(self.brain)
        nodes = np.array(
            sorted({int(node) for ids in output_ids for node in ids}), dtype=int
        )
        next_brain[nodes] = 0
        return next_brain

    def _fire_gates(self, next_brain, gates, input_ids, output_ids):
        """Evaluate every gate on the current brain and write into ``next_brain``.

        Inputs are read from ``self.brain`` (the state before the update), so
        gates do not see each other's outputs within one update.  Assumes
        each gate table has ``2**len(in_ids)`` rows and ``2**len(out_ids)``
        columns whose rows are probability distributions.
        """
        for gate, in_ids, out_ids in zip(gates, input_ids, output_ids):
            # Row index: in_ids[0] is the least significant bit.
            row = 0
            for bit, node in enumerate(in_ids):
                if self.brain[node]:
                    row += 1 << bit

            probabilities = gate[row, :]
            choice = int(np.random.choice(len(probabilities), p=probabilities))

            # Column index: out_ids[0] is the least significant bit.
            for bit, node in enumerate(out_ids):
                if (choice >> bit) & 1:
                    next_brain[node] = 1

    # ------------------------------------------------------------------
    # brain updates
    # ------------------------------------------------------------------
    def human_brain_update(self):
        """Hand-written controller: follow the remembered door cue.

        Goes forward when there is no wall ahead; otherwise moves sideways in
        the direction stored in the memory node and flips that direction
        when the side is blocked.
        """
        self._restart_if_finished()

        if self.brain[4] == 1 and self.brain[5] == 1:
            # Walls on both sides: the agent is inside a door, so node 3 is
            # a real cue and is copied to memory.
            self.brain[6] = self.brain[3]

        if self.brain[0] == 0:
            self.brain[7], self.brain[8] = 1, 1
        elif self.brain[6] == 0:  # turn left
            if self.brain[4] == 0:  # no wall
                self.brain[7], self.brain[8] = 0, 1
            else:  # wall
                self.brain[7], self.brain[8] = 1, 0
                self.brain[6] = 1
        else:  # turn right
            if self.brain[5] == 0:
                self.brain[7], self.brain[8] = 1, 0
            else:
                self.brain[7], self.brain[8] = 0, 1
                self.brain[6] = 0

    def brain_update(self):
        """Update the brain with the evolved gates in ``self.gates``."""
        next_brain = self._brain_with_cleared_outputs(self.output_ids)

        # Exploration threshold is 0.0, so the random branch is effectively
        # disabled; the draw is kept so the random stream is unchanged.
        if np.random.rand() > 0.0:
            self._fire_gates(next_brain, self.gates, self.input_ids, self.output_ids)
        else:
            next_brain[Agent.num_inputs:] = np.random.randint(
                2, size=Agent.num_memory + Agent.num_outputs
            )

        self.brain = next_brain

    def best_brain_update(self):
        """Update the brain with the hand-designed gate in ``self.best_gates``."""
        next_brain = self._brain_with_cleared_outputs(self.best_output_ids)
        self._fire_gates(
            next_brain, self.best_gates, self.best_input_ids, self.best_output_ids
        )
        self.brain = next_brain

    # ------------------------------------------------------------------
    # episode management
    # ------------------------------------------------------------------
    def simple_reinit(self):
        """Reset all per-episode state; gates and the maze are left untouched.

        Used before every fitness evaluation.
        """
        self.brain = np.zeros(self.brain_size)
        self.score = np.float32(0)

        self.end = False  # set once the life budget is used up
        self.time_step = 0  # +1 for every move
        self.thinking_times = 0  # +1 for every step
        self.life = Agent.LIFE
        self.pass_maze = 0

        self.position = np.array([self.maze.door_position[-1], 0])  # in front of the last door
        self.trajectory = np.ones((self.life, 2)) * -1
        self.trajectory[self.time_step, :] = self.position

        self.door_direction()
        self.perception()

    def init_locate(self):
        """Pull the agent back to the start after it reached the maze end.

        Inputs and outputs are cleared; the memory node keeps its state.
        """
        self.position = np.array([self.maze.door_position[-1], 0])  # in front of the last door
        self.end = False

        self.brain[:Agent.num_inputs] = 0
        self.brain[Agent.num_inputs + Agent.num_memory:] = 0

        self.door_direction()
        self.perception()

    def door_direction(self):
        """Hook meant to tell the agent where the first door is; currently a no-op."""
        pass

    def perception(self):
        """Refresh the six input nodes from the cells around the agent."""
        x, y = self.position
        grid = self.maze.maze
        self.brain[:Agent.num_inputs] = 0

        sensed_cells = (
            (0, (x, y + 1)),
            (1, (x - 1, y + 1)),
            (2, (x + 1, y + 1)),
            (4, (x - 1, y)),
            (5, (x + 1, y)),
        )
        for node, cell in sensed_cells:
            self.brain[node] = 1 if grid[cell] == Maze.WALL else 0

        # Inside a door the cell value is the cue for the next door.
        if y in self.maze.wall_position:
            self.brain[3] = grid[x, y]

    def random_walk(self, x, y):
        """Return a random non-wall neighbour (up, down or forward) of ``(x, y)``.

        Returns ``[x, y]`` itself when all three neighbours are walls.
        """
        neighbours = ([x - 1, y], [x + 1, y], [x, y + 1])
        feasible = [
            cell for cell in neighbours
            if self.maze.maze[cell[0], cell[1]] != Maze.WALL
        ]

        if len(feasible) > 0:
            return feasible[np.random.randint(len(feasible))]
        return [x, y]

    # ------------------------------------------------------------------
    # acting
    # ------------------------------------------------------------------
    def step(self):
        """Collect the reward of the current cell and execute the brain's order.

        Returns:
            ``(fitness, r)``: ``fitness`` is 0 until the life budget is used
            up, then ``get_fitness()``; ``r`` is the reward of this step.
        """
        x, y = self.position
        r = (
            (self.maze.longest_shortest - self.maze.get_distance_escape(x, y))
            / self.maze.longest_shortest + self.pass_maze
        )
        self.score += r

        fitness = 0
        time_step_shot = self.time_step
        self.thinking_times = self.thinking_times + 1

        first_out = self.brain[Agent.num_inputs + Agent.num_memory]
        second_out = self.brain[Agent.num_inputs + Agent.num_memory + 1]

        if self.thinking_times > self.life - 1:
            self.end = True
            fitness = self.get_fitness()
            self.fitness = fitness

        elif first_out == 1 and second_out == 0:
            if self.maze.maze[x + 1, y] != Maze.WALL:
                self.position = x + 1, y
                self.time_step = self.time_step + 1

        elif first_out == 0 and second_out == 1:
            if self.maze.maze[x - 1, y] != Maze.WALL:
                self.position = x - 1, y
                self.time_step = self.time_step + 1

        elif first_out == 1 and second_out == 1:
            if self.maze.maze[x, y + 1] != Maze.WALL:
                self.position = x, y + 1
                self.time_step = self.time_step + 1
            self._restart_if_finished()

        elif first_out == 0 and second_out == 0:
            # Forward as well; additionally allowed whenever the agent is in
            # a wall column (i.e. standing in a door).
            if self.maze.maze[x, y + 1] != Maze.WALL or y in self.maze.wall_position:
                self.position = x, y + 1
                self.time_step = self.time_step + 1
            self._restart_if_finished()

        # A legal order moved the agent: record it.  Illegal orders are ignored.
        if self.time_step > time_step_shot:
            self.trajectory[self.time_step, :] = self.position

        return fitness, r

    def get_fitness(self):
        """Return the collected score normalised by ``maze.best_score``."""
        return self.score / self.maze.best_score


        
def test():
    """Smoke test: build a maze and run one agent with the hand-designed gate.

    Prints the maze, its attainable score, the final fitness of the agent
    and the trajectory it walked.
    """
    maze = Maze(10, 30)
    maze.print_maze()
    print(maze.best_score)

    agent = Agent(maze)

    # One random, row-normalised gate table per configured gate.
    for in_ids, out_ids in zip(agent.input_ids, agent.output_ids):
        table = np.random.rand(2 ** len(in_ids), 2 ** len(out_ids))
        row_totals = np.sum(table, axis=1, dtype=np.float64)[:, None]
        agent.gates.append(table.astype(np.float64) / row_totals)

    agent.simple_reinit()
    while not agent.end:
        agent.perception()
        agent.best_brain_update()
        fitness, _ = agent.step()

    print(fitness)
    print(agent.trajectory)

    
# Re-seed NumPy so the smoke test builds the same maze on every run,
# then execute it when this cell runs.
np.random.seed(5)
test()
        
                
    
#np.random.seed(9)
#import cProfile
#cProfile.run('test()')

In [None]:
import pickle
import matplotlib.pyplot as plt
# Two-decimal array printing for this cell.
np.set_printoptions(precision=2)

# Fixed NumPy seed for the evolution run.
np.random.seed(0)


# Module-level settings read as globals by select() and evolution().
num_generation = 200  # number of generations evolution() iterates over
pop_size = 50  # number of individuals in the population



k=3  # number of elites copied unchanged into the next generation
fit_times = 1  # fitness evaluations per individual and generation


def init_pop_agent_deter(pop_size, agent):
    """Create a population of deterministic (one-hot) gate tables.

    For every gate described by ``agent.input_ids`` / ``agent.output_ids``
    each individual gets a ``2**n_inputs x 2**n_outputs`` table in which
    every row has a single 1 at a uniformly random column.

    Returns:
        ``np.array`` of the nested list ``[individual][gate]``.
    """
    population = []
    for _ in np.arange(pop_size):
        individual = []
        for in_ids, out_ids in zip(agent.input_ids, agent.output_ids):
            n_rows = 2 ** len(in_ids)
            n_cols = 2 ** len(out_ids)
            chosen_cols = np.random.randint(n_cols, size=n_rows)

            table = np.zeros((n_rows, n_cols))
            table[np.arange(n_rows), chosen_cols] = 1
            individual.append(table)
        population.append(individual)

    return np.array(population)

def mutate_deter(child, n_mutations=8):
    """Mutate the one-hot gate tables of ``child`` in place.

    For every gate, ``n_mutations`` rows (drawn with replacement among the
    rows that contain a 1) get their 1 moved to a uniformly random column.

    The previous version rebound the loop variable to a fresh array, so the
    mutated table was thrown away and the child never changed.  The table is
    now overwritten in place, which also updates the population array the
    child is a view of.

    Args:
        child: sequence (list or ndarray) of 2-D float gate tables.
        n_mutations: number of row re-draws per gate (default 8, the value
            that used to be hard-coded).

    Returns:
        The same ``child`` object, mutated.
    """
    for gate in child:
        _, dimy = gate.shape
        indx, indy = np.where(gate == 1)
        if len(indy) == 0:
            # Nothing one-hot to move; leave this gate untouched.
            continue
        indy[np.random.randint(len(indy), size=n_mutations)] = np.random.randint(dimy, size=n_mutations)
        gate[...] = 0
        gate[indx, indy] = 1
    return child



def init_pop_agent(pop_size, agent):
    """Create a population of random probabilistic gate tables.

    For every gate described by ``agent.input_ids`` / ``agent.output_ids``
    each individual gets a ``2**n_inputs x 2**n_outputs`` table of uniform
    random numbers whose rows are normalised to sum to 1.

    Returns:
        ``np.array`` of the nested list ``[individual][gate]``.
    """
    population = []
    for _ in np.arange(pop_size):
        individual = []
        for in_ids, out_ids in zip(agent.input_ids, agent.output_ids):
            raw = np.random.rand(2 ** len(in_ids), 2 ** len(out_ids))
            row_totals = np.sum(raw, axis=1, dtype=np.float64)[:, None]
            individual.append(raw.astype(np.float64) / row_totals)
        population.append(individual)

    return np.array(population)

        

def mutate(child):
    """Mutate the probabilistic gate tables of ``child`` in place.

    Every gate gets one random scalar in [0, 1) added to all of its entries
    and is then re-normalised so each row sums to 1 again.

    The previous version assigned the result to the loop variable only, so
    the child was returned unchanged.  The result is now written back into
    the gate.  Gates are expected to be float arrays (an integer array would
    truncate the probabilities).

    Returns:
        The same ``child`` object, mutated.
    """
    for gate in child:
        shifted = gate + np.random.rand()
        gate[...] = shifted.astype(np.float64) / np.sum(shifted, axis=1, dtype=np.float64)[:, None]
    return child

    
            
    
def select(fitness, k):    # nature selection wrt pop's fitness
    """Fitness-proportional selection of ``len(fitness) - k`` parents.

    Individuals are drawn one at a time with probability proportional to
    ``fitness + 1``.  Once an individual has been drawn twice its fitness is
    set to 0 for the remaining draws, which lowers its weight to the
    baseline of 1 (it can still be drawn again).

    The population size is taken from ``fitness`` itself instead of the
    module-level ``pop_size``; the two are the same in ``evolution``.  The
    caller's array is not modified.

    Args:
        fitness: 1-D array of non-negative fitness values.
        k: number of slots reserved for elites.

    Returns:
        Integer array with the indices of the selected individuals.
    """
    weights = np.copy(fitness)
    n_individuals = len(weights)
    n_selected = n_individuals - k

    draw_count = np.zeros(n_individuals, dtype=int)
    ret_idx = np.zeros(n_selected, dtype=int)
    candidates = np.arange(n_individuals)
    for i in range(n_selected):
        probabilities = (weights + 1) / (weights + 1).sum()
        # size=1 is kept so the random stream matches the previous version;
        # [0] extracts the scalar explicitly.
        pick = np.random.choice(candidates, size=1, replace=True, p=probabilities)[0]
        ret_idx[i] = pick
        draw_count[pick] += 1
        if draw_count[pick] >= 2:
            weights[pick] = 0

    return ret_idx






            

def _dump_pickle(path, obj):
    """Pickle ``obj`` to the file at ``path`` (overwriting it)."""
    with open(path, "wb") as f:
        pickle.dump(obj, f)


def evolution():
    """Evolve deterministic gate tables for the maze agent and plot the trend.

    Reads the module-level settings ``num_generation``, ``pop_size``, ``k``
    and ``fit_times``.  Every generation: evaluate each individual, keep the
    ``k`` fittest unchanged (elites), fill the rest by fitness-proportional
    selection followed by mutation.  Population and statistics are pickled
    to ``./save_model/`` every 100 generations and after the last one; the
    directory is created if it does not exist.
    """
    # initialize the maze environment
    maze_width = 10
    maze_length = 50
    maze = Maze(maze_width, maze_length)
    agent = Agent(maze)
    agent.input_ids = agent.best_input_ids
    agent.output_ids = agent.best_output_ids

    # initialize the agent population
    pop = init_pop_agent_deter(pop_size, agent)

    maze.print_maze()

    # statistics
    generation_trend = np.zeros(num_generation)
    elite_trend = np.zeros(num_generation)
    generation_trend_meanfit = np.zeros(num_generation)
    elite_trend_meanfit = np.zeros(num_generation)

    generation_pop_allfit = np.zeros((num_generation, pop_size, fit_times))
    generation_pop_fit = np.zeros((num_generation, pop_size))

    for i_ in np.arange(num_generation):
        # progress dots, 50 per line
        print(".", end="")
        if i_ % 50 == 0 and i_ > 0:
            print(" ")

        fitness_pop = np.zeros(pop_size, dtype=np.float32)
        fitness_pop_meanfit = np.zeros(pop_size, dtype=np.float32)

        #================1=================
        # evaluate the fitness of every individual
        for ind in np.arange(len(pop)):
            agent.gates = pop[ind]
            fitness_tmp = np.zeros(fit_times)
            for repeat_fit in np.arange(fit_times):
                agent.simple_reinit()
                while not agent.end:
                    agent.brain_update()
                    fitness_once, r = agent.step()
                fitness_tmp[repeat_fit] = fitness_once

            # geometric mean over the repeats, scaled to percent
            fitness = np.prod(fitness_tmp) ** (1 / fit_times) * 100
            fitness_pop[ind] = fitness
            agent.fitness = fitness

            generation_pop_fit[i_, ind] = fitness
            # This array was saved below but never filled before.
            generation_pop_allfit[i_, ind, :] = fitness_tmp

            fitness_pop_meanfit[ind] = np.mean(fitness_tmp)

        #================2=================
        # keep k elites without mutation
        elite_idx = np.argpartition(fitness_pop, -k)[-k:]
        elites = copy.deepcopy(pop[elite_idx])

        #================3=================
        # select the parents of the non-elite part
        idx = select(fitness_pop, k)
        other_pop = copy.deepcopy(pop[idx])

        #================4=================
        # save data
        elite_trend[i_] = np.mean(fitness_pop[elite_idx])
        elite_trend_meanfit[i_] = np.mean(fitness_pop_meanfit[elite_idx])
        generation_trend[i_] = np.mean(fitness_pop)
        generation_trend_meanfit[i_] = np.mean(fitness_pop_meanfit)

        if (i_ > 0 and i_ % 100 == 0) or i_ == num_generation - 1:
            # Without the directory every open() below raises
            # FileNotFoundError and the whole run is lost.
            os.makedirs("./save_model", exist_ok=True)

            _dump_pickle("./save_model/pop" + str(i_) + ".pickle", pop)
            _dump_pickle("./save_model/elites" + str(i_) + ".pickle", elites)

            _dump_pickle("./save_model/generation_trend.pickle", generation_trend)
            _dump_pickle("./save_model/elite_trend.pickle", elite_trend)
            _dump_pickle("./save_model/generation_trend_meanfit.pickle", generation_trend_meanfit)
            _dump_pickle("./save_model/elite_trend_meanfit.pickle", elite_trend_meanfit)

            _dump_pickle("./save_model/generation_pop_allfit.pickle", generation_pop_allfit)
            _dump_pickle("./save_model/generation_pop_fit.pickle", generation_pop_fit)

        #================5=================
        # mutate the selected parents; the returned child is stored so the
        # result is kept whether or not the mutation works in place
        for mut_ind, parent in enumerate(other_pop):
            other_pop[mut_ind] = mutate_deter(parent)

        pop = np.concatenate((elites, other_pop), axis=0)

    #================7=================
    # after evolution plot the results
    plt.title('Evolution Trend')
    plt.plot(np.arange(num_generation), generation_trend, color='green', label='Generation gmean trend', linestyle=':')
    plt.plot(np.arange(num_generation), elite_trend, color='red', label='Elite gmean trend', linestyle='-.')
    plt.xlabel("Evolution times")
    plt.ylabel("fitness (number of maze passes)")
    plt.legend()  # show the legend
    plt.show()


# Run the whole genetic-algorithm experiment when this cell executes;
# the two prints bracket the run.
print("Running")
evolution()

print("End")


        


    


          

In [None]:
#=========================utils===========================================================
def calculate_discout_reward(rewards, gamma):
    """Return the discounted return for every time step.

    ``result[t] = rewards[t] + gamma * result[t + 1]``, with the return after
    the last step taken as 0.  The result is a list as long as ``rewards``.
    """
    n_steps = len(rewards)
    returns = [0] * n_steps
    running = 0
    # Walk backwards so each return can reuse the one that follows it.
    for t in range(n_steps - 1, -1, -1):
        running = running * gamma + rewards[t]
        returns[t] = running
    return returns

def calculate_discout_reward_window(reward, gamma, length=3):
    """Return discounted rewards summed over a sliding look-ahead window.

    Entry ``t`` of the result equals
    ``sum(gamma**k * reward[t + k] for k in range(length))``, i.e. the
    current reward is undiscounted and later rewards inside the window are
    discounted progressively more.

    Args:
        reward: 1-D sequence of per-step rewards, in time order. It should
            hold at least ``length`` entries.
        gamma: discount factor applied once per step into the future.
        length: number of consecutive steps covered by each window.

    Returns:
        A 1-D NumPy array with ``len(reward) - length + 1`` entries (only
        windows that fit entirely inside ``reward`` are produced).
    """
    discount_weights = [gamma**i for i in range(length)]
    # np.convolve flips its second argument before sliding it, so the weights
    # are passed reversed; otherwise gamma**0 would multiply the *last* reward
    # of each window and the current reward would be discounted the most.
    return np.convolve(reward, discount_weights[::-1], 'valid')



#=========================Agent===========================================================
def _encode_action_bits(action_index, num_bits):
    """Return ``action_index`` as a zero-padded, most-significant-bit-first 0/1 array."""
    bits = format(int(action_index), "b").zfill(num_bits)
    if len(bits) > num_bits:
        raise IndexError(
            "action index %d does not fit in %d bit(s)" % (action_index, num_bits)
        )
    return np.array([int(bit) for bit in bits], dtype=int)


def _sample_gate_action(gate, gate_input, num_actions):
    """Sample one action from ``gate`` and return ``(index, one_hot_vector)``."""
    gate_output = gate(
        tf.expand_dims(
            tf.convert_to_tensor(gate_input, dtype=tf.float32), axis=0
        )
    )
    # Some gate models return (logits, extra); only the logits are used here.
    if isinstance(gate_output, tuple):
        gate_output, _ = gate_output

    action_prob = tf.nn.softmax(gate_output)
    action_index = tf.random.multinomial(tf.log(action_prob), 1).numpy()[0][0]

    one_hot = np.zeros(num_actions)
    one_hot[action_index] = 1
    return action_index, one_hot


def collect_experience(env, agent, number_action=8):
    """Roll out one episode with the agent's two gates and record the trajectory.

    At every step the memory gate (2 actions, 1 bit) is sampled from the
    ``agent.memory_index`` slice of the observation and its bit is written to
    ``agent.brain[6]``; then the control gate (4 actions, 2 bits) is sampled
    from the ``agent.control_index`` slice and its bits are written to
    ``agent.brain[7:]``. Both gates see the observation taken *before* either
    write. The agent then steps and perceives again.

    Action indices are encoded as fixed-width binary, most significant bit
    first. The previous un-padded encoding turned control action 1 into
    ``[1, 0]`` (the same bits as action 2), which made ``[0, 1]`` unreachable.

    Args:
        env: unused; kept so existing callers keep working.
        agent: object providing ``perception()``, ``step()``, ``brain``,
            ``end``, ``memory_index``, ``control_index``, ``Gate_memory`` and
            ``Gate_control``.
        number_action: unused; kept so existing callers keep working.

    Returns:
        Tuple ``(observations, rewards, memory_actions, control_actions)`` of
        stacked NumPy arrays. ``observations`` holds one more entry than the
        others (the observation after the final step); the action arrays are
        one-hot with 2 and 4 columns respectively.

    Raises:
        ValueError: from ``np.stack`` when the episode is already finished on
            entry, because no reward was collected.
    """
    observations, rewards = [], []
    action_taken_for_memory, action_taken_for_control = [], []

    agent.perception()
    obs = np.copy(agent.brain)

    while not agent.end:
        # Memory gate: one output bit stored in brain node 6.
        memory_choice, action_for_memory = _sample_gate_action(
            agent.Gate_memory, obs[agent.memory_index], 2
        )
        agent.brain[6] = _encode_action_bits(memory_choice, 1)[0]

        # Control gate: two output bits stored in brain nodes 7 onwards.
        control_choice, action_for_control = _sample_gate_action(
            agent.Gate_control, obs[agent.control_index], 4
        )
        agent.brain[7:] = _encode_action_bits(control_choice, 2)

        _, r = agent.step()
        agent.perception()

        observations.append(obs)
        rewards.append(r)
        action_taken_for_memory.append(action_for_memory)
        action_taken_for_control.append(action_for_control)

        # Copy so later in-place writes to agent.brain do not alter the record.
        obs = np.copy(agent.brain)

    # Keep the final observation too; callers drop it when they do not need it.
    observations.append(obs)

    return np.stack(observations), np.stack(rewards), np.stack(action_taken_for_memory), np.stack(action_taken_for_control)

            
            
#=========================main===========================================================
# Module-level logger named after the running script; only referenced from a
# commented-out logging call inside main().
logger = logging.getLogger(os.path.basename(sys.argv[0]))

def main():
    """Train the memory/control gates with policy gradients and adapt node sampling rates.

    Runs 10 outer rounds. Each round collects 50 samples; every sample builds
    a fresh ``Maze(10, 50)``, trains both gates on it for ``episode_train``
    episodes, records which brain nodes fed each gate, then resamples the
    input nodes and re-creates both gate networks. After each round the
    per-node sampling rates are replaced by a softmax over the rewards summed
    per node. Finally the reward curve is plotted and both gates are
    checkpointed under ``./tfmodel``.

    Hyper-parameters are hard-coded below; the disabled command-line parsing
    is kept here for reference:

    # -------------------- * --------------------
    argparser = argparse.ArgumentParser('PG', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    argparser.add_argument('--gamma', action="store", type=float, default=0.99)
    argparser.add_argument('--learning-rate', action="store", type=float, default=8e-4)
    argparser.add_argument('--episode-train', action="store", type=int, default=500)

    argparser.add_argument('--output-save-img', action="store", type=str, default=None)

    args = argparser.parse_args(argv)
    gamma = args.gamma
    learning_rate = args.learning_rate
    episode_train = args.episode_train
    output_save_img = args.output_save_img
    # -------------------- * --------------------
    """
    
    # Discount factor, episodes trained per maze, and Adam step size.
    gamma =  0.99
    episode_train = 50
    learning_rate = 1e-3

    maze = Maze(10,50)
    agent = Agent(maze)
    
    # Disabled: restoring a previously saved single-gate checkpoint.
    """checkpoint_directory = "./tfmodel"
    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt-1")
    checkpoint = tf.train.Checkpoint(Gate=agent.Gate)   
    checkpoint.restore(checkpoint_prefix)"""

    # One optimizer instance applies the updates of both gates, including the
    # gate networks that are re-created after every sample below.
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

    # Despite its name, average_reward is a running SUM of per-episode mean
    # rewards; it is logged and reset every 5 episodes.
    average_reward, reward_curve = 0, []
    
    for sample_rate_update_ind in range(10):
        print("episode ", sample_rate_update_ind)
    #===================collect samples (50 per round here) for updating nodes rates========================================
        nodes_for_memory, nodes_for_control, rewards = [], [], []
        # repeat to get stable gradients for node rates
        for sample_rate_ind in range(50):
        #training an agent 
            for rpt in range(1): # sample number of mazes
                maze = Maze(10,50)
                agent.maze = maze
                print(agent.maze.average_reward)
                for eps in range(episode_train): # train on a maze

                    agent.simple_reinit()
                    observation_rollout, reward_rollout, action_rollout_for_memory, action_rollout_for_control = collect_experience(agent.maze, agent)
                    discounted_reward_rollout = calculate_discout_reward(reward_rollout, gamma)

                    # Remove last observation
                    # (collect_experience appends the post-episode observation,
                    # which has no matching action or reward)
                    observation_rollout = observation_rollout[:-1]

                    # Policy-gradient step for the memory gate: maximise the
                    # log-probability of the taken actions weighted by the
                    # discounted return.
                    # NOTE(review): unlike collect_experience, the gate output
                    # is used directly here with no tuple unpacking.
                    with tf.GradientTape() as tape:
                        tape.watch(agent.Gate_memory.variables)

                        all_prob = tf.log(tf.nn.softmax(agent.Gate_memory(
                            tf.convert_to_tensor(observation_rollout[:,agent.memory_index], dtype=tf.float32)
                        )))

                        # The one-hot action mask selects the log-prob of the action taken.
                        all_prob_masked = tf.reduce_sum(action_rollout_for_memory *  all_prob, axis=-1)
                        loss = tf.reduce_sum(all_prob_masked * discounted_reward_rollout * -1)
                    grad = tape.gradient(loss, agent.Gate_memory.variables)
                    optimizer.apply_gradients(zip(grad, agent.Gate_memory.variables))

                    # Same update for the control gate, using its own input nodes.
                    with tf.GradientTape() as tape:
                        tape.watch(agent.Gate_control.variables)

                        all_prob = tf.log(tf.nn.softmax(agent.Gate_control(
                            tf.convert_to_tensor(observation_rollout[:, agent.control_index], dtype=tf.float32)
                        )))

                        all_prob_masked = tf.reduce_sum(action_rollout_for_control *  all_prob, axis=-1)
                        loss_c = tf.reduce_sum(all_prob_masked * discounted_reward_rollout * -1)
                    grad_c = tape.gradient(loss_c, agent.Gate_control.variables)
                    optimizer.apply_gradients(zip(grad_c, agent.Gate_control.variables))


                    average_reward += np.mean(reward_rollout)

                    if eps%5 == 0 and eps > 0:
                        print("Currently in episode %d and the average reward is %f" % (eps, average_reward))
                        #logger.info("Currently in episode {eps} and the average reward is {average_reward}" )
                        reward_curve.append(average_reward)
                        average_reward = 0


                # Remember which nodes fed each gate for this sample.
                # NOTE(review): average_reward holds only what accumulated
                # since the last reset above (and is not reset here), so the
                # score stored per sample covers just the trailing episodes
                # and leaks into the next sample's first log line - confirm
                # this is intended.
                nodes_for_memory.append(agent.memory_index)
                nodes_for_control.append(agent.control_index)
                rewards.append(average_reward)

            #====================resampling the input nodes=========================================================
            # Each brain node is included independently with its current rate.
            agent.memory_index = [] # np.array([], dtype = np.int)
            for ind in np.arange(agent.brain_size):
                if np.random.binomial(1, agent.memory_rate[ind]):
                    agent.memory_index.append(ind)
                    # np.append(self.memory_index, ind)
            agent.memory_index = np.array(agent.memory_index, dtype=np.int)

            agent.control_index = []
            for ind in np.arange(agent.brain_size):
                if np.random.binomial(1, agent.control_rate[ind]):
                    agent.control_index.append(ind)
            agent.control_index = np.array(agent.control_index, dtype=np.int)    

            # The input width changed, so both gate networks are rebuilt from
            # scratch (previous weights are discarded).
            agent.Gate_memory = NeuroGate_memory(len(agent.memory_index))
            agent.Gate_control = NeuroGate_control(len(agent.control_index))

        #=========================update nodes sampling rate===============================    
        # Credit every node with the summed score of the samples that used it.
        memory_nodes_rewards = np.zeros(agent.brain_size)
        control_nodes_rewards = np.zeros(agent.brain_size)
        for val, rewards_val in zip(nodes_for_memory, rewards):
            for nodes_val in val:
                memory_nodes_rewards[nodes_val] += rewards_val
        for val, rewards_val in zip(nodes_for_control, rewards):
            for nodes_val in val:
                control_nodes_rewards[nodes_val] += rewards_val

        # Softmax over the per-node scores; the resulting rates sum to 1 and
        # are used above as independent inclusion probabilities.
        memory_nodes_rewards = np.exp(memory_nodes_rewards)
        control_nodes_rewards = np.exp(control_nodes_rewards)
        agent.memory_rate = memory_nodes_rewards/memory_nodes_rewards.sum()
        agent.control_rate = control_nodes_rewards/control_nodes_rewards.sum()
        
        print(agent.memory_rate)
        print(agent.control_rate)
        
        

    #sns.lineplot(y=reward_curve, x=list(range(len(reward_curve))))
    plt.plot(range(len(reward_curve)), reward_curve)
    plt.show()
    #plt.savefig(output_save_img)
    
    # Save the gates that exist at the end of the run (the ones created after
    # the last resampling).
    checkpoint_directory = "./tfmodel"
    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
    checkpoint = tf.train.Checkpoint(Gate_memory=agent.Gate_memory, Gate_control = agent.Gate_control)
    checkpoint.save(file_prefix=checkpoint_prefix)
    #status = checkpoint.restore(tf.train.latest_checkpoint(checkpoint_path))

#logging.basicConfig(stream=sys.stdout, level=logging.INFO)
#print(' '.join(sys.argv))

# Run the training when this cell executes.
main()

In [None]:
def test():
    """Roll out a checkpointed ``Agent_circular`` on a fresh maze and print the outcome.

    Builds a ``Maze(10, 40)``, restores ``agent.Gate`` from
    ``./tfmodel/ckpt-1``, then repeatedly samples an action from the gate,
    writes it as three bits into ``agent.brain[6:]`` and steps the agent until
    ``agent.end`` is set. The brain is printed every step; fitness, pass count
    and trajectory are printed at the end.

    The sampled action index is encoded as zero-padded binary, most
    significant bit first. The previous un-padded encoding left-aligned short
    numbers, so e.g. action 1 produced ``[1, 0, 0]`` instead of ``[0, 0, 1]``.

    Raises:
        IndexError: if the sampled action index needs more than three bits.
    """
    num_output_bits = 3

    maze = Maze(10,40)
    maze.print_maze()

    agent = Agent_circular(maze)
    checkpoint_directory = "./tfmodel"
    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt-1")
    checkpoint = tf.train.Checkpoint(Gate=agent.Gate)
    checkpoint.restore(checkpoint_prefix)

    agent.perception()
    obs = np.copy(agent.brain)
    while not agent.end:
        out = agent.Gate(
            tf.expand_dims(
                tf.convert_to_tensor(obs, dtype=tf.float32), axis=0
            )
        )
        action_prob = tf.nn.softmax(out)

        # Sample
        samples = tf.random.multinomial(tf.log(action_prob), 1).numpy()[0][0]

        # NOTE(review): only three output bits are written, so the gate is
        # assumed to expose at most 8 actions - confirm against
        # Agent_circular.Gate.
        bits = format(int(samples), "b").zfill(num_output_bits)
        if len(bits) > num_output_bits:
            raise IndexError(
                "action index %d does not fit in %d bits" % (samples, num_output_bits)
            )
        agent.brain[6:] = np.array([int(bit) for bit in bits], dtype=int)
        print(agent.brain)

        agent.step()
        agent.perception()
        # Copy so later in-place writes to agent.brain do not alter obs.
        obs = np.copy(agent.brain)

    print(agent.fitness)
    print(agent.pass_maze)
    print(agent.trajectory)
    
        
        
        
    
  

# Uncomment to fix NumPy's seed (e.g. for a repeatable maze layout) before
# running the rollout check.
#np.random.seed(9)
test()


Search for the gates incrementally (add-on search): find the best first gate, fix it, then search for the second gate, and so on.

In [None]:
# Environment and agent shared by every candidate evaluation in this cell.
maze = Maze(10, 50)
agent = Agent(maze)
num_gates, num_input, num_output = 4, 4, 3


def _random_gate(num_in, num_out):
    """Return a random (2**num_in, 2**num_out) table whose rows each sum to 1."""
    table = np.random.rand(2 ** num_in, 2 ** num_out)
    row_totals = np.sum(table, axis=1, dtype=np.float64).reshape(-1, 1)
    return table / row_totals


# Candidate input sets: every non-decreasing triple of nodes, plus node 6.
node = [0, 1, 2, 3, 4, 5, 7, 8]
inputs = np.array([
    [first, second, third, 6]
    for pos_a, first in enumerate(node)
    for pos_b, second in enumerate(node[pos_a:], start=pos_a)
    for third in node[pos_b:]
])

# Candidate output sets: every non-decreasing triple of the nodes 6, 7, 8.
out = [6, 7, 8]
outputs = np.array([
    [first, second, third]
    for pos_a, first in enumerate(out)
    for pos_b, second in enumerate(out[pos_a:], start=pos_a)
    for third in out[pos_b:]
])

print(len(inputs))
print(len(outputs))


# fitness[i, j]: result of one rollout with candidate inputs i and outputs j.
fitness = np.zeros((len(inputs), len(outputs)))
for i, candidate_inputs in enumerate(inputs):
    # Progress dots, with a line break every 100 candidates.
    print(".", end="")
    if i > 0 and i % 100 == 0:
        print(" ")
    for j, candidate_outputs in enumerate(outputs):
        # Three gates with fixed wiring; their tables are redrawn at random
        # for every candidate pair.
        input_ids = [[4, 7, 8, 6], [3], [0, 3, 4, 6]]
        output_ids = [[6, 7, 8], [6], [6, 8]]
        gates = [
            _random_gate(len(fixed_in), len(fixed_out))
            for fixed_in, fixed_out in zip(input_ids, output_ids)
        ]

        # The candidate gate is wired to the distinct nodes of each set.
        input_ = np.unique(candidate_inputs)
        output_ = np.unique(candidate_outputs)
        input_ids.append(input_)
        output_ids.append(output_)
        gates.append(_random_gate(len(input_), len(output_)))

        agent.input_ids = input_ids
        agent.output_ids = output_ids
        agent.gates = gates

        # Roll out one episode and keep the fitness reported by the last step.
        agent.simple_reinit()
        while agent.end == False:
            agent.brain_update()
            fit, r = agent.step()
        fitness[i, j] = fit

sns.heatmap(fitness)





Search with nodes 3 and 6 fixed as inputs of every gate, then run a grid search over all gates.

In [None]:
# Environment and agent shared by every candidate evaluation in this cell.
maze = Maze(10,50)
agent = Agent(maze)
num_gates = 4
num_input = 4
num_output = 3


def _random_gate(num_in, num_out):
    """Return a random (2**num_in, 2**num_out) table whose rows each sum to 1."""
    table = np.random.rand(2 ** num_in, 2 ** num_out)
    return table / np.sum(table, axis=1, dtype=np.float64)[:, None]


# Candidate input sets: every non-decreasing pair of nodes, plus the fixed
# nodes 3 and 6.
inputs = []
node = [0,1,2,4,5,7,8]
for i in range(len(node)):
    for j in range(i, len(node)):
        inputs.append([node[i],node[j],3, 6])
inputs = np.stack(inputs)

# Candidate output sets: strictly increasing triples of 6, 7, 8, i.e. only
# [6, 7, 8] itself.
outputs = []
out = [6,7,8]
for i in range(len(out)):
    for j in range(i+1,len(out)):
        for k in range(j+1,len(out)):
            outputs.append([out[i],out[j],out[k]])
outputs = np.stack(outputs)

print(len(inputs))
print(len(outputs))

# Disabled: earlier single-gate version of the search, kept for reference.
"""
fitness = np.zeros(( len(inputs), len(outputs)))
for i in range(len(inputs)):
        print(".", end="")
        if i>0 and i % 100 == 0: print(" ")
        for j in range(len(outputs)):
            input_ids = []
            output_ids = []
            gates = []

            input_ = np.unique(inputs[i])
            output_ = np.unique(outputs[j])

            gate = np.random.rand(2**len(input_), 2**len(output_))
            gate = gate.astype(np.float64) / np.sum(gate, axis=1, dtype=np.float64)[:, None]

            input_ids.append(input_)
            output_ids.append(output_)
            gates.append(gate)

            agent.input_ids=input_ids
            agent.output_ids=output_ids
            agent.gates=gates

            agent.simple_reinit()
            while (agent.end == False):
                #maze.print_maze(agent.position[0], agent.position[1])
                agent.brain_update()
                #print(agent.brain, agent.position)
                fit,r = agent.step()
            fitness[i,j] = fit
            
sns.heatmap(fitness)       
"""


# fitness[i, i1, i2]: result of one rollout with three gates wired to input
# sets i <= i1 <= i2 (entries outside that ordering stay 0).
# NOTE(review): the output choices (j, j1, j2) are not part of the index, so
# with more than one output set later combinations would overwrite earlier
# ones; with the single output set built above this cannot happen.
fitness = np.zeros((len(inputs), len(inputs), len(inputs)))
for i in range(len(inputs)):
    # Progress dots, with a line break every 100 candidates.
    print(".", end="")
    if i>0 and i % 100 == 0: print(" ")
    for j in range(len(outputs)):
        for i1 in range(i, len(inputs)):
            for j1 in range(j, len(outputs)):
                for i2 in range(i1, len(inputs)):
                    for j2 in range(j1,len(outputs)):
                        # Each gate is wired to the distinct nodes of its set.
                        input_ids = [np.unique(inputs[n]) for n in (i, i1, i2)]
                        output_ids = [np.unique(outputs[n]) for n in (j, j1, j2)]
                        # Fresh random tables for all three gates.
                        gates = [
                            _random_gate(len(gate_in), len(gate_out))
                            for gate_in, gate_out in zip(input_ids, output_ids)
                        ]

                        agent.input_ids=input_ids
                        agent.output_ids=output_ids
                        agent.gates=gates

                        # Roll out one episode; keep the last reported fitness.
                        agent.simple_reinit()
                        while not agent.end:
                            agent.brain_update()
                            fit,r = agent.step()
                        fitness[i,i1,i2] = fit

# seaborn's heatmap needs 2-D data, so the 3-D fitness cube is not plotted
# directly (doing so raised before the results were saved). Plot the best
# fitness over the first gate for every (second, third) gate pair instead.
a= np.amax(fitness, 0)
sns.heatmap(a)
with open("./fitness.pickle","wb") as f:
    pickle.dump(fitness, f)

# Report where the maximum is reached and the input set chosen for each of the
# three gates at the first maximum, instead of indices copied from an earlier
# run.
best = np.where(fitness == np.max(fitness))
print(best)
for gate_choice in best:
    print(inputs[gate_choice[0]])