In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import os, sys
import random
import pickle
import argparse
import logging
import tensorflow as tf
# TF1-style API: switch TensorFlow to eager execution so ops run immediately
# and `.numpy()` is available on tensors used further down.
tf.enable_eager_execution()

# Seed all three random sources used in this notebook (NumPy, TensorFlow,
# Python's `random`) with the same fixed value.
np.random.seed(5)
tf.random.set_random_seed(5)
random.seed(5)

# Print arrays with two decimals and never abbreviate them with "...".
np.set_printoptions(precision=2, threshold=np.inf)

class Maze(object):
    """Corridor maze: a grid crossed by wall columns, each pierced by one door.

    The grid ``self.maze`` has shape ``(width, length)``. Throughout the class
    ``x`` is the row index (first axis) and ``y`` the column index (second
    axis). Rows ``0`` and ``width-1`` stay walls, column ``0`` is always road
    and column ``length-1`` is always the last wall.

    Cell values:
        WALL  (2) -- impassable cell.
        EMPTY (8) -- road cell.
        LEFT  (0) -- door cell; the next door is at a smaller row index.
        RIGHT (1) -- door cell; the next door is at the same or a larger row.

    Attributes set during construction:
        wall_position      -- ascending column indices of the walls.
        road_position      -- column indices (0..length-2) that are road.
        door_position      -- row of the door in each wall (same order).
        maze_mask          -- per column: door row for walls, 0 for roads.
        shortest_solutions -- per column: steps to the exit from the door row.
        longest_shortest   -- padded normaliser for the per-step reward.
        best_score         -- score collected by the scripted cue follower.
    """
    WALL = 2
    EMPTY = 8
    LEFT = 0
    RIGHT = 1 # right or forward

    def __init__(self, width, length):
        """Build a random maze with ``width`` rows and ``length`` columns."""
        self.length = length
        self.width = width
        self.maze = np.ones((self.width, self.length)) * Maze.WALL

        self.generate_maze()

        # sets self.maze_mask and self.shortest_solutions
        self.get_shortest_solutions()

        # sets self.longest_shortest, used to calculate the objective value
        self.get_longest_shortest_solutions()

        # used to normalize the objective value
        self.best_score = self.get_attainable_score()

    def generate_maze(self):
        """Randomly place walls, carve roads, choose doors and write door cues."""
        # Walls sit at the running sum of random gaps of 1..3 columns.
        spaces = np.random.randint(low=1, high=4, size=self.length)
        cum_spaces = np.cumsum(spaces)

        for ind, val in enumerate(cum_spaces):
            if val >= self.length-1:
                self.wall_position = cum_spaces[:ind]
                break
        # The first wall is always column 1 and the last one column length-1.
        if self.wall_position[0] > 1:
            self.wall_position[0] = 1
        if self.wall_position[-1] < self.length-1:
            self.wall_position = np.append(self.wall_position, self.length-1)

        # Every column before the last one that is not a wall is a road.
        wall_columns = set(self.wall_position.tolist())
        self.road_position = np.array(
            [col for col in range(self.length-1) if col not in wall_columns],
            dtype=int,
        )
        self.maze[1:-1, self.road_position] = Maze.EMPTY

        n_walls = len(self.wall_position)

        # This draw was immediately overwritten in the original code; it is
        # kept (and discarded) only so the global NumPy random stream, and
        # therefore every seeded maze, stays exactly the same.
        np.random.randint(low=1, high=self.width-1, size=n_walls)

        # Pick door rows from the last wall backwards (rows 1..width-2).
        # Two adjacent wall columns form one thick wall and share a door row.
        self.door_position = np.zeros(n_walls, dtype=int)
        self.door_position[-1] = np.random.randint(low=1, high=self.width-1)
        for ind in range(n_walls-2, -1, -1):
            if self.wall_position[ind] == self.wall_position[ind+1] - 1:
                self.door_position[ind] = self.door_position[ind+1]
            else:
                self.door_position[ind] = np.random.randint(low=1, high=self.width-1)

        # Write the cue into each door cell; the last door defaults to RIGHT.
        self.maze[self.door_position[-1], self.wall_position[-1]] = Maze.RIGHT
        for ind in range(n_walls-2, -1, -1):
            if self.door_position[ind+1] < self.door_position[ind]:
                cue = Maze.LEFT
            else:
                cue = Maze.RIGHT
            self.maze[self.door_position[ind], self.wall_position[ind]] = cue

    def print_maze(self, x=-1, y=-1):
        """Print the grid: 'H' wall, blank road, digit door cue, 'T' agent.

        The agent marker is drawn at row ``x``, column ``y`` only when both
        are non-negative. The grid itself is never modified.
        """
        show_agent = x >= 0 and y >= 0

        print("  ", end="")
        print("\n")

        for j in range(self.width):
            print('%d ' % j, end='')
            for i in range(self.length):
                cell = self.maze[j, i]
                if show_agent and j == x and i == y:
                    print('T ', end='')
                elif cell == Maze.WALL:
                    print('H ', end='')
                elif cell == Maze.EMPTY:
                    print('  ', end='')
                else:
                    print('%d ' % cell, end='')
            print('\n')

    def get_shortest_solutions(self):
        """Compute ``maze_mask`` and the per-column distance to the exit.

        ``shortest_solutions[y]`` counts the steps from column ``y`` to the
        last column, assuming the walker already stands on the row of the
        next door (for a road column) or inside the door (for a wall column).
        """
        self.maze_mask = np.zeros(self.length, dtype=int)
        self.maze_mask[self.wall_position] = self.door_position

        self.shortest_solutions = np.zeros(self.length, dtype=int)
        step = 0
        next_wall = self.length-1
        for ind in range(self.length-2, -1, -1):
            if self.maze_mask[ind] == 0: # road
                step += 1
            else: # wall: 1 step through the door + vertical shift to the next door
                step += np.abs(self.maze_mask[next_wall] - self.maze_mask[ind])+1
                next_wall = ind
            self.shortest_solutions[ind] = step

    def get_distance_escape(self, x, y):
        """Return the shortest number of steps from ``(x, y)`` to the exit.

        On a road column the vertical distance to the door of the next wall is
        added to ``shortest_solutions[y]``; on a wall column (i.e. inside a
        door) the tabulated value is returned unchanged.
        """
        vertical_distance = 0
        if y in self.road_position:
            # Look up the real next wall. A fixed 3-column look-ahead is not
            # enough: forcing the first wall to column 1 can leave up to four
            # road columns before the second wall, and stopping on a road
            # column would read a door row of 0 from maze_mask.
            walls_ahead = self.wall_position[self.wall_position > y]
            next_wall_ind = walls_ahead.min()
            vertical_distance = np.abs(self.maze_mask[next_wall_ind] - x)
        return self.shortest_solutions[y]+vertical_distance

    def get_longest_shortest_solutions(self):
        """Set ``longest_shortest`` from the two corner cells of column 0."""
        left = self.get_distance_escape(1,0)
        right = self.get_distance_escape(self.width-2,0)

        # Padded by 5 so the normaliser is larger than either corner distance.
        self.longest_shortest = np.maximum(left, right)+5

    def get_attainable_score(self):
        """Score collected by a scripted walker that follows the door cues.

        The walker starts in column 0 on the row of the first door, runs for
        ``Agent_circular.LIFE`` steps and wraps back to column 0 every time it
        reaches the last column. The result is used to normalise fitness.
        """
        x = self.door_position[0] # in front of the first door
        y = 0
        score = np.float32(0)
        pass_maze = 0
        door_signal = self.maze[self.door_position[0], 1]
        for _ in range(Agent_circular.LIFE):
            if y != self.length-1:
                score += (self.longest_shortest - self.get_distance_escape(x,y) )/self.longest_shortest + pass_maze
            if self.maze[x, y+1] != Maze.WALL: # way ahead is open
                y += 1
                if y in self.wall_position:
                    # standing inside a door: pick up the cue for the next one
                    door_signal = self.maze[x,y]
                if y == self.length-1:
                    pass_maze += 1
                    y = 0
            else: # blocked: slide along the wall in the cued direction
                # bounce off the top / bottom border by flipping the cue
                if door_signal == 0 and self.maze[x-1,y] == Maze.WALL:
                    door_signal = 1
                if door_signal == 1 and self.maze[x+1,y] == Maze.WALL:
                    door_signal = 0
                x += int(door_signal*2-1)

        return score


    
#=========================models===========================================================
class NeuroGate(tf.keras.Model):
    """Small feed-forward network: 12 inputs -> 64 sigmoid units -> 2 outputs.

    The two outputs are unnormalised scores; callers apply the softmax.
    """

    def __init__(self):
        super().__init__()
        layers = tf.keras.layers
        stack = [
            layers.InputLayer(input_shape=(12,)),
            layers.Dense(units=64, activation=tf.nn.sigmoid),
            layers.Dense(units=2),
        ]
        self.func = tf.keras.Sequential(stack)

    def call(self, obs):
        """Run ``obs`` through the layer stack and return its output."""
        scores = self.func(obs)
        return scores
                   
class Agent_circular:
    """Agent that walks a ``Maze`` repeatedly for a fixed number of steps.

    ``brain`` is a vector of ``brain_size`` (12) values:
        brain[0..2]  -- 1 if the cell ahead (column y+1) is a wall, at rows
                        x, x-1 and x+1 respectively.
        brain[3]     -- value of the current cell when standing in a wall
                        column (i.e. inside a door), otherwise 0.
        brain[4..5]  -- 1 if the cell at row x-1 / x+1 of the current column
                        is a wall.
        brain[6..9]  -- not written by this class except when the whole brain
                        is zeroed; kept across ``init_locate``.
        brain[10..11]-- movement order read by ``step``.

    ``position`` is ``(x, y)`` = (row, column) in ``maze.maze``.
    """
    LIFE = 3000
    brain_size=12

    def __init__(self, maze):
        """Create the six gate networks and place the agent at the start."""
        self.maze = maze
        self.brain_size = Agent_circular.brain_size

        # One network per brain slot 6..11, created in that order.
        for slot in range(6, 12):
            setattr(self, "Gate%d" % slot, NeuroGate())

        self._reset_episode_state()

    def _reset_episode_state(self):
        """Zero brain, score and counters and put the agent back at the start."""
        self.brain = np.zeros(self.brain_size)
        self.score = np.float32(0)

        self.end = False # set once the life budget is used up
        self.time_step = 0 # +1 for every accepted move
        self.thinking_times = 0 # +1 for every call to step()
        self.life = Agent_circular.LIFE
        self.pass_maze = 0

        # column 0, on the row of the last door
        self.position = np.array([self.maze.door_position[-1], 0])
        self.trajectory = np.ones((self.life, 2))*-1
        self.trajectory[self.time_step,:] = self.position

        self.door_direction()
        self.perception()

    # reinit when the genome has no changes, used in fitness evaluation
    def simple_reinit(self):
        """Start a new episode on ``self.maze`` without touching the gates."""
        self._reset_episode_state()

    def init_locate(self):
        """Pull the agent back to column 0 after it reached the last column."""
        self.position = np.array([self.maze.door_position[-1], 0])
        self.end = False

        # Clear the sensors and the movement order; brain[6:10] is kept.
        self.brain[:6] = 0
        self.brain[10:] = 0

        self.door_direction()
        self.perception()

    def door_direction(self):
        """Hook that currently does nothing.

        brain[3] is filled by ``perception`` instead.
        """
        pass

    def perception(self):
        """Refresh the sensor part of the brain (slots 0..5) from the maze."""
        row, col = self.position
        grid = self.maze.maze

        self.brain[:6] = 0

        # (brain slot, row, column) of every cell that is probed for a wall
        probes = (
            (0, row, col+1),
            (1, row-1, col+1),
            (2, row+1, col+1),
            (4, row-1, col),
            (5, row+1, col),
        )
        for slot, r, c in probes:
            self.brain[slot] = 1 if grid[r, c] == Maze.WALL else 0

        # inside a door: expose the cue stored in the door cell
        if col in self.maze.wall_position:
            self.brain[3] = grid[row, col]

    def _try_move(self, row, col):
        """Move to ``(row, col)`` and count the move, unless it is a wall."""
        if self.maze.maze[row, col] != Maze.WALL:
            self.position = row, col
            self.time_step = self.time_step+1

    def step(self):
        """Collect the reward for the current cell, then execute the order.

        Orders ``(brain[10], brain[11])``: (1, 0) row+1, (0, 1) row-1,
        (1, 1) column+1, (0, 0) stay (still counted as a move). Moves into a
        wall are ignored.

        Returns ``(fitness, r)``: ``r`` is this step's reward and ``fitness``
        is 0 except on the step that exhausts the life budget.
        """
        x, y = self.position
        normaliser = self.maze.longest_shortest
        r = (normaliser - self.maze.get_distance_escape(x,y))/normaliser + self.pass_maze
        self.score += r

        fitness = 0
        moves_before = self.time_step
        self.thinking_times = self.thinking_times + 1
        order = (self.brain[10], self.brain[11])

        if self.thinking_times > self.life-1:
            self.end = True
            fitness = self.get_fitness()
            self.fitness = fitness
        elif order == (1, 0):
            self._try_move(x+1, y)
        elif order == (0, 1):
            self._try_move(x-1, y)
        elif order == (1, 1):
            self._try_move(x, y+1)
            if self.position[1] == self.maze.length-1: # reached the end of the maze
                self.pass_maze = self.pass_maze + 1
                self.init_locate()
        elif order == (0, 0):
            self.position = x, y
            self.time_step = self.time_step+1

        # only accepted orders are recorded in the trajectory
        if self.time_step > moves_before:
            self.trajectory[self.time_step,:] = self.position

        return fitness, r

    def get_fitness(self):
        """Return the accumulated score divided by ``maze.best_score``."""
        return self.score/self.maze.best_score
    
    
        
        
    

    
        

        
                
    
#np.random.seed(9)
#import cProfile
#cProfile.run('test()')

        
                
            
    
        

In [None]:
#=========================utils===========================================================
def calculate_discout_reward(rewards, gamma):
    """Return the discounted reward-to-go for every step as a list.

    Element ``t`` of the result is
    ``rewards[t] + gamma*rewards[t+1] + gamma**2*rewards[t+2] + ...``.
    """
    reward_to_go = [0] * len(rewards)
    running = 0
    # Walk backwards so each entry reuses the total of the step after it.
    for t in range(len(rewards) - 1, -1, -1):
        running = running*gamma + rewards[t]
        reward_to_go[t] = running
    return reward_to_go

def calculate_discout_reward_window(reward, gamma, length=3):
    """Return discounted reward sums over a sliding window of ``length`` steps.

    Element ``t`` of the result is
    ``reward[t] + gamma*reward[t+1] + ... + gamma**(length-1)*reward[t+length-1]``,
    i.e. the same discounting as ``calculate_discout_reward`` but truncated
    to ``length`` steps. Only full windows are returned ('valid' mode), so
    the result has ``len(reward) - length + 1`` entries when
    ``len(reward) >= length``.

    Args:
        reward: 1-D sequence of per-step rewards.
        gamma: discount factor.
        length: number of steps in each window.

    Returns:
        1-D numpy array of windowed discounted sums.
    """
    discount_weights = [gamma**i for i in range(length)]
    # np.convolve flips its second argument. Passing the weights reversed
    # makes the weight 1 land on the first reward of each window and
    # gamma**(length-1) on the last one, instead of the other way round.
    return np.convolve(reward, discount_weights[::-1], 'valid')



#=========================Agent===========================================================
def collect_experience(env, agent, number_action=2):
    """Run ``agent`` until ``agent.end`` is set and record the rollout.

    On every step each of ``agent.Gate6`` .. ``agent.Gate11`` is evaluated on
    the current brain vector, one action index is sampled per gate (in the
    order 6..11) and written to ``agent.brain[6..11]`` before ``agent.step()``
    is called.

    Args:
        env: not used by this function.
        agent: object exposing ``brain``, ``end``, ``perception()``,
            ``step()`` and the six gate callables.
        number_action: length of the one-hot vector recorded per gate.

    Returns:
        An 8-tuple of stacked arrays: observations (one more row than there
        are steps, because the final observation is appended), rewards, and
        the one-hot actions of gates 6, 7, 8, 9, 10 and 11.
    """
    gate_slots = range(6, 12)
    observations, rewards = [], []
    actions_by_slot = {slot: [] for slot in gate_slots}

    agent.perception()
    obs = np.copy(agent.brain)

    while not agent.end:
        obs_batch = tf.expand_dims(
            tf.convert_to_tensor(obs, dtype=tf.float32), axis=0
        )

        drawn, one_hots = [], []
        for slot in gate_slots:
            gate_out = getattr(agent, "Gate%d" % slot)(obs_batch)
            # a gate may return (scores, extra); only the scores are used
            if isinstance(gate_out, tuple):
                scores, _ = gate_out
            else:
                scores = gate_out

            action_prob = tf.nn.softmax(scores)
            index = tf.random.multinomial(tf.log(action_prob), 1).numpy()[0][0]

            one_hot = np.zeros((number_action))
            one_hot[index] = 1

            drawn.append(index)
            one_hots.append(one_hot)

        # the sampled indices become brain slots 6..11 for this step
        for slot, index in zip(gate_slots, drawn):
            agent.brain[slot] = index

        _, r = agent.step()
        agent.perception()
        next_obs = np.copy(agent.brain)

        observations.append(obs)
        rewards.append(r)
        for slot, one_hot in zip(gate_slots, one_hots):
            actions_by_slot[slot].append(one_hot)

        obs = next_obs

    observations.append(obs)

    stacked = [np.stack(observations), np.stack(rewards)]
    for slot in gate_slots:
        stacked.append(np.stack(actions_by_slot[slot]))
    return tuple(stacked)
            

           
#=========================main===========================================================
# Module-level logger named after the basename of sys.argv[0]
# (the program that launched this code).
logger = logging.getLogger(os.path.basename(sys.argv[0]))

def main():
    """Train the six gate networks with a policy-gradient update and save them.

    Ten random 10x50 mazes are generated in turn; on each one the agent plays
    ``episode_train`` episodes. After every episode each gate is updated
    independently with the loss ``-sum(log pi(a_t | s_t) * G_t)``, where
    ``G_t`` is the discounted reward-to-go. The mean per-step reward,
    averaged over the episodes since the previous report, is printed and
    plotted, and the gates are written to ``./tfmodel/ckpt-<n>`` under the
    checkpoint keys ``Gate6`` .. ``Gate11``.
    """
    gamma =  1
    episode_train = 50
    learning_rate = 1e-4
    gate_names = ["Gate%d" % slot for slot in range(6, 12)]

    maze = Maze(10,50)
    agent = Agent_circular(maze)

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

    reward_sum, episodes_in_report, reward_curve = 0, 0, []
    for rpt in range(10):
        maze = Maze(10,50)
        agent.maze = maze
        for eps in range(episode_train):

            agent.simple_reinit()
            rollout = collect_experience(agent.maze, agent)
            observation_rollout, reward_rollout = rollout[0], rollout[1]
            action_rollouts = rollout[2:] # one one-hot array per gate, 6..11
            discounted_reward_rollout = calculate_discout_reward(reward_rollout, gamma)

            # Remove last observation (no action was taken from it)
            observation_rollout = observation_rollout[:-1]

            obs_tensor = tf.convert_to_tensor(observation_rollout, dtype=tf.float32)
            returns = tf.convert_to_tensor(discounted_reward_rollout, dtype=tf.float32)

            for gate_name, action_rollout in zip(gate_names, action_rollouts):
                gate = getattr(agent, gate_name)
                actions = tf.convert_to_tensor(action_rollout, dtype=tf.float32)
                with tf.GradientTape() as tape:
                    tape.watch(gate.variables)
                    # log_softmax stays finite where log(softmax(.)) would
                    # underflow to -inf and turn the masked product into NaN.
                    log_probs = tf.nn.log_softmax(gate(obs_tensor))
                    chosen_log_prob = tf.reduce_sum(actions * log_probs, axis=-1)
                    loss = tf.reduce_sum(chosen_log_prob * returns * -1)
                grads = tape.gradient(loss, gate.variables)
                optimizer.apply_gradients(zip(grads, gate.variables))

            reward_sum += np.mean(reward_rollout)
            episodes_in_report += 1

            if eps%5 == 0 and eps > 0:
                # Divide by the number of episodes actually accumulated: the
                # count varies (6 for the first report, 10 across a maze
                # change, 5 otherwise), so a plain sum is not comparable.
                average_reward = reward_sum / episodes_in_report
                print("Currently in episode %d and the average reward is %f" % (eps, average_reward))
                reward_curve.append(average_reward)
                reward_sum, episodes_in_report = 0, 0

    # Save before plotting so a plotting problem cannot lose the weights.
    # The agent has no single `Gate` attribute; every gate is tracked under
    # its own name.
    checkpoint_directory = "./tfmodel"
    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
    checkpoint = tf.train.Checkpoint(
        **{gate_name: getattr(agent, gate_name) for gate_name in gate_names}
    )
    checkpoint.save(file_prefix=checkpoint_prefix)

    plt.plot(range(len(reward_curve)), reward_curve)
    plt.show()
    #status = checkpoint.restore(tf.train.latest_checkpoint(checkpoint_path))

#logging.basicConfig(stream=sys.stdout, level=logging.INFO)
#print(' '.join(sys.argv))

# Run the training loop when this cell is executed.
main()



In [None]:
def test():
    """Roll out one episode on a fresh 7x10 maze with restored gates.

    Restores ``./tfmodel/ckpt-1``, which is expected to store the six gate
    networks under the checkpoint keys ``Gate6`` .. ``Gate11``. On every step
    the brain vector is printed; the full trajectory is printed at the end.
    """
    maze = Maze(7,10)
    maze.print_maze()

    agent = Agent_circular(maze)
    gate_names = ["Gate%d" % slot for slot in range(6, 12)]

    # tf.train.Checkpoint only accepts trackable objects, so the six Keras
    # models are passed individually rather than the plain agent object.
    checkpoint_directory = "./tfmodel"
    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt-1")
    checkpoint = tf.train.Checkpoint(
        **{gate_name: getattr(agent, gate_name) for gate_name in gate_names}
    )
    checkpoint.restore(checkpoint_prefix)

    agent.perception()
    obs = np.copy(agent.brain)
    while not agent.end:
        obs_batch = tf.expand_dims(
            tf.convert_to_tensor(obs, dtype=tf.float32), axis=0
        )

        # Each gate makes a two-way choice that becomes one brain slot (6..11).
        for slot, gate_name in zip(range(6, 12), gate_names):
            action_prob = tf.nn.softmax(getattr(agent, gate_name)(obs_batch))
            sample = tf.random.multinomial(tf.log(action_prob), 1).numpy()[0][0]
            agent.brain[slot] = sample
        print(agent.brain)

        _, r = agent.step()
        agent.perception()
        obs = np.copy(agent.brain)

    print(agent.trajectory)
        
        
        
    
  

#np.random.seed(9)
# Run the evaluation rollout when this cell is executed.
test()
