In [2]:
import os
import sys
import numpy as np
import pandas as pd
import altair as alt

# Simulação simples

In [2]:
# Discover which agent and environment modules are available on disk
agent_list, environment_list = (os.listdir(folder) for folder in ('agents', 'environments'))

for listing in (agent_list, environment_list):
    print(listing)

['ps_agent_basic.py', 'ps_agent_flexible.py', '__pycache__']
['env_invasion_game.py', 'env_invasion_game_lier.py', '__pycache__']


In [3]:
# Make the local agent/environment modules importable.
# The membership check keeps the cell idempotent: re-running it does not stack
# duplicate entries at the front of sys.path.
for module_dir in ('agents', 'environments'):
    if module_dir not in sys.path:
        sys.path.insert(0, module_dir)

In [10]:
# Environment: load the invasion-game module from the local `environments` directory
import env_invasion_game

# invasion_game requires no additional arguments
env = env_invasion_game.TaskEnvironment()

In [11]:
# Agent: load the basic PS agent module from the local `agents` directory
import ps_agent_basic

# Parameters for the agent - explanations can be found in the comments inside the agent file
gamma_damping = 0
eta_glow_damping = 1
policy_type = 'softmax'
beta_softmax = 1
num_reflections =  0

# The number of actions and the percept layout are read from the environment,
# so the agent is sized to match the task it will interact with
agent = ps_agent_basic.BasicPSAgent(
    env.num_actions, env.num_percepts_list, 
    gamma_damping, eta_glow_damping, 
    policy_type, beta_softmax, 
    num_reflections
)

In [12]:
"""Initialise and run interaction"""

# Number of trials to run and the cap on the number of steps within each trial
num_trials = 50
max_steps_per_trial = 100 #This doesn't actually matter for invasion_game, which ends every trial after a single step

In [13]:
# Record of performance: one entry per trial holding the average reward per step
learning_curve = np.zeros(num_trials)
reward = 0 #temporarily stores the reward for the most recent action
for i_trial in range(num_trials):
    reward_trial = 0 #additive counter of the total rewards earned during the current trial
    discretized_observation = env.reset()
    for t in range(max_steps_per_trial):
        #This is where the heart of the interaction takes place:
        #the agent is given the current observation plus the previous reward and returns an action
        action = agent.deliberate_and_learn(discretized_observation, reward)
        discretized_observation, reward, done = env.move(action)
        reward_trial += reward
        if done:
            break

    #t is the index of the last executed step, so t+1 is the number of steps taken
    learning_curve[i_trial] = float(reward_trial)/(t+1)

"""Return results"""
print(learning_curve)

[0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1.]


# Simulando parâmetros - agente básico - gamma_damping

In [31]:
# Discover which agent and environment modules are available on disk
agent_list, environment_list = (os.listdir(folder) for folder in ('agents', 'environments'))

for listing in (agent_list, environment_list):
    print(listing)

['ps_agent_basic.py', 'ps_agent_flexible.py', '__pycache__']
['env_invasion_game.py', 'env_invasion_game_lier.py', 'env_neverending_color.py', '__pycache__']


In [32]:
# Make the local agent/environment modules importable.
# The membership check keeps the cell idempotent: re-running it does not stack
# duplicate entries at the front of sys.path.
for module_dir in ('agents', 'environments'):
    if module_dir not in sys.path:
        sys.path.insert(0, module_dir)

In [126]:
# Environment: the "lier" variant of the invasion game, imported under a generic alias
# so the cells below do not need to name the concrete module
import env_invasion_game_lier as environment_class

# The environment is built here with no constructor arguments
env = environment_class.TaskEnvironment()

In [127]:
#agent
import ps_agent_basic

# Parameters for the agent - explanations can be found in the comments inside the agent file.
# gamma_damping is not fixed here: it is the quantity swept by the loop below.
eta_glow_damping = 0.7
policy_type = 'softmax'
beta_softmax = 1
num_reflections = 0

# Sweep configuration: every gamma_damping value is simulated for num_episodes
# independent runs, each with a freshly built environment and agent.
gamma_damping_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
num_episodes = 10
record_columns = ['rodada', 'episode', 'reward', 'gamma_damping',
                  'eta_glow_damping', 'observation', 'action']

# One DataFrame per (gamma_damping, episode) run; they are concatenated once at the
# end instead of growing `db` inside the loop, which is quadratic and starts from an
# empty frame.
run_frames = []
for gamma_damping in gamma_damping_values:
    for episode in range(num_episodes):
        env = environment_class.TaskEnvironment()
        agent = ps_agent_basic.BasicPSAgent(
            env.num_actions, env.num_percepts_list,
            gamma_damping, eta_glow_damping,
            policy_type, beta_softmax,
            num_reflections
        )

        # Record of performance for this run: one row per trial
        learning_curve = []
        reward = 0 #temporarily stores the reward for the most recent action
        for i_trial in range(num_trials):
            reward_trial = 0 #additive counter of the total rewards earned during the current trial
            discretized_observation = env.reset()
            for t in range(max_steps_per_trial):
                #This is where the heart of the interaction takes place
                action = agent.deliberate_and_learn(discretized_observation, reward)
                discretized_observation, reward, done = env.move(action)
                reward_trial += reward
                if done:
                    break

            # Trial and episode numbers are stored 1-based; reward is the per-step average
            learning_curve.append([
                i_trial + 1, episode + 1, float(reward_trial)/(t+1),
                gamma_damping, eta_glow_damping, discretized_observation, action
            ])

        run_df = pd.DataFrame(learning_curve, columns=record_columns)
        # Running mean of the reward up to and including each trial of this run
        run_df['blocking_efficiency'] = run_df['reward'].cumsum() / run_df['rodada']
        run_frames.append(run_df)

# Holds the simulation records of every run; each run keeps its own 0-based row index
db = pd.concat(run_frames)

In [128]:
# Preview the first ten rows of the collected simulation records
db.head(10)

Unnamed: 0,rodada,episode,reward,gamma_damping,eta_glow_damping,observation,action,blocking_efficiency
0,1,1,1.0,0.0,0.7,"[1, 0]",0,1.0
1,2,1,1.0,0.0,0.7,"[1, 0]",1,1.0
2,3,1,0.0,0.0,0.7,"[1, 1]",0,0.666667
3,4,1,0.0,0.0,0.7,"[0, 1]",1,0.5
4,5,1,1.0,0.0,0.7,"[0, 1]",1,0.6
5,6,1,1.0,0.0,0.7,"[1, 1]",1,0.666667
6,7,1,1.0,0.0,0.7,"[0, 0]",0,0.714286
7,8,1,1.0,0.0,0.7,"[0, 1]",0,0.75
8,9,1,1.0,0.0,0.7,"[0, 0]",1,0.777778
9,10,1,1.0,0.0,0.7,"[1, 1]",0,0.8


In [143]:
# Average the running blocking efficiency over all rows sharing a trial number and gamma_damping
mean_efficiency = (
    db.groupby(['rodada', 'gamma_damping'])['blocking_efficiency']
    .mean()
    .reset_index()
)

# One line per gamma_damping value (treated as a nominal category for colouring)
alt.Chart(mean_efficiency).mark_line().encode(
    x='rodada',
    y='blocking_efficiency',
    color='gamma_damping:N',
    tooltip=['gamma_damping'],
)

# Grid World

In [37]:
# Discover which agent and environment modules are available on disk
agent_list = os.listdir('agents')
environment_list = os.listdir('environments')

print(agent_list)
print(environment_list)

# Make the local modules importable. The membership check keeps the cell
# idempotent: re-running it does not stack duplicate entries on sys.path.
for module_dir in ('agents', 'environments'):
    if module_dir not in sys.path:
        sys.path.insert(0, module_dir)

['ps_agent_basic.py', 'ps_agent_flexible.py', '__pycache__']
['env_grid_world.py', 'env_invasion_game.py', 'env_invasion_game_lier.py', 'env_neverending_color.py', '__pycache__']


In [38]:
from time import sleep
from  IPython.display import clear_output
def grid(dimensions, position):
    """Render the grid world as text, marking the agent's cell with ``X``.

    Args:
        dimensions: two-element sequence; ``dimensions[0]`` cells are drawn per
            row and ``dimensions[1]`` rows are drawn.
        position: ``[x, y]`` coordinates of the agent, given as a NumPy array or
            as a plain sequence (list/tuple).

    Returns:
        A string with one line per row. Each cell is ``|X`` (agent) or ``| ``
        (empty), and every row is closed with ``|`` and a newline.
    """
    rows = []
    for j in range(dimensions[1]):
        # np.array_equal works for lists and tuples as well as arrays, whereas
        # `([i, j] == position).all()` raises AttributeError for a plain list.
        cells = ['|X' if np.array_equal([i, j], position) else '| '
                 for i in range(dimensions[0])]
        rows.append(''.join(cells) + '|\n')
    return ''.join(rows)

In [34]:
# Grid-world environment paired with the basic PS agent, both imported under generic aliases
import env_grid_world as environment_class
import ps_agent_basic as agent_class

# Grid size handed to the environment and reused by grid() when drawing
dimensions = [5,5]
env = environment_class.TaskEnvironment(dimensions)

#parameters for the agent - explanations can be found in the comments inside the agent file
gamma_damping = 0
eta_glow_damping = 0.12
policy_type = 'softmax'
beta_softmax = 1
num_reflections =  2

# The number of actions and the percept layout are read from the environment
agent = agent_class.BasicPSAgent(
    env.num_actions, env.num_percepts_list, 
    gamma_damping, eta_glow_damping, 
    policy_type, beta_softmax, 
    num_reflections
)

"""Initialise and run interaction"""

#set number of trials and maximum number of steps in each trial
num_trials = 50
max_steps_per_trial = env.max_steps_per_trial  # the step cap is taken from the environment itself

# Draw the grid with the agent at the environment's current position
print("{}".format(grid(dimensions, env.position)), end="")

|X| | | | |
| | | | | |
| | | | | |
| | | | | |
| | | | | |


In [35]:
# Record of performance: one entry per trial holding the number of steps the trial took
learning_curve = np.zeros(num_trials)
reward = 0 #temporarily stores the reward for the most recent action
for i_trial in range(num_trials):
    reward_trial = 0 #additive counter of the total rewards earned during the current trial (accumulated but not read in this cell)
    discretized_observation = env.reset()
    for t in range(max_steps_per_trial):
        #This is where the heart of the interaction takes place:
        #the agent is given the current observation plus the previous reward and returns an action
        action = agent.deliberate_and_learn(discretized_observation, reward)
        discretized_observation, reward, done = env.move(action)
        reward_trial += reward
        if done:
            break
    #t is the index of the last executed step, so t+1 is the number of steps taken
    learning_curve[i_trial] = t+1

"""Return results"""
print(learning_curve)

[103.  26.  26.  27.  19.  11.   9.  22.   9.  13.  10.  15.  10.  11.
   8.   8.   8.  10.   8.   8.   8.   8.   8.   8.   8.   8.   8.   8.
   8.   8.   8.   8.   8.   8.   8.   8.   8.   8.   8.   8.   8.   8.
   8.   8.   8.   8.   8.   8.   8.   8.]


In [31]:
def _show_frame(trial_number, step_number, frame):
    """Print the trial/step header and the rendered grid, then queue a clear of the cell output."""
    print(f'Trial {trial_number}')
    print(f'Step {step_number}')
    print(frame, end="")
    # wait=True defers the clear until the next output arrives, which avoids flicker
    clear_output(wait=True)


# Record of performance: number of steps taken in each trial
learning_curve = np.zeros(num_trials)
reward = 0  # reward of the most recent action, handed to the agent on its next call
for i_trial in range(num_trials):
    reward_trial = 0  # running total of the rewards earned in this trial
    discretized_observation = env.reset()
    for t in range(max_steps_per_trial):
        # Draw the state the agent is about to act on
        grid_ = grid(dimensions, env.position)
        _show_frame(i_trial + 1, t + 1, grid_)
        sleep(0.1)
        # Core of the interaction: deliberate on the observation, then act
        action = agent.deliberate_and_learn(discretized_observation, reward)
        discretized_observation, reward, done = env.move(action)
        reward_trial += reward
        if done:
            break
    learning_curve[i_trial] = t + 1
    # Draw the final state of the trial and hold it on screen a little longer
    grid_ = grid(dimensions, env.position)
    _show_frame(i_trial + 1, t + 1, grid_)
    sleep(1)

"""Return results"""
print(learning_curve)

KeyboardInterrupt: 

# Grid World - agente flexível

In [54]:
# Discover which agent and environment modules are available on disk
agent_list = os.listdir('agents')
environment_list = os.listdir('environments')

print(agent_list)
print(environment_list)

# Make the local modules importable. The membership check keeps the cell
# idempotent: re-running it does not stack duplicate entries on sys.path.
for module_dir in ('agents', 'environments'):
    if module_dir not in sys.path:
        sys.path.insert(0, module_dir)

['ps_agent_basic.py', 'ps_agent_flexible.py', '__pycache__']
['env_grid_world.py', 'env_invasion_game.py', 'env_invasion_game_lier.py', 'env_neverending_color.py', '__pycache__']


In [130]:
from time import sleep
from  IPython.display import clear_output
def grid(dimensions, position, old_positions=None, paredes=None):
    """Render the grid world as text with the agent, walls and a recent-path trail.

    Args:
        dimensions: two-element sequence; ``dimensions[0]`` cells are drawn per
            row and ``dimensions[1]`` rows are drawn.
        position: ``[x, y]`` coordinates of the agent, given as a NumPy array or
            as a plain sequence (list/tuple).
        old_positions: previously visited cells as ``[x, y]`` lists; only the
            last 20 entries are drawn. ``None`` (default) means no trail.
        paredes: wall cells as ``[x, y]`` lists. ``None`` (default) means no walls.

    Returns:
        A string with one line per row. Each cell is ``|X`` (agent), ``|█``
        (wall), ``|.`` (recently visited) or ``| `` (empty), checked in that
        order, and every row is closed with ``|`` and a newline.
    """
    # None defaults replace the mutable `[]` defaults of the signature
    walls = [] if paredes is None else paredes
    # The trail window does not depend on the cell, so slice it once up front
    trail = [] if old_positions is None else old_positions[-20:]

    rows = []
    for j in range(dimensions[1]):
        cells = []
        for i in range(dimensions[0]):
            cell = [i, j]
            # np.array_equal works for lists and tuples as well as arrays, whereas
            # `(cell == position).all()` raises AttributeError for a plain list.
            if np.array_equal(cell, position):
                cells.append('|X')
            elif cell in walls:
                cells.append('|█')
            elif cell in trail:
                cells.append('|.')
            else:
                cells.append('| ')
        rows.append(''.join(cells) + '|\n')
    return ''.join(rows)

In [137]:
# printando locais do mapa
from time import sleep
from  IPython.display import clear_output
# Build one text line per grid row, listing the [x,y] label of every cell in it
coordinate_rows = []
for j in range(dimensions[1]):
    row_labels = ''.join(f'[{i},{j}],' for i in range(dimensions[0]))
    coordinate_rows.append(row_labels + '\n')
grid_ = ''.join(coordinate_rows)

print(grid_)

[0,0],[1,0],[2,0],[3,0],[4,0],[5,0],[6,0],[7,0],[8,0],[9,0],
[0,1],[1,1],[2,1],[3,1],[4,1],[5,1],[6,1],[7,1],[8,1],[9,1],
[0,2],[1,2],[2,2],[3,2],[4,2],[5,2],[6,2],[7,2],[8,2],[9,2],
[0,3],[1,3],[2,3],[3,3],[4,3],[5,3],[6,3],[7,3],[8,3],[9,3],
[0,4],[1,4],[2,4],[3,4],[4,4],[5,4],[6,4],[7,4],[8,4],[9,4],
[0,5],[1,5],[2,5],[3,5],[4,5],[5,5],[6,5],[7,5],[8,5],[9,5],
[0,6],[1,6],[2,6],[3,6],[4,6],[5,6],[6,6],[7,6],[8,6],[9,6],
[0,7],[1,7],[2,7],[3,7],[4,7],[5,7],[6,7],[7,7],[8,7],[9,7],
[0,8],[1,8],[2,8],[3,8],[4,8],[5,8],[6,8],[7,8],[8,8],[9,8],
[0,9],[1,9],[2,9],[3,9],[4,9],[5,9],[6,9],[7,9],[8,9],[9,9],



In [147]:
# Grid-world environment paired with the flexible PS agent, both imported under generic aliases
import env_grid_world as environment_class
import ps_agent_flexible as agent_class

# Grid size handed to the environment and reused by grid() when drawing
dimensions = [10,10]
env = environment_class.TaskEnvironment(dimensions)

# Extra wall cells ([x, y] pairs) appended to whatever walls the environment already has.
# The commented-out lists below are alternative layouts kept for reference.
paredes = []
#paredes = [[3,3], [4,3], [5,3], [7,1], [7,2], [7,3], [7,4], [7,5], [7,9], [0,5], [0,6], [0,7], [2,6], [3,6], [4,6], [2,7], [4,7], [5,7], [6,7], [7,7], [7,6], [1,0], [1,1]]
#paredes = [[0,1],[1,1],[2,1],[3,1],[4,1],[5,1],[6,1],[7,1],[8,1], [2,3],[3,3],[4,3],[5,3],[6,3],[7,3],[8,3],[9,3], [0,5],[1,5],[2,5],[3,5],[4,5],[5,5],[6,5],[7,5],[8,5],[1,7],[2,7],[3,7],[4,7],[5,7],[6,7],[7,7],[8,7], [9,7], [0,9],[1,9],[2,9],[3,9],[4,9],[5,9],[6,9],[7,9],[8,9]]
env.walls = env.walls + paredes

#parameters for the agent - explanations can be found in the comments inside the agent file
gamma_damping = 0
eta_glow_damping = 0.12
policy_type = 'softmax'
beta_softmax = 1
num_reflections =  0

# Unlike the basic agent cells, no percept list is passed to this constructor
agent = agent_class.FlexiblePSAgent(
    env.num_actions,
    gamma_damping, eta_glow_damping, 
    policy_type, beta_softmax,
    num_reflections
)

"""Initialise and run interaction"""

#set number of trials and maximum number of steps in each trial
num_trials = 300
max_steps_per_trial = env.max_steps_per_trial  # the step cap is taken from the environment itself

# Draw the grid with the agent's current position and the walls, with no trail yet
print("{}".format(grid(dimensions, env.position, [], env.walls)), end="")

|X| | | | | | | | | |
|█|█|█|█|█|█|█|█|█| |
| | | | | | | | | | |
| | |█|█|█|█|█|█|█|█|
| | | | | | | | | | |
|█|█|█|█|█|█|█|█|█| |
| | | | | | | | | | |
| |█|█|█|█|█|█|█|█|█|
| | | | | | | | | | |
|█|█|█|█|█|█|█|█|█| |


In [148]:
# Record of performance: one entry per trial holding the number of steps the trial took
learning_curve = np.zeros(num_trials)
reward = 0 #temporarily stores the reward for the most recent action
for i_trial in range(num_trials):
    reward_trial = 0 #additive counter of the total rewards earned during the current trial (accumulated but not read in this cell)
    discretized_observation = env.reset()
    for t in range(max_steps_per_trial):
        #This is where the heart of the interaction takes place:
        #the agent is given the current observation plus the previous reward and returns an action
        action = agent.deliberate_and_learn(discretized_observation, reward)
        discretized_observation, reward, done = env.move(action)
        reward_trial += reward
        if done:
            break
    #t is the index of the last executed step, so t+1 is the number of steps taken
    learning_curve[i_trial] = t+1

"""Return results"""
print(learning_curve)

[ 1990.  1681.  8643.  2784.  3985.  7013.  2505.  6469.  1236.  3596.
  3049.  3121. 12641.  2129.  1137.  3643.   647.  3302.  2380.  1792.
  3019.  2262.  3206.  2029.  1632.  1961.  1360.  4143.  6725.  2444.
  5918.  1423.  1661.  3842.  3430.   671.  3962.   283.   651.   151.
  3384.  1565.  2411.  5221.  1728.   941.  4517.  1057.  1777.   252.
  1387.  1168.  8444.  4845.   658.   932.  1174.  2113.   546.   645.
  1138.  1009.  2479.   999.   520.  1684.  3724.  1127.   658.   969.
  2986.   621.   942.   333.  1749.  1274.  1220.   367.  1997.  1288.
   526.  2085.   754.   868.  1278.   696.   408.   760.   786.  2152.
  1436.  4777.  1230.   815.  1021.   653.   354.   985.  1241.  2275.
  1614.   496.  1620.  1104.  2152.   589.   470.   984.  1385.   912.
   551.   649.  2968.   513.  1427.  1429.   927.  1513.   384.  3432.
   697.  1471.   732.   357.   619.  3806.   835.  1523.  1741.   887.
   293.   277.   666.   428.   717.   838.   347.   251.   652.   978.
   981

In [149]:
# Record of performance: number of steps taken in each trial, animated while running
learning_curve = np.zeros(num_trials)
reward = 0  # reward of the most recent action, handed to the agent on its next call
for i_trial in range(num_trials):
    reward_trial = 0  # running total of the rewards earned in this trial
    discretized_observation = env.reset()
    old_positions = []  # cells visited so far in this trial, oldest first
    for t in range(max_steps_per_trial):
        # Replace the previous frame as soon as the new one starts printing
        clear_output(wait=True)
        print(f'Trial {i_trial+1}\nStep {t+1}')
        # Render before recording the current cell, so the trail holds only earlier cells
        grid_ = grid(dimensions, env.position, old_positions, env.walls)
        old_positions.append(list(env.position))
        print(grid_, end="")
        sleep(0.05)
        # Core of the interaction: deliberate on the observation, then act
        action = agent.deliberate_and_learn(discretized_observation, reward)
        discretized_observation, reward, done = env.move(action)
        reward_trial += reward
        if done:
            break
    learning_curve[i_trial] = t + 1
    # Hold the last frame of the trial on screen before the next trial overwrites it
    sleep(3)
    clear_output(wait=True)

"""Return results"""
print(learning_curve)

Trial 4
Step 24
|.|.|X| | | | | | | |
|█|█|█|█|█|█|█|█|█| |
| | | | | | | | | | |
| | |█|█|█|█|█|█|█|█|
| | | | | | | | | | |
|█|█|█|█|█|█|█|█|█| |
| | | | | | | | | | |
| |█|█|█|█|█|█|█|█|█|
| | | | | | | | | | |
|█|█|█|█|█|█|█|█|█| |


KeyboardInterrupt: 