In [3]:
def run(agent, env, episodes=20, episode_duration=100, render=True, verbose=False):
    """
    Runs a reinforcement learning experiment.

    For each episode the environment is reset and the agent is queried for an
    action at every step until the environment signals `done` or the step
    limit is reached. A summary line is printed only for episodes that end
    with `done`; episodes that hit the step limit finish silently.

    :param agent: implements the method act(observation)
    :param env: an instance of gym.Env (must provide reset(), step(action)
        returning (observation, reward, done, info), and render())
    :param episodes: number of episodes to run
    :param episode_duration: maximum number of steps of each episode
    :param render: render each step?
    :param verbose: print transition information?
    :return: None
    """
    for ep in range(episodes):
        observation = env.reset()
        acc_reward = 0
        if render:
            env.render()  # show initial state

        # Honour the caller-supplied step limit (previously hard-coded to 100,
        # which silently ignored the `episode_duration` argument).
        for t in range(episode_duration):
            action = agent.act(observation)
            observation, reward, done, info = env.step(action)
            acc_reward += reward

            if verbose:
                print(observation, reward, done, info)
            if render:
                env.render()

            if done:
                print("Episode {} finished after {} timesteps w/ total reward {}".format(ep+1, t+1, acc_reward))
                break


In [None]:
# Test a random agent against a fixed opponent that always plays NOOP.
# Requires `run` to have been defined by an earlier cell.
import gym
import gym_adversarialgrid.envs.adversarialgrid as adversarialgrid
# Imported as `adversary` rather than `agent` so that the name `agent` refers
# only to the agent instance below, never to the module.
import gym_adversarialgrid.agents.adversary as adversary


env = adversarialgrid.AdversarialGrid(opponent='Fixed', action=adversarialgrid.NOOP, map='3x4')

agent = adversary.Random(env.observation_space, env.action_space)

run(agent, env, render=False)


In [None]:
# Tabular Q agent vs. a fixed opponent that always plays NOOP.
# Uses `adversarialgrid` and `run`, which this cell does not define itself;
# they must already exist in the kernel from earlier cells.
from pprint import pprint
import gym_adversarialgrid.agents.tabular as tabular

env = adversarialgrid.AdversarialGrid(opponent='Fixed', action=adversarialgrid.NOOP, map='3x4')
agent = tabular.TabularQAgent(env.observation_space, env.action_space, eps=0.1, init_mean=1)

# Train. The second argument (10000) is presumably the training budget
# (steps or episodes) -- TODO confirm against TabularQAgent.train.
agent.train(env, 10000)
# Uncomment to inspect the learned Q-table:
#pprint(agent.q)

# Test: eps is set to 0 before evaluation (presumably disabling
# epsilon-greedy exploration so that the agent acts greedily).
agent.config['eps'] = 0 # all greedy
run(agent, env, render=False)

In [None]:
# Tabular Q agent vs. a fixed opponent that always plays DEFLECT.
# Uses `adversarialgrid`, `tabular` and `run`, none of which are imported or
# defined in this cell; they must already exist in the kernel from earlier cells.
env = adversarialgrid.AdversarialGrid(opponent='Fixed', action=adversarialgrid.DEFLECT, map='3x4')
agent = tabular.TabularQAgent(env.observation_space, env.action_space, eps=0.1, init_mean=1)

# Train. The second argument (10000) is presumably the training budget
# (steps or episodes) -- TODO confirm against TabularQAgent.train.
agent.train(env, 10000)
# Uncomment to inspect the learned Q-table:
#pprint(agent.q)

# Test: eps is set to 0 before evaluation (presumably disabling
# epsilon-greedy exploration so that the agent acts greedily).
agent.config['eps'] = 0 # all greedy
# Alternative call with rendering and per-step transition printing:
#run(agent, env, render=True, verbose=True)
run(agent, env, render=False, verbose=False)

In [None]:
# Tabular Q agent vs. an opponent created with opponent='Random'.
# Requires `run` to have been defined by an earlier cell.
import gym_adversarialgrid.envs.adversarialgrid as adversarialgrid
import gym_adversarialgrid.agents.tabular as tabular

env = adversarialgrid.AdversarialGrid(opponent='Random', map='3x4')
agent = tabular.TabularQAgent(env.observation_space, env.action_space, eps=0.1, init_mean=1)

# Train with a 10x larger budget (100000) than the fixed-opponent cells use.
# The unit (steps or episodes) is not visible here -- TODO confirm against
# TabularQAgent.train.
agent.train(env, 100000)
# Uncomment to inspect the learned Q-table (needs `pprint` imported):
#pprint(agent.q)

# Test: eps is set to 0 before evaluation (presumably disabling
# epsilon-greedy exploration so that the agent acts greedily).
agent.config['eps'] = 0 # all greedy
# Print the agent's greedy policy using the environment's own formatter.
env.print_deterministic_policy(agent.greedy_policy())
# Alternative call with rendering and per-step transition printing:
#run(agent, env, render=True, verbose=True)
run(agent, env, render=False, verbose=False)

In [5]:
# SG-Exp3 agent vs. a fixed opponent that always plays NOOP.
# Requires `run` to have been defined by an earlier cell.
import gym_adversarialgrid.envs.adversarialgrid as adversarialgrid
import gym_adversarialgrid.agents.sgexp3 as sgexp3

env = adversarialgrid.AdversarialGrid(opponent='Fixed', map='3x4', action=adversarialgrid.NOOP)
agent = sgexp3.SGExp3(env.observation_space, env.action_space, gamma=0.2)

# Train. The second argument (1000) is presumably the training budget
# (steps or episodes) -- TODO confirm against SGExp3.train.
agent.train(env, 1000)
# Uncomment to inspect the agent's `q` attribute (needs `pprint` imported):
#pprint(agent.q)

# Test.
# NOTE(review): this agent is constructed with `gamma`, not `eps`; the line
# below mirrors the tabular-Q cells and may have no effect for SGExp3 --
# confirm that SGExp3 actually reads config['eps'].
agent.config['eps'] = 0 # all greedy
# Uncomment to print the greedy policy (if SGExp3 provides greedy_policy()):
#env.print_deterministic_policy(agent.greedy_policy())
# Same call as the active one below, kept commented out in the original:
#run(agent, env, render=True, verbose=True)
run(agent, env, render=True, verbose=True)

  pi_s = [((1 - g) * value / sum_weights) + (g / n) for a, value in enumerate(self.q[s])]



______
| HG |
|[41mS[0m   |
|    |
‾‾‾‾‾‾

(1, 0) 0 False {'a_name': 'x', 'a_idx': 4, 'r_name': 'x', 'o_idx': 0, 'tile': 'S', 'o_name': 'No-op', 'r_idx': 4}
  (x)
______
| HG |
|[41mS[0m   |
|    |
‾‾‾‾‾‾

(1, 0) 0 False {'a_name': 'x', 'a_idx': 4, 'r_name': 'x', 'o_idx': 0, 'tile': 'S', 'o_name': 'No-op', 'r_idx': 4}
  (x)
______
| HG |
|[41mS[0m   |
|    |
‾‾‾‾‾‾

(1, 0) 0 False {'a_name': 'x', 'a_idx': 4, 'r_name': 'x', 'o_idx': 0, 'tile': 'S', 'o_name': 'No-op', 'r_idx': 4}
  (x)
______
| HG |
|[41mS[0m   |
|    |
‾‾‾‾‾‾

(1, 0) 0 False {'a_name': 'x', 'a_idx': 4, 'r_name': 'x', 'o_idx': 0, 'tile': 'S', 'o_name': 'No-op', 'r_idx': 4}
  (x)
______
| HG |
|[41mS[0m   |
|    |
‾‾‾‾‾‾

(1, 0) 0 False {'a_name': 'x', 'a_idx': 4, 'r_name': 'x', 'o_idx': 0, 'tile': 'S', 'o_name': 'No-op', 'r_idx': 4}
  (x)
______
| HG |
|[41mS[0m   |
|    |
‾‾‾‾‾‾

(1, 0) 0 False {'a_name': 'x', 'a_idx': 4, 'r_name': 'x', 'o_idx': 0, 'tile': 'S', 'o_name': 'No-op', 'r_idx': 4}
  (x)
______
| 