In [None]:
import gym
from adversarialgrid import AdversarialGrid

# Smoke-test the environment by driving it with uniformly sampled actions.
env = AdversarialGrid()

for i_episode in range(20):
    observation = env.reset()
    for t in range(100):  # hard cap of 100 steps per episode
        env.render()
        print(observation)
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        print(observation, reward, done, info)
        if not done:
            continue
        # Episode ended: report its length (t is zero-based) and move on.
        print("Episode finished after {} timesteps".format(t+1))
        break

In [None]:
# A tabular Q-learning agent
import gym
import gym.spaces.discrete as discrete
from collections import defaultdict
import numpy as np

#print(discrete.Discrete)

class UnsupportedSpace(Exception):
    """Raised when TabularQAgent is given a space that is not Discrete."""


class TabularQAgent(object):
    """
    Agent implementing tabular Q-learning with an epsilon-greedy policy.

    Q-values live in ``self.q``, a ``defaultdict`` keyed by observation whose
    values are arrays of length ``action_n`` (one entry per action). A row is
    created lazily the first time an observation is looked up, drawn from a
    normal distribution with mean ``init_mean`` and std ``init_std``.

    Config keys (override any of them via keyword arguments to ``__init__``):
        init_mean, init_std -- distribution of freshly created Q rows
        learning_rate       -- step size of the Q update
        eps                 -- default exploration probability used by ``act``
        discount            -- discount factor applied to the bootstrap value
        n_iter              -- number of environment steps taken by ``learn``
    """

    def __init__(self, observation_space, action_space, **userconfig):
        """
        Args:
            observation_space: must be a ``discrete.Discrete`` instance.
            action_space: must be a ``discrete.Discrete`` instance; its ``n``
                fixes the width of every Q row.
            **userconfig: overrides merged into the default config.

        Raises:
            UnsupportedSpace: if either space is not ``discrete.Discrete``.
        """
        if not isinstance(observation_space, discrete.Discrete):
            raise UnsupportedSpace('Observation space {} incompatible with {}. (Only supports Discrete observation spaces.)'.format(observation_space, self))
        if not isinstance(action_space, discrete.Discrete):
            raise UnsupportedSpace('Action space {} incompatible with {}. (Only supports Discrete action spaces.)'.format(action_space, self))
        self.observation_space = observation_space
        self.action_space = action_space
        self.action_n = action_space.n
        self.config = {
            "init_mean" : 0.0,      # Initialize Q values with this mean
            "init_std" : 0.0,       # Initialize Q values with this standard deviation
            "learning_rate" : 0.1,
            "eps": 0.05,            # Epsilon in epsilon greedy policies
            "discount": 0.95,
            "n_iter": 10000}        # Number of iterations
        self.config.update(userconfig)
        # The lambda reads self.config at lookup time, so later config changes
        # affect rows that have not been created yet.
        self.q = defaultdict(lambda: self.config["init_std"] * np.random.randn(self.action_n) + self.config["init_mean"])

    def act(self, observation, eps=None):
        """
        Choose an action for ``observation`` with an epsilon-greedy rule.

        Args:
            observation: key into the Q table (must be hashable).
            eps: exploration probability; ``None`` falls back to
                ``self.config["eps"]``.

        Returns:
            The argmax of the observation's Q row when the random draw exceeds
            ``eps``, otherwise ``self.action_space.sample()``.
        """
        if eps is None:
            eps = self.config["eps"]
        # epsilon greedy.
        action = np.argmax(self.q[observation]) if np.random.random() > eps else self.action_space.sample()
        return action

    def learn(self, env, verbose=True):
        """
        Run ``config["n_iter"]`` environment steps of Q-learning on ``env``.

        Each step applies
            Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
        where the bootstrap term is dropped on terminal transitions.

        Args:
            env: object with ``reset()`` returning an observation and
                ``step(action)`` returning a 4-tuple
                ``(observation, reward, done, info)``.
            verbose: when true (the default), print every transition and the
                updated Q-value; pass ``False`` to silence the per-step output.
        """
        config = self.config
        q = self.q
        obs = env.reset()
        for _ in range(config["n_iter"]):
            action = self.act(obs)
            obs2, reward, done, _info = env.step(action)
            # A terminal transition has no successor value to bootstrap from.
            future = 0.0 if done else np.max(q[obs2])

            if verbose:
                print("s, a, r, s': {}, {}, {}, {}".format(obs, action, reward, obs2))

            # Q(s,a) = Q(s,a) + alpha(r + gamma* max_{a'}Q(s',a') - Q(s,a))
            td_target = reward + config["discount"] * future
            newq = q[obs][action] + config["learning_rate"] * (td_target - q[obs][action])
            if verbose:
                print("Q(s,a) <- {}".format(newq))

            q[obs][action] = newq

            # Start a new episode once the current one has ended; otherwise the
            # agent would keep stepping a finished environment and treat the
            # terminal observation as the start of further transitions.
            obs = env.reset() if done else obs2


In [None]:
# testing the tabular q agent:
import gym
from pprint import pprint
from adversarialgrid import AdversarialGrid

# Build a fresh environment and a Q-learning agent for it. The keyword
# overrides replace the agent's default "eps" and "init_mean" config entries.
env = AdversarialGrid()
agent_overrides = {"eps": 0.9, "init_mean": 1}
agent = TabularQAgent(env.observation_space, env.action_space, **agent_overrides)

# Train for the agent's configured number of iterations, then dump the
# learned Q table.
agent.learn(env)
pprint(agent.q)


In [None]:
# Test: roll out the learned policy greedily.
# The agent in this notebook is constructed with eps=0.9 for training, so
# calling agent.act(observation) without an explicit eps would pick a random
# action 90% of the time and would not reflect what was learned. Passing
# eps=0.0 makes act() take the argmax of the Q row for every step.
for i_episode in range(20):
    observation = env.reset()
    for t in range(100):  # cap episode length so a looping greedy policy still terminates
        env.render()
        action = agent.act(observation, eps=0.0)
        observation, reward, done, info = env.step(action)
        print(observation, reward, done, info)
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break