In [1]:
import numpy as np
import matplotlib.pyplot as plt

import sys
sys.path.append('../utils')
from GridWorld import get_standard_grid, get_negative_grid
from iterative_policy_eval import printValues, printPolicy
from sklearn.kernel_approximation import Nystroem, RBFSampler

# Discount factor applied to the next-state value when forming the TD target.
GAMMA = 0.9
# Step size used for the weight update of the value-function model.
ALPHA = 0.01
# Action labels; random exploration draws uniformly from this tuple.
ACTIONS = ('U', 'D', 'L', 'R')

In [3]:
def epsilon_greedy(greedy, s, eps=0.1):
    """Pick an action for state ``s`` using an epsilon-greedy rule.

    Args:
        greedy: mapping from state to the action the policy prescribes.
        s: current state; must be a key of ``greedy`` when the greedy
            branch is taken.
        eps: exploration threshold; the greedy action is returned when a
            uniform draw in [0, 1) falls below ``1 - eps``.

    Returns:
        ``greedy[s]`` on the greedy branch, otherwise an action drawn
        uniformly from ``ACTIONS``.
    """
    draw = np.random.random()
    exploit = draw < 1 - eps
    return greedy[s] if exploit else np.random.choice(ACTIONS)

def gather_samples(grid, n_episodes=10000):
    """Collect the states visited while acting uniformly at random.

    Each episode starts from ``grid.reset()`` and keeps taking random
    actions from ``ACTIONS`` until ``grid.game_over()`` is true. The start
    state and every state reached after a move are recorded.

    Args:
        grid: environment exposing ``reset``, ``game_over``, ``move`` and
            ``current_state``.
        n_episodes: number of random episodes to play.

    Returns:
        A flat list of visited states, in visiting order across all episodes.
    """
    visited = []
    for _episode in range(n_episodes):
        visited.append(grid.reset())
        while True:
            if grid.game_over():
                break
            # The reward returned by move() is irrelevant for sampling states.
            grid.move(np.random.choice(ACTIONS))
            visited.append(grid.current_state())
    return visited

In [4]:
class Model:
    """Linear state-value approximator over RBFSampler features.

    The value of a state is the dot product between its feature vector and
    the weight vector ``w``, so the gradient with respect to ``w`` is the
    feature vector itself.

    Attributes:
        featurizer: RBFSampler fitted on states gathered by ``gather_samples``.
        w: weight vector, initialised to zeros, with one entry per element
            of the featurizer's ``random_offset_``.
    """

    def __init__(self, grid):
        """Fit the featurizer on randomly visited states and zero the weights."""
        visited_states = gather_samples(grid)
        self.featurizer = RBFSampler().fit(visited_states)

        n_features = self.featurizer.random_offset_.shape[0]
        self.w = np.zeros(n_features)

    def _features(self, s):
        """Return the feature vector of the single state ``s``."""
        # transform() works on a batch, so wrap the state and unwrap the row.
        return self.featurizer.transform([s])[0]

    def predict(self, s):
        """Return the estimated value of state ``s`` under the current weights."""
        return self._features(s) @ self.w

    def grad(self, s):
        """Return the gradient of ``predict(s)`` with respect to ``w``."""
        return self._features(s)

In [None]:
# Environment whose state values are going to be estimated.
grid = get_standard_grid()

print("rewards:")
printValues(grid.rewards, grid)

# Fixed policy under evaluation: maps a state tuple to the action taken there
# (presumably (row, col) coordinates -- confirm against GridWorld).
greedy_policy = {
    (2, 0): 'U', (1, 0): 'U',
    (0, 0): 'R', (0, 1): 'R', (0, 2): 'R',
    (1, 2): 'R',
    (2, 1): 'R', (2, 2): 'R', (2, 3): 'U',
}

# Semi-gradient TD(0) evaluation of `greedy_policy` (followed epsilon-greedily)
# using the linear RBF model. For every transition (s, r, s2) the weights move
# along grad V(s) scaled by the TD error: target - V(s).
model = Model(grid)
mse_per_episode = []  # mean squared TD error of each episode

n_episodes = 20000
for it in range(n_episodes):
    # Lightweight progress report every 100 episodes.
    if (it + 1) % 100 == 0:
        print(it + 1)

    s = grid.reset()
    n_steps = 0
    episode_err = 0

    while not grid.game_over():
        a = epsilon_greedy(greedy_policy, s)
        r = grid.move(a)
        s2 = grid.current_state()

        # Evaluate V(s) with the *current* weights on every step. Carrying the
        # previous step's V(s2) forward would use a value computed before the
        # last weight update, and would reference an unassigned (or outdated)
        # value whenever the previous/first transition ended in a terminal state.
        Vs = model.predict(s)

        # Terminal states have value 0, so the target is just the reward.
        if grid.is_terminal(s2):
            target = r
        else:
            target = r + GAMMA * model.predict(s2)

        # Semi-gradient step: the target is treated as a constant.
        err = target - Vs
        model.w += ALPHA * err * model.grad(s)

        # Accumulate the squared TD error for the per-episode average.
        n_steps += 1
        episode_err += err * err

        s = s2

    mse = episode_err / n_steps
    mse_per_episode.append(mse)

# Learning curve: mean squared TD error of each training episode.
# Axis labels are included so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(mse_per_episode)
ax.set_title("MSE per episode")
ax.set_xlabel("episode")
ax.set_ylabel("mean squared TD error")
plt.show()

# Tabulate the learned value of every state. States that have no entry in
# grid.actions are treated as terminal and pinned to 0.
states = grid.all_states()
V = {}
for s in states:
    V[s] = model.predict(s) if s in grid.actions else 0

print("values:")
printValues(V, grid)
print("policy:")
printPolicy(greedy_policy, grid)