In [1]:
import numpy as np
import itertools
import matplotlib.pyplot as plt
import numpy.random as rnd

We want to construct Bellman error matrices, as defined in [Contextual Decision Processes with Low Bellman Rank are PAC-Learnable](https://arxiv.org/abs/1610.09512).

In [2]:
# Problem size; these notebook-level globals are read by the cells below.
n_states = 2
n_actions = 2

In [3]:
def generate_rnd_problem(n_states, n_actions):
    """
    Sample a random row-stochastic matrix P and a random vector r.

    Both arrays have one row per (state, action) pair, i.e.
    n_states x n_actions rows in total.

    Args:
        n_states (int): the number of states
        n_actions (int): the number of actions

    Returns:
        (array[n_states x n_actions, n_states], array[n_states x n_actions, 1]):
            P, with non-negative rows that each sum to one, and r, with
            entries drawn uniformly from [0, 1).
    """
    n_pairs = n_states * n_actions

    # Squaring the uniform draws pushes the weights towards zero before each
    # row is rescaled to sum to one.
    weights = rnd.random((n_pairs, n_states)) ** 2
    row_totals = weights.sum(axis=1, keepdims=True)

    rewards = rnd.random((n_pairs, 1))
    return weights / row_totals, rewards

# Draw one random problem instance and display the shapes of its arrays.
P, r = generate_rnd_problem(n_states, n_actions)
P.shape, r.shape

((4, 2), (4, 1))

In [5]:
# def low_rank_approx(M, r=1):
#     n, m = M.shape
#     u, s, vT = np.linalg.svd(M, full_matrices=False)
#     A = np.zeros((n, m))
#     for i in range(r):
#         A += s[i] * np.outer(u.T[i], vT[i])
#     return A

# low_rank_approx()

In [6]:
# Grid of candidate function values: 5 evenly spaced points from 0 to 1 inclusive.
vals = np.linspace(0.0, 1.0, 5)
print(vals)

[0.   0.25 0.5  0.75 1.  ]


In [7]:
# Function class
# $f \in \{X \times A \to [0, 1]\}$
# Each f is stored as a tuple with one entry per (state, action) pair, and
# every entry is taken from the grid `vals`.
fn_class = list(itertools.product(vals, repeat=n_states * n_actions))
len(fn_class)

625

In [8]:
def generate_Mpi(n_states, n_actions, ps):
    """
    Build the block diagonal |S| x |S||A| matrix M that represents a policy.

    Row s of M is zero everywhere except in the |A| columns starting at
    s x |A|, which hold the action probabilities of state s.

    Args:
        n_states (int): the number of states
        n_actions (int): the number of actions
        ps (array[n_states, n_actions]): the probabilities of taking action a in state s.

    Returns:
        (array[n_states, n_states x n_actions])
    """
    # kron(I, 1^T) equals one exactly on the block diagonal and zero elsewhere.
    block_pattern = np.kron(np.eye(n_states), np.ones((1, n_actions)))
    rows, cols = np.nonzero(block_pattern == 1)

    M_pi = np.zeros((n_states, n_states * n_actions))
    # nonzero() lists positions in row-major order, matching the flattened ps.
    M_pi[rows, cols] = ps.reshape(-1)
    return M_pi

def pi(M_pi, s, a):
    """
    Look up pi(a|s) in a policy matrix.

    Let state s be indexed by i and the action a be indexed by j.
    Then we have that M[i, i x |A| + j] = pi(a|s)

    The number of actions |A| is recovered from the shape of M_pi
    (|S| rows, |S| x |A| columns) instead of being read from a notebook
    global, so the lookup is correct for a matrix built with any number of
    actions.

    Args:
        M_pi (array[n_states, n_states x n_actions]): block diagonal policy matrix
        s (int): the index of the state
        a (int): the index of the action

    Returns:
        The entry of M_pi holding the probability of action a in state s.
    """
    n_rows, n_cols = M_pi.shape
    actions_per_state = n_cols // n_rows
    return M_pi[s, s * actions_per_state + a]

def normalise(x, axis):
    """
    Rescale x so that its entries sum to one along `axis`.

    Args:
        x (array): the values to rescale
        axis (int): the axis along which the result should sum to one

    Returns:
        (array): x divided by its sum along `axis`
    """
    # keepdims lets the totals broadcast back against x.
    totals = np.sum(x, axis=axis, keepdims=True)
    return x / totals

def exp_dist(x, lambda_=3.5):  # why 3.5!?
    """
    Evaluate lambda_ * exp(-lambda_ * x), the exponential density with rate lambda_.

    Args:
        x (float or array): the point(s) at which to evaluate
        lambda_ (float): the rate parameter

    Returns:
        (float or array): same shape as x
    """
    decay = np.exp(-lambda_ * x)
    return lambda_ * decay

def uniform_simplex(shape):
    """
    Sample points uniformly at random from the probability simplex.

    Each row of the result is one sample: non-negative entries summing to one.
    The samples are obtained by normalising i.i.d. Exponential(1) variates,
    which is equivalent to drawing from a flat Dirichlet distribution.
    Note that evaluating the exponential *density* at uniform draws and
    normalising does not give a uniform sample; exponential variates are needed.

    Args:
        shape (tuple[int, int]): (number of samples, dimension of each sample)

    Returns:
        (array[shape]): every row sums to one
    """
    # https://cs.stackexchange.com/questions/3227/uniform-sampling-from-a-simplex
    # http://www.cs.cmu.edu/~nasmith/papers/smith+tromble.tr04.pdf
    draws = rnd.exponential(scale=1.0, size=shape)
    return draws / np.sum(draws, axis=1, keepdims=True)

def generate_rnd_policy(n_states, n_actions):
    """
    Sample a random policy and return it in block diagonal matrix form.

    Args:
        n_states (int): the number of states
        n_actions (int): the number of actions

    Returns:
        (array[n_states, n_states x n_actions])
    """
    # One probability vector over actions per state.
    action_probs = uniform_simplex((n_states, n_actions))
    return generate_Mpi(n_states, n_actions, action_probs)



In [None]:


def bellman_error(f, pi):
    """
    Presumably the expected Bellman error of `f` under the state distribution induced by `pi`.

    NOTE(review): this looks like unfinished pseudocode. It reads `d_0`, `P`,
    `s`, `r`, `discounted_state_visitation_distribution` and `greedy_policy`
    from the enclosing scope; `d_0`, `s` and the two helper functions are not
    defined in the visible part of the notebook. It also calls `f` and `pi`
    as functions, whereas the members of `fn_class` are tuples -- confirm the
    intended representation before using this.

    Args:
        f: called as f(s, action) -- TODO confirm the expected type
        pi: called as pi(s); note this parameter shadows the module-level
            function `pi(M_pi, s, a)` inside this body

    Returns:
        np.dot(d_pi, deltas.T)
    """
    d_pi = discounted_state_visitation_distribution(d_0, pi, P)  # the discounted distribution over states
    # NOTE(review): the last term is evaluated at the same `s` as the first, and
    # no discount factor appears here -- confirm against the intended definition.
    deltas = f(s, pi(s)) - r - f(s, greedy_policy(f))
    return np.dot(d_pi, deltas.T)  # the expectation

def bellman_error_matrix(F, P, r, t):
    """
    Tabulate the Bellman error for every ordered pair of functions in F.

    Entry [i, j] is bellman_error(f, greedy_policy(g)), where f is the i-th
    and g the j-th element of F.

    Args:
        F (sequence): the function class; must support len() and iteration
        P: accepted but not used by this implementation
        r: accepted but not used by this implementation
        t: accepted but not used by this implementation

    Returns:
        (array[len(F), len(F)])
    """
    size = len(F)
    errors = np.zeros((size, size))
    # product(..., repeat=2) visits the (row, col) pairs in row-major order.
    for (row, f), (col, g) in itertools.product(enumerate(F), repeat=2):
        errors[row, col] = bellman_error(f, greedy_policy(g))
    return errors