In [1]:
import numpy as np
import sympy as sym

In [2]:
def mdp_encoder(P, r):
    """
    Embed an MDP into a linearly solvable MDP, one state at a time.

    For every state x this solves (via a pseudo-inverse, i.e. in the
    least-squares sense)
        r(x, a) = q(x) + E_x'~p(x' | x, a) log( p(x' | x, a) / p(x' | x))
    for the uncontrolled distribution p(. | x) and the pseudo reward q(x).
    See supplementary material of todorov 2009 for more info.

    Args:
        P (np.array): The transition function, indexed here as P[x', x, a].
            shape = [n_states, n_states, n_actions]
        r (np.array): The reward function, indexed here as r[x, a].
            shape = [n_states, n_actions]

    Returns:
        p (np.array): the uncontrolled transition matrix. Column i holds
            p(. | x=i). shape = [n_states, n_states]
        q (np.array): the pseudo reward. shape = [1, n_states]

    Needs to be solved for every state.
    """
    # QUESTION Are there similarities between the action embeddings in each state?
    # QUESTION How does this embedding change the set of policies that can be
    # represented? Does this transformation preserve the optima?

    # Guards log(0) for transitions with zero probability.
    log_eps = 1e-8

    def embed_state(idx_x):
        """
        Solve the embedding for the single state `idx_x`.

        Returns the uncontrolled distribution p(. | x) as a 1-D array and the
        pseudo reward wrapped in a one-element list (so that stacking the
        per-state results yields a [1, n_states] array).
        """
        # D[x', a] = p(x' | x, a). Note this is the transpose of the D_ax'
        # used in the derivation below.
        D = P[:, idx_x, :]

        # b_a = r(x, a) - sum_x' p(x' | x, a) log p(x' | x, a)
        b = r[idx_x] - np.sum(D * np.log(D + log_eps), axis=0)

        # Derivation: D_ax' (q.1 - m) = b, with c = q.1 - m and m = log p(. | x).
        # Because pinv(D.T) == pinv(D).T, `b @ pinv(D)` is the same vector as
        # `pinv(D_ax') @ b`.
        c = np.dot(b, np.linalg.pinv(D))

        # p must sum to one, so q = -log(sum(exp(-c))). Evaluate the
        # log-sum-exp with the maximum shifted out: exp(-c) overflows to inf
        # for strongly negative c, which would give q = -inf and p = 0.
        neg_c = -c
        shift = np.max(neg_c)
        log_z = shift + np.log(np.sum(np.exp(neg_c - shift)))
        q = -log_z

        # m = q - c, p = exp(m)
        p = np.exp(neg_c - log_z)

        return p, [q]

    # TODO, should be able to solve these in parallel.
    # maybe even share some knowledge???!
    pnqs = [embed_state(i) for i in range(P.shape[0])]
    return tuple([np.stack(val, axis=1) for val in zip(*pnqs)])

In [10]:
# One free symbol per entry of a 2x2x2 symbolic transition tensor.
(
    p000, p001, p010, p011,
    p100, p101, p110, p111,
) = sym.symbols('p000, p001, p010, p011, p100, p101, p110, p111')
# Flat list reshaped row-major, so P[i, j, k] is the symbol p<i><j><k>.
P = sym.Array([p000, p001, p010, p011, p100, p101, p110, p111], (2, 2, 2))

In [18]:
# One free symbol per entry of a 2x2 symbolic reward matrix.
(r00, r01,
 r10, r11) = sym.symbols('r00, r01, r10, r11')
# Flat list laid out row-major, so r[i, j] is the symbol r<i><j>.
r = sym.Matrix(2, 2, [r00, r01, r10, r11])

[p000, p001, p010, p011, p100, p101, p110, p111]