In [12]:
"""Generate Sample Trajectories"""

import numpy as np
from itertools import product
from importlib import reload

import flmdp
from flmdp import FLMDP
import policy_approximators
from policy_approximators import naive_approx
from policy_approximators import sparsity_corrected_approx
import step_is as step_is_module
from importlib import reload

from math import sqrt

reload(flmdp)
reload(policy_approximators)
reload(step_is_module)
step_is = step_is_module.step_is


# L-MDP params
mag_S = 9
mag_A = 4
l = 4

# Deterministic initial state distribution
P0 = np.zeros((mag_S))
P0[0] = 1.0

# Trajectory Params
T = 20
m = 100
Gamma = 0.9
gamma = 0.9

rmses_naive = list()
rmses_sc = list()
perfect_returns = list()
naive_returns = list()
sc_returns = list()
true_returns = list()

for iteration in range(1000):
    P = FLMDP.random_P(mag_S=mag_S,
                      mag_A=mag_A,
                      l=l)
    lmdp = FLMDP(mag_S=mag_S,
                 mag_A=mag_A,
                 P=P,
                 P0=P0,
                 l=l)
    
    pi_b = FLMDP.scips_approximable_pi(lmdp=lmdp,
                                       Gamma=Gamma,
                                       sigma=0)
    pi_e = FLMDP.random_pi(lmdp=lmdp)

    s_b, a_b, r_b = lmdp.simulate(pi=pi_b,
                                  T=T, 
                                  m=m)
    s_e, a_e, r_e = lmdp.simulate(pi=pi_e,
                                  T=T, 
                                  m=m)
    
    # Naive Monte-Carlo Policy Estimator
    hat_b = naive_approx(states=s_b,
                         actions=a_b,
                         rewards=r_b,
                         l=l)

    # Sparsity Corrected Policy Estimator
    tilde_b = sparsity_corrected_approx(states=s_b,
                                        actions=a_b,
                                        rewards=r_b,
                                        Gamma=Gamma,
                                        lmdp=lmdp)
    

    rho_pi = step_is(pi_b=pi_b, 
                     pi_e=pi_e,
                     state_samples=s_b,
                     action_samples=a_b,
                     reward_samples=r_b,
                     l=l,
                     gamma=gamma)
    rho_hat = step_is(pi_b=hat_b,
                      pi_e=pi_e,
                      state_samples=s_b,
                      action_samples=a_b,
                      reward_samples=r_b,
                      l=l,
                      gamma=gamma)
    rho_tilde = step_is(pi_b=tilde_b,
                        pi_e=pi_e,
                        state_samples=s_b,
                        action_samples=a_b,
                        reward_samples=r_b,
                        l=l,
                        gamma=gamma)
    
    # All the data is there, now to turn it into statistics
    
    # First we compute the RMSE of the two approximations
    rmse_naive = 0.0
    rmse_sc = 0.0
    for action in range(mag_A):
        for s1, s2, s3, s4 in product(range(mag_S), range(mag_S), range(mag_S), range(mag_S)):
            rmse_naive += (pi_b[s1,s2,s3,s4,action] - hat_b.get((s1,s2,s3,s4,action),0))**2
            rmse_sc += (pi_b[s1,s2,s3,s4,action] - tilde_b.get((s1,s2,s3,s4,action),0))**2
    rmses_naive.append(sqrt(rmse_naive))
    rmses_sc.append(sqrt(rmse_sc))
    
    # Then we record the estimated return from the three methods
    perfect_returns.append(rho_pi)
    naive_returns.append(rho_hat)
    sc_returns.append(rho_tilde)
    
    # Now let's estimate the true return from our sample.
    total_returns = 0.0
    for episode in range(r_e.shape[0]):
        r = 0.0
        for time in range(r_e.shape[1]):
            r += (gamma ** time) + r_e[episode, time]
        total_returns += r
    average_return = total_returns / r_e.shape[0]
    true_returns.append(average_return)

In [16]:
import pickle
pickle.dump(rmses_naive, open('rmses_native.pickle', 'wb'))
pickle.dump(rmses_sc, open('rmses_sc.pickle', 'wb'))
pickle.dump(perfect_returns, open('perfect_returns.pickle', 'wb'))
pickle.dump(naive_returns, open('naive_returns.pickle', 'wb'))
pickle.dump(sc_returns, open('sc_returns.pickle', 'wb'))
pickle.dump(true_returns, open('true_returns.pickle', 'wb'))

In [18]:
a = pickle.load(open('rmses_native.pickle', 'r'))
b = pickle.load(open('rmses_sc.pickle', 'r'))
c = pickle.load(open('perfect_returns.pickle', 'r'))
d = pickle.load(open('naive_returns.pickle', 'r'))
e = pickle.load(open('sc_returns.pickle', 'r'))
f = pickle.load(open('true_returns.pickle', 'r'))

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte