In [1]:
"""Generate Sample Trajectories"""

from importlib import reload

import numpy as np

import flmdp
import policy_approximators

# Reload the project modules first so on-disk edits are picked up, and only
# THEN bind FLMDP. Binding it with `from flmdp import FLMDP` before the reload
# leaves the name pointing at the pre-reload class object, so the reload would
# have no effect on the code actually used below.
reload(flmdp)
reload(policy_approximators)
FLMDP = flmdp.FLMDP

# Fix the seed so Restart & Run All reproduces the same trajectories.
# NOTE(review): assumes the flmdp helpers draw from NumPy's global RNG -- confirm.
SEED = 0
np.random.seed(SEED)

# L-MDP params (forwarded by keyword to the FLMDP helpers below)
mag_S = 9
mag_A = 4
l = 4

# Deterministic initial state distribution: all probability mass on index 0
P0 = np.zeros(mag_S)
P0[0] = 1.0

P = FLMDP.random_P(mag_S=mag_S,
                   mag_A=mag_A,
                   l=l)

lmdp = FLMDP(mag_S=mag_S,
             mag_A=mag_A,
             P=P,
             P0=P0,
             l=l)

# Trajectory Params
T = 20
m = 100
Gamma = 0.9

# Behaviour policy (pi_b) and evaluation policy (pi_e)
pi_b = FLMDP.scips_approximable_pi(lmdp=lmdp,
                                   Gamma=Gamma,
                                   sigma=1)

pi_e = FLMDP.random_pi(mag_S=mag_S,
                       mag_A=mag_A,
                       l=l)

# Simulate under each policy; each call returns (states, actions, rewards).
# NOTE(review): the recorded output of this cell is an IndexError ("index 100
# is out of bounds for axis 0 with size 100"), which coincides with m = 100 --
# possibly an off-by-one inside a project helper; verify there.
s_b, a_b, r_b = lmdp.simulate(pi=pi_b,
                              T=T,
                              m=m)
s_e, a_e, r_e = lmdp.simulate(pi=pi_e,
                              T=T,
                              m=m)

# The original printed `s_t`, which is never defined in this notebook and
# raises NameError on a fresh kernel; show the behaviour-policy states instead.
print(s_b)

IndexError: index 100 is out of bounds for axis 0 with size 100

In [48]:
"""Create Policy Estimators"""

import pandas as pd

from policy_approximators import naive_approx, sparsity_corrected_approx

# Wrap the behaviour-policy samples (states, actions, rewards) in DataFrames,
# which is the form handed to both estimators below.
s_df, a_df, r_df = (pd.DataFrame(samples) for samples in (s_b, a_b, r_b))

# Naive Monte-Carlo Policy Estimator
hat_b = naive_approx(states=s_df, actions=a_df, rewards=r_df, l=l)

# Sparsity Corrected Policy Estimator
Gamma = 0.9
tilde_b = sparsity_corrected_approx(
    states=s_df, actions=a_df, rewards=r_df, l=l, Gamma=Gamma,
)

In [49]:
"""Evaluate Policy Estimators"""

from importlib import reload

import step_is as step_is_module

# Reload before binding the callable so on-disk edits to step_is are used.
reload(step_is_module)
step_is = step_is_module.step_is

gamma = 0.9

# Run step_is once per behaviour-policy candidate, on the same pi_b-generated
# samples each time: the true pi_b, then the estimates hat_b and tilde_b.
# The generator is consumed in order, so the calls happen in that sequence.
rho_pi, rho_hat, rho_tilde = (
    step_is(pi_b=behaviour_policy,
            pi_e=pi_e,
            state_samples=s_b,
            action_samples=a_b,
            reward_samples=r_b,
            l=l,
            gamma=gamma)
    for behaviour_policy in (pi_b, hat_b, tilde_b)
)

# One value per line, in the order: true policy, hat_b, tilde_b.
print(rho_pi, rho_hat, rho_tilde, sep="\n")

-3.4462512978052833
-0.21621007254419836
-8.649278300203472
