# Analysis of Behavior Policy Estimators

## File IO

In [87]:
import pandas as pd
from collections import deque
from itertools import islice, product
from numpy import unique

l = 3
Gamma = 0.9

In [16]:
actions_path = "./output/test_actions.csv"
actions = pd.read_csv(actions_path, header=None)
states_path = "./output/test_states.csv"
states = pd.read_csv(states_path, header=None)
rewards_path = "./output/test_rewards.csv"
rewards = pd.read_csv(rewards_path, header=None)

In [20]:
T = actions.shape[1]
m = actions.shape[0]
assert(rewards.shape == actions.shape)
assert(states.shape[0] == m)
assert(states.shape[1] - 1 == T)

In [55]:
list(states.iterrows())

[(0, 0    1
  1    2
  2    3
  3    1
  4    5
  5    4
  6    2
  Name: 0, dtype: int64), (1, 0    3
  1    2
  2    1
  3    5
  4    3
  5    2
  6    3
  Name: 1, dtype: int64), (2, 0    5
  1    3
  2    4
  3    5
  4    2
  5    3
  6    1
  Name: 2, dtype: int64)]

## Naive Estimator

In [74]:
history_counts = {}
history_action_counts = {}
history = deque((0,) * l, maxlen=l)
for i, trajectory in states.iterrows():
    for j, state in islice(trajectory.items(), 0, T):
        history.appendleft(state);
        history_counts[tuple(history)] = history_counts.get(tuple(history), 0) + 1
        context = (tuple(history), actions[j][i])
        history_action_counts[context] = history_action_counts.get(context, 0) + 1
        
naive_history_action_probabilities = {}
for (history, action), count in history_action_counts.items():
    naive_history_action_probabilities[(history, action)] = float(count) / float(history_counts[history])

In [76]:
naive_history_action_probabilities

{((1, 0, 0), 1): 1.0,
 ((2, 1, 0), 2): 1.0,
 ((3, 2, 1), 3): 1.0,
 ((1, 3, 2), 1): 1.0,
 ((5, 1, 3), 2): 1.0,
 ((4, 5, 1), 1): 1.0,
 ((3, 4, 5), 3): 1.0,
 ((2, 3, 4), 3): 1.0,
 ((1, 2, 3), 3): 1.0,
 ((5, 1, 2), 3): 1.0,
 ((3, 5, 1), 3): 1.0,
 ((2, 3, 5), 3): 1.0,
 ((5, 2, 3), 2): 1.0,
 ((3, 5, 2), 3): 1.0,
 ((4, 3, 5), 1): 1.0,
 ((5, 4, 3), 3): 1.0,
 ((2, 5, 4), 2): 1.0,
 ((3, 2, 5), 1): 1.0}

## SC estimator

In [122]:
positional_state_counts = {}
positional_state_action_counts = {}
history = deque((0,) * l, maxlen=l)
for i, trajectory in states.iterrows():
    for j, state in islice(trajectory.items(), 0, T):
        history.appendleft(state);
        for k, state_in_history in enumerate(history):
            state_context = (state_in_history, k)
            positional_state_counts[state_context] = positional_state_counts.get(state_context, 0) + 1
            state_action_context = (state_in_history, k, actions[j][i])
            positional_state_action_counts[state_action_context] = positional_state_action_counts.get(state_action_context, 0) + 1
        
def sc_probability(history, action):
    probability = 0
    for i in range(l):
        if positional_state_counts.get((history[i], i), 0.0) > 0.0:
            probability += Gamma**i * positional_state_action_counts.get((history[i], i, action), 0.0) \
                           / positional_state_counts.get((history[i], i), 0.0)
    return probability * (1.0 - Gamma) / (1.0 - Gamma**(l))
        
sc_history_action_probabilities = {}
for history, action in product(product(list(unique(states.values)) + [0], repeat=l), unique(actions.values)):
    sc_history_action_probabilities[history, action] = sc_probability(history, action)

In [123]:
sc_history_action_probabilities

{((1, 1, 1), 1): 0.3456334563345634,
 ((1, 1, 1), 2): 0.22140221402214025,
 ((1, 1, 1), 3): 0.43296432964329645,
 ((1, 1, 2), 1): 0.3456334563345634,
 ((1, 1, 2), 2): 0.22140221402214025,
 ((1, 1, 2), 3): 0.43296432964329645,
 ((1, 1, 3), 1): 0.24600246002460027,
 ((1, 1, 3), 2): 0.37084870848708484,
 ((1, 1, 3), 3): 0.3831488314883149,
 ((1, 1, 4), 1): 0.24600246002460027,
 ((1, 1, 4), 2): 0.37084870848708484,
 ((1, 1, 4), 3): 0.3831488314883149,
 ((1, 1, 5), 1): 0.395448954489545,
 ((1, 1, 5), 2): 0.22140221402214025,
 ((1, 1, 5), 3): 0.3831488314883149,
 ((1, 1, 0), 1): 0.395448954489545,
 ((1, 1, 0), 2): 0.37084870848708484,
 ((1, 1, 0), 3): 0.23370233702337023,
 ((1, 2, 1), 1): 0.42865928659286595,
 ((1, 2, 1), 2): 0.0830258302583026,
 ((1, 2, 1), 3): 0.48831488314883154,
 ((1, 2, 2), 1): 0.42865928659286595,
 ((1, 2, 2), 2): 0.0830258302583026,
 ((1, 2, 2), 3): 0.48831488314883154,
 ((1, 2, 3), 1): 0.32902829028290287,
 ((1, 2, 3), 2): 0.23247232472324728,
 ((1, 2, 3), 3): 0.4384

In [124]:
# Sanity check: These should all be one
a = 0.0
for i in range(1,4):
    a += sc_history_action_probabilities[(1,1,1),i]
print(a)
a = 0.0
for i in range(1,4):
    a += sc_history_action_probabilities[(1,1,2),i]
print(a)
a = 0.0
for i in range(1,4):
    a += sc_history_action_probabilities[(1,4,1),i]
print(a)

1.0
1.0
1.0000000000000002
