In [1]:
import numpy as np
import pandas as pd
from obp.dataset import OpenBanditDataset
from obp.policy import BernoulliTS
from obp.ope import OffPolicyEvaluation, InverseProbabilityWeighting as IPW, SelfNormalizedInverseProbabilityWeighting as SNIPW


# 1) Load Random/all logs

In [15]:
SEED = 123


ds = OpenBanditDataset(behavior_policy="random", campaign="all")
bf = ds.obtain_batch_bandit_feedback()
n = bf["n_rounds"]
print(f"Loaded Random/all: rounds={n:,}, n_actions={bf['n_actions']}, len_list={ds.len_list}")


INFO:obp.dataset.real:When `data_path` is not given, this class downloads the small-sized version of Open Bandit Dataset.


Loaded Random/all: rounds=10,000, n_actions=80, len_list=3


In [19]:
bf.keys()

dict_keys(['n_rounds', 'n_actions', 'action', 'position', 'reward', 'pscore', 'context', 'action_context'])

In [23]:
bf['context']

array([[1, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 1, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 1, ..., 0, 0, 1],
       [1, 0, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]], dtype=uint8)

In [24]:
bf['action_context']

array([[ 5.00000000e+00,  1.00000000e+01,  4.00000000e+00,
        -4.99171626e-01],
       [ 1.00000000e+00,  1.00000000e+01,  4.00000000e+00,
        -5.43775368e-01],
       [ 1.00000000e+00,  1.20000000e+01,  1.00000000e+00,
         9.72751855e-01],
       [ 2.00000000e+00,  1.50000000e+01,  6.00000000e+00,
        -5.21473497e-01],
       [ 1.00000000e+01,  1.00000000e+00,  1.00000000e+00,
         1.90943043e+00],
       [ 2.00000000e+00,  1.00000000e+01,  4.00000000e+00,
        -4.99171626e-01],
       [ 5.00000000e+00,  1.70000000e+01,  5.00000000e+00,
         3.48299469e-01],
       [ 8.00000000e+00,  1.00000000e+01,  4.00000000e+00,
        -2.76152917e-01],
       [ 2.00000000e+00,  1.80000000e+01,  4.00000000e+00,
        -6.55284723e-01],
       [ 5.00000000e+00,  1.40000000e+01,  0.00000000e+00,
        -3.87662271e-01],
       [ 1.00000000e+01,  1.10000000e+01,  4.00000000e+00,
         3.48299469e-01],
       [ 5.00000000e+00,  3.00000000e+00,  1.00000000e+00,
      

In [22]:
df = pd.DataFrame({
    "n_rouds": bf["n_rounds"],
"n_actions": bf["n_actions"],
    "action": bf["action"],
    "position": bf["position"],
    "reward": bf["reward"],
    "pscore": bf["pscore"],
    # "context": bf["context"],
    # "action_context": bf["action_context"]
})
df.head()

Unnamed: 0,n_rouds,n_actions,action,position,reward,pscore
0,10000,80,14,2,0,0.0125
1,10000,80,14,2,0,0.0125
2,10000,80,27,2,0,0.0125
3,10000,80,48,1,0,0.0125
4,10000,80,36,1,0,0.0125


# 2) Define evaluation policy = BTS with ZOZOTOWN prior
The definition of BernoulliTS is here 
https://github.com/st-tech/zr-obp/blob/master/obp/policy/contextfree.py
* Recall: Thompson Sampling assumes each action’s reward probability (CTR) follows a Beta(α, β) distribution.
* α ≈ number of observed successes (clicks).
* β ≈ number of observed failures (non-clicks).

So for action 0,
* alpha=47.0, beta=12198.0
Prior CTR ≈ 47 / (47 + 12198) ≈ 0.0038.

In [3]:
pi_e = BernoulliTS(
    n_actions=ds.n_actions,
    len_list=ds.len_list,
    is_zozotown_prior=True,
    campaign="all",
    random_state=SEED,
)
pi_e

BernoulliTS(n_actions=80, len_list=3, batch_size=1, random_state=123, alpha=[47.0, 8.0, 62.0, 142.0, 3.0, 14.0, 7.0, 857.0, 12.0, 15.0, 6.0, 100.0, 48.0, 23.0, 71.0, 61.0, 13.0, 16.0, 518.0, 30.0, 7.0, 4.0, 23.0, 8.0, 10.0, 11.0, 11.0, 18.0, 121.0, 11.0, 11.0, 10.0, 14.0, 9.0, 204.0, 58.0, 3.0, 19.0, 42.0, 1013.0, 2.0, 328.0, 15.0, 31.0, 14.0, 138.0, 45.0, 55.0, 23.0, 38.0, 10.0, 401.0, 52.0, 6.0, 3.0, 6.0, 5.0, 32.0, 35.0, 133.0, 52.0, 820.0, 43.0, 195.0, 8.0, 42.0, 40.0, 4.0, 32.0, 30.0, 9.0, 22.0, 6.0, 23.0, 5.0, 54.0, 8.0, 22.0, 65.0, 246.0], beta=[12198.0, 3566.0, 15993.0, 35522.0, 2367.0, 4609.0, 3171.0, 181745.0, 4372.0, 4951.0, 3100.0, 24665.0, 13210.0, 7061.0, 18061.0, 17449.0, 5644.0, 6787.0, 111326.0, 8776.0, 3334.0, 2271.0, 7389.0, 2659.0, 3665.0, 4724.0, 3561.0, 5085.0, 27407.0, 4601.0, 4756.0, 4120.0, 4736.0, 3788.0, 45292.0, 14719.0, 2189.0, 5589.0, 11995.0, 222255.0, 2308.0, 70034.0, 4801.0, 8274.0, 5421.0, 31912.0, 12213.0, 13576.0, 6230.0, 10382.0, 4141.0, 85731.0, 12

In [31]:
type(pi_e)

obp.policy.contextfree.BernoulliTS

In [35]:
len(pi_e.alpha), len(pi_e.beta)

(80, 80)

# 3) Compute evaluation action distribution for each round (factorized by slot)

In [6]:
# after compute_batch_action_dist(...)
print("raw action_dist shape:", action_dist.shape)  # e.g., (10000, 80, 3)

# 1) If it's 2D, tile across positions
if action_dist.ndim == 2:  # (n_rounds, n_actions)
    action_dist = np.repeat(action_dist[:, None, :], ds.len_list, axis=1)

# 2) If it's 3D but axis order is (n_rounds, n_actions, len_list), swap to (n_rounds, len_list, n_actions)
if action_dist.ndim == 3 and action_dist.shape[1] == ds.n_actions and action_dist.shape[2] == ds.len_list:
    action_dist = np.swapaxes(action_dist, 1, 2)

# final sanity checks
assert action_dist.shape == (n, ds.len_list, ds.n_actions), action_dist.shape

# normalize over the *action* axis (last axis) just in case
sums = action_dist.sum(axis=2, keepdims=True)
action_dist = action_dist / np.clip(sums, 1e-12, None)

# verify each position’s probs sum to 1
sums_check = action_dist.sum(axis=2)  # (n_rounds, len_list)
assert np.allclose(sums_check, 1.0, atol=1e-6), (sums_check.min(), sums_check.max())
print("action_dist OK:", action_dist.shape)

raw action_dist shape: (10000, 80, 3)
action_dist OK: (10000, 3, 80)


In [10]:
import numpy as np

print("shape:", action_dist.shape, "dtype:", action_dist.dtype)
print("nan? ", np.isnan(action_dist).any())
print("min/max:", float(action_dist.min()), float(action_dist.max()))

# per-(round,slot) sums should be exactly 1
sums = action_dist.sum(axis=2)
print("sum range:", float(sums.min()), float(sums.max()))

bad_neg   = (action_dist < 0).any()
bad_gt1   = (action_dist > 1).any()
bad_nan   = np.isnan(action_dist).any()
bad_sum   = (~np.isclose(sums, 1.0, atol=1e-6)).any()

print("neg?", bad_neg, "| >1?", bad_gt1, "| NaN?", bad_nan, "| sums!=1?", bad_sum)

# how many slots have sum==0 (totally invalid)
zero_sum = (sums <= 0).sum()
print("zero-sum slots:", int(zero_sum))

shape: (10000, 3, 80) dtype: float64
nan?  False
min/max: 1.0000000000000004e-05 0.24366000000000004
sum range: 0.9999999999999999 1.0000000000000002
neg? False | >1? False | NaN? False | sums!=1? False
zero-sum slots: 0


In [13]:
import numpy as np
from obp.ope import OffPolicyEvaluation, InverseProbabilityWeighting as IPW, SelfNormalizedInverseProbabilityWeighting as SNIPW

def ensure_3d_action_dist(ad, len_list, n_actions):
    """Return (n_rounds, len_list, n_actions) valid probs."""
    ad = np.asarray(ad, dtype=np.float64)

    # (n, A) -> tile across positions
    if ad.ndim == 2 and ad.shape[1] == n_actions:
        ad = np.repeat(ad[:, None, :], len_list, axis=1)

    # If axes are (n, A, L) -> swap to (n, L, A)
    if ad.ndim == 3 and ad.shape[1] == n_actions and ad.shape[2] == len_list:
        ad = np.swapaxes(ad, 1, 2)

    # Final shape guard
    assert ad.ndim == 3 and ad.shape[1] == len_list and ad.shape[2] == n_actions, f"bad shape {ad.shape}"

    # Clean & renormalize over the action axis (last)
    ad = np.nan_to_num(ad, nan=0.0, posinf=0.0, neginf=0.0)
    ad = np.clip(ad, 0.0, None)
    sums = ad.sum(axis=2, keepdims=True)
    zero = sums <= 0
    if np.any(zero):
        # fill any degenerate slot with uniform
        ad[zero.repeat(n_actions, axis=2)] = 1.0 / n_actions
        sums = ad.sum(axis=2, keepdims=True)
    ad /= sums
    # sanity
    sums_check = ad.sum(axis=2)
    assert np.allclose(sums_check, 1.0, atol=1e-6), (sums_check.min(), sums_check.max())
    return ad

# 1) Make sure action_dist is valid 3D
action_dist_3d = ensure_3d_action_dist(action_dist, ds.len_list, ds.n_actions)
print("action_dist_3d:", action_dist_3d.shape, "sum range:",
      float(action_dist_3d.sum(axis=2).min()), float(action_dist_3d.sum(axis=2).max()))

# 2) Run OPE (pass the 3D array)
ope = OffPolicyEvaluation(bandit_feedback=bf, ope_estimators=[IPW(), SNIPW()])
est = ope.estimate_policy_values(action_dist=action_dist_3d)

# 3) Report
logged = bf["reward"].mean()
ipw_val  = est["ipw"]
snipw_val = est["snipw"]
print(f"Logged avg reward (Random/all): {logged:.6f}")
print(f"IPW estimate for BTS:            {ipw_val:.6f}")
print(f"SNIPW estimate for BTS:          {snipw_val:.6f}")
print(f"Relative (IPW / logged):         {ipw_val/logged:.3f}x")

# 4) Optional importance-weight sanity (taken action at logged slot)
row = np.arange(bf["n_rounds"])
pi_e_taken = action_dist_3d[row, bf["position"], bf["action"]]
w = pi_e_taken / bf["pscore"]
print("Mean importance weight (≈1):", w.mean())
ess = (w.sum()**2) / (w**2).sum()
print(f"ESS: {ess:.0f} / {bf['n_rounds']}")

action_dist_3d: (10000, 3, 80) sum range: 0.9999999999999998 1.0000000000000002


ValueError: `action_dist` must be a probability distribution

ope = OffPolicyEvaluation(bandit_feedback=bf, ope_estimators=[IPW(), SNIPW()])
est = ope.estimate_policy_values(action_dist=action_dist)