In [None]:



# -----------------------------
# Vectorized learning on fixed pool
# -----------------------------

def simulate_rewards_fixed_pool(depths, T, r0, r1, c, env_tau, rng, low_effort_step=True):
    """
    Compute vectorized lifetime rewards for agents drawing from a fixed environment pool.

    Every agent starts in a uniformly sampled environment and spends high-effort
    attempts on it (each costs ``c`` and one time step). When the number of
    attempts reaches the environment's ``tau`` the agent unlocks it and collects
    ``r1`` for every time step it has left. An agent that has spent
    ``depths[i]`` attempts without unlocking abandons the environment,
    optionally takes one low-effort step (reward ``r0``, one time step), and
    samples a fresh environment.

    Parameters
    ----------
    depths : ndarray, shape (N,)
        Maximum number of attempts each agent spends on one environment.
    T : int
        Lifetime budget (time steps) shared by all agents.
    r0, r1, c : float
        Low-effort reward, per-step reward after unlocking, cost per attempt.
    env_tau : ndarray, shape (M,)
        Attempts needed to unlock each environment. Non-finite values
        (``inf``/``nan``) and negative values (e.g. a ``-1`` sentinel in an
        integer pool) both mean "never unlockable".
    rng : numpy.random.Generator
        Source of randomness for environment sampling.
    low_effort_step : bool, default True
        Whether abandoning an environment costs one low-effort step.

    Returns
    -------
    ndarray, shape (N,), float
        Lifetime reward per agent, clamped below at 0.
    """
    N = depths.shape[0]
    M = env_tau.shape[0]

    depths = depths.astype(int, copy=False)
    remaining = np.full(N, T, dtype=int)
    reward    = np.zeros(N, dtype=float)

    # Without the low-effort step, switching is free; an agent that can never
    # invest (depth <= 0) would then switch forever. Retire such agents up front.
    if not low_effort_step:
        remaining[depths <= 0] = 0

    # per-agent env state
    env_idx  = rng.integers(0, M, size=N, endpoint=False)
    invested = np.zeros(N, dtype=int)

    while True:
        active = remaining > 0
        if not np.any(active):
            break

        can_invest = active & (invested < depths)
        if not np.any(can_invest):
            # Nobody can invest, so every active agent has used up its quota
            # on the current environment without unlocking it.
            need_switch = active

            if low_effort_step:
                reward[need_switch]    += r0
                remaining[need_switch] -= 1

            # reset env/attempts
            invested[need_switch] = 0
            env_idx[need_switch] = rng.integers(0, M, size=need_switch.sum(), endpoint=False)
            continue

        # one high-effort action for all who can invest
        reward[can_invest]    -= c
        remaining[can_invest] -= 1
        invested[can_invest]  += 1

        # An environment is unlockable only if tau is finite AND non-negative:
        # np.isfinite alone is always True for integer pools, so a -1 sentinel
        # would otherwise unlock on the very first attempt.
        tau_now = env_tau[env_idx]          # shape (N,)
        unlockable = np.isfinite(tau_now) & (tau_now >= 0)
        unlock_now = can_invest & unlockable & (invested >= tau_now)

        if np.any(unlock_now):
            reward[unlock_now]   += r1 * remaining[unlock_now]
            remaining[unlock_now] = 0
            # they are done; env_idx/invested irrelevant after remaining=0

        # Agents that reached their depth without unlocking wait (at no time
        # cost) until no agent can invest, then switch together above.

    # Net-negative lifetimes are reported as zero reward.
    reward[reward < 0.0] = 0.0
    return reward

# -----------------------------
# ABM wrapper with fixed pool
# -----------------------------

def reproduce_with_mutation(depths, fitness, m, d_max, rng, epsilon=1e-12):
    """
    Produce the next generation of depths by fitness-proportional sampling plus mutation.

    Parameters
    ----------
    depths : array-like of int, shape (N,)
        Current depth of every agent.
    fitness : array-like of float, shape (N,)
        Non-negative fitness of every agent; ``epsilon`` is added so that an
        all-zero fitness vector degrades to uniform sampling.
    m : float
        Per-offspring mutation probability.
    d_max : int
        Largest allowed depth; offspring depths stay within ``[1, d_max]``.
    rng : numpy.random.Generator
        Source of randomness.
    epsilon : float, default 1e-12
        Small constant added to every fitness value.

    Returns
    -------
    ndarray, shape (N,)
        Offspring depths.
    """
    depths = np.asarray(depths)
    N = len(depths)
    weights = np.asarray(fitness, dtype=float) + epsilon
    p = weights / weights.sum()
    parents = rng.choice(N, size=N, replace=True, p=p)
    new_depths = depths[parents].copy()

    # Each mutating offspring moves one step down or up with equal probability.
    mutate = rng.random(N) < m
    dirs = np.where(rng.random(N) < 0.5, -1, +1)
    new_depths[mutate] += dirs[mutate]

    # reflect at boundaries
    below = new_depths < 1
    new_depths[below] = 2 - new_depths[below]
    above = new_depths > d_max
    new_depths[above] = 2*d_max - new_depths[above]

    # A single reflection can overshoot the opposite boundary when the range
    # is degenerate (d_max == 1 maps 2 -> 0); clamp as a final safeguard.
    np.clip(new_depths, 1, d_max, out=new_depths)
    return new_depths

def run_abm_fixed_pool(
    generations, N, T,
    r0, r1, c,
    pi, D_spec,
    m, d_max,
    M_env=200_000,
    seed=0,
    init_depths=None,
    record_hist=True
):
    """
    Run the agent-based model against an environment pool that is built once.

    The pool of ``M_env`` environments is created up front via
    ``build_environment_pool(M_env, pi, D_spec, seed=...)``. Each generation,
    fitness comes from ``simulate_rewards_fixed_pool`` and the next population
    from ``reproduce_with_mutation`` (using ``m`` and ``d_max``).

    Parameters
    ----------
    generations : int
        Number of generations to simulate.
    N : int
        Population size.
    T, r0, r1, c :
        Forwarded to ``simulate_rewards_fixed_pool``.
    pi, D_spec :
        Forwarded to ``build_environment_pool``.
    m, d_max :
        Forwarded to ``reproduce_with_mutation``; ``d_max`` also bounds the
        random initial depths and sizes the depth histogram.
    M_env : int, default 200_000
        Size of the environment pool.
    seed : int, default 0
        Seed of the generator driving the whole run.
    init_depths : array-like of int, shape (N,), optional
        Starting depths; drawn uniformly from ``1..d_max`` when omitted.
    record_hist : bool, default True
        Whether to record per-generation statistics.

    Returns
    -------
    depths : ndarray, shape (N,)
        Final population.
    history : dict or None
        ``None`` unless ``record_hist``; otherwise arrays indexed by
        generation (0 = initial population) under the keys
        ``"mean_depth"``, ``"var_depth"`` and ``"depth_distribution"``.

    Raises
    ------
    ValueError
        If ``init_depths`` is given with a shape other than ``(N,)``.
    """
    rng = np.random.default_rng(seed)

    # The pool gets its own seed, drawn from the run's generator.
    pool_seed = rng.integers(1<<31)
    env_tau = build_environment_pool(M_env, pi, D_spec, seed=pool_seed)

    # Starting population
    if init_depths is not None:
        depths = np.asarray(init_depths, dtype=int).copy()
        if depths.shape != (N,):
            raise ValueError("init_depths must have shape (N,)")
    else:
        depths = rng.integers(1, d_max+1, size=N)

    history = None
    if record_hist:
        n_rows = generations + 1
        mean_trace = np.zeros(n_rows)
        var_trace = np.zeros(n_rows)
        dist_trace = np.zeros((n_rows, d_max))

        def _record(row, population):
            # Store mean, variance and the normalized histogram over 1..d_max.
            mean_trace[row] = population.mean()
            var_trace[row] = population.var()
            tally = np.bincount(population, minlength=d_max+1)[1:]
            dist_trace[row] = tally / tally.sum()

        _record(0, depths)

    # Evolve: evaluate fitness, then select and mutate.
    for gen in range(1, generations+1):
        fitness = simulate_rewards_fixed_pool(depths, T, r0, r1, c, env_tau, rng)
        depths = reproduce_with_mutation(depths, fitness, m=m, d_max=d_max, rng=rng)
        if record_hist:
            _record(gen, depths)

    if record_hist:
        history = {
            "mean_depth": mean_trace,
            "var_depth": var_trace,
            "depth_distribution": dist_trace,
        }
    return depths, history

# -----------------------------
# Example
# -----------------------------
if __name__ == "__main__":
    # Run size: generations, population size, per-agent lifetime.
    generations, N, T = 200, 5000, 40
    # Low-effort reward, post-unlock reward, cost per attempt.
    r0, r1, c = 1.0, 6.0, 1.0
    # Environment-pool settings handed to run_abm_fixed_pool.
    pi = 0.4
    D_spec = dict(
        name="mixture",
        alpha=0.2,
        tau_easy=3,
        hard=dict(name="geometric", p=0.03),
    )
    # Mutation probability and maximum depth.
    m, d_max = 0.05, 60
    M_env, seed = 300_000, 123

    depths, hist = run_abm_fixed_pool(
        generations=generations,
        N=N,
        T=T,
        r0=r0,
        r1=r1,
        c=c,
        pi=pi,
        D_spec=D_spec,
        m=m,
        d_max=d_max,
        M_env=M_env,
        seed=seed,
        record_hist=True,
    )

    # Report the last generation: mean depth and the share of depths 1..15.
    print("Final mean depth:", hist["mean_depth"][-1])
    print("Depth dist d=1..15:", np.round(hist["depth_distribution"][-1, :15], 3))


In [None]:
import numpy as np

In [None]:
# -----------------------------
# Environment pool
# -----------------------------


# Simple exponential distribution
def get_tau(M, pt, rng):
    """Draw M continuous unlock times from an exponential distribution with rate ``pt`` (mean ``1/pt``)."""
    return rng.exponential(scale=1/pt, size=M)


def build_environment_pool(M, pi, pt, seed):
    """
    Build a fixed pool of M environments once.

    The first ``int(M*pi)`` environments are unlockable; their ``tau`` (number
    of hard learning attempts needed to unlock) is an integer in {1,2,...}.
    All other environments get ``tau = -1`` (never unlockable).

    Parameters
    ----------
    M : int
        Number of environments in the pool.
    pi : float
        Fraction of unlockable environments, in [0, 1].
    pt : float
        Rate parameter forwarded to ``get_tau``.
    seed : int
        Seed for the pool's own random generator.

    Returns
    -------
    ndarray, shape (M,), int
        ``tau`` per environment, ``-1`` for non-unlockable ones.

    Raises
    ------
    ValueError
        If ``pi`` lies outside [0, 1].
    """
    if not 0.0 <= pi <= 1.0:
        raise ValueError("pi must lie in [0, 1]")

    rng = np.random.default_rng(seed)
    M_unlockable = int(M*pi)

    tau = np.full(M, -1, dtype=int)
    tau_vals = get_tau(M_unlockable, pt, rng)
    # Round the continuous draws UP: plain assignment into an int array
    # truncates, turning every draw below 1 into tau = 0 (a free unlock).
    tau[:M_unlockable] = np.maximum(np.ceil(tau_vals), 1).astype(int)
    return tau  # shape (M,), -1 for non-unlockable

In [None]:
# -----------------------------
# Vectorized learning on fixed pool
# -----------------------------

def simulate_rewards_fixed_pool(agent_depths, agent_persistence, agent_T, env_r0, env_r1, env_c, env_tau, rng, low_effort_step=True):
    """
    Vectorized lifetime rewards for agents interacting with a fixed pool of environments.

    Agents sample an environment index uniformly at start and after each failed
    block. Each high-effort attempt costs the current environment's ``env_c``
    and one time step; once the attempts reach the environment's ``tau`` the
    agent collects ``env_r1`` for every remaining time step. An agent that
    spends ``agent_depths[i]`` attempts without unlocking optionally takes one
    low-effort step (reward ``env_r0`` of the environment it leaves) and
    resamples.

    Parameters
    ----------
    agent_depths : ndarray of int, shape (N,)
        Attempts each agent spends on one environment before giving up.
    agent_persistence : float or ndarray
        Compared against a uniform draw per agent when the agent hits its
        quota. NOTE(review): the resulting ``exhausted`` flag is tracked but
        does not yet influence rewards -- the persistence mechanic looks
        unfinished.
    agent_T : ndarray of int, shape (N,)
        Lifetime budget per agent (not modified).
    env_r0, env_r1, env_c : float scalar or ndarray
        Low-effort reward, post-unlock per-step reward, and attempt cost.
        Presumably per-environment, shape (M,); scalars are broadcast --
        TODO confirm intended shape.
    env_tau : ndarray of int, shape (M,)
        Attempts needed to unlock each environment; negative values (the
        ``-1`` sentinel) mean "never unlockable".
    rng : numpy.random.Generator
        Source of randomness.
    low_effort_step : bool, default True
        Whether abandoning an environment costs one low-effort step.

    Returns
    -------
    ndarray, shape (N,), float
        Lifetime reward per agent, clamped below at 0.

    Raises
    ------
    TypeError
        If an input has the wrong dtype family.
    ValueError
        If ``agent_T`` does not have shape ``(N,)``.
    """
    N = agent_depths.shape[0] # number of agents
    M = env_tau.shape[0] # number of environments

    # Explicit validation instead of assert, which is stripped under -O.
    for label, arr in (("agent_depths", agent_depths),
                       ("agent_T", agent_T),
                       ("env_tau", env_tau)):
        if not np.issubdtype(arr.dtype, np.integer):
            raise TypeError(f"{label} must have an integer dtype, got {arr.dtype}")
    for label, arr in (("env_r0", env_r0), ("env_r1", env_r1), ("env_c", env_c)):
        if not np.issubdtype(arr.dtype, np.floating):
            raise TypeError(f"{label} must have a floating dtype, got {arr.dtype}")
    if agent_T.shape != (N,):
        raise ValueError("agent_T must have shape (N,)")

    # One value per environment so the agent's current env_idx can index them.
    r0_by_env = np.broadcast_to(env_r0, (M,))
    r1_by_env = np.broadcast_to(env_r1, (M,))
    c_by_env  = np.broadcast_to(env_c, (M,))

    remaining = agent_T.copy()
    reward    = np.zeros(N, dtype=float)
    exhausted = np.zeros(N, dtype=bool)

    # Without the low-effort step, switching is free; an agent that can never
    # invest (depth <= 0) would then switch forever. Retire such agents up front.
    if not low_effort_step:
        remaining[agent_depths <= 0] = 0

    # per-agent env state
    env_idx  = rng.integers(0, M, size=N, endpoint=False)
    invested = np.zeros(N, dtype=int)

    while True:
        active = remaining > 0
        if not np.any(active):
            break

        # agents that can invest / agents that hit their quota
        can_invest = active & (invested < agent_depths)
        need_switch = active & (invested >= agent_depths)

        # NOTE(review): persistence bookkeeping only -- `exhausted` is not
        # consumed anywhere yet.
        might_exhaust = rng.random(N) < agent_persistence
        exhausted |= need_switch & might_exhaust

        if not np.any(can_invest):
            # Every active agent has hit its quota without unlocking.
            if low_effort_step:
                reward[need_switch]    += r0_by_env[env_idx[need_switch]]
                remaining[need_switch] -= 1

            # reset env/attempts
            invested[need_switch] = 0
            env_idx[need_switch] = rng.integers(0, M, size=need_switch.sum(), endpoint=False)
            continue

        # one high-effort action for all who can invest
        reward[can_invest]    -= c_by_env[env_idx[can_invest]]
        remaining[can_invest] -= 1
        invested[can_invest]  += 1

        # Unlock requires a non-negative tau: np.isfinite is always True for
        # integer arrays and would let the -1 sentinel unlock immediately.
        tau_now = env_tau[env_idx]          # shape (N,)
        unlock_now = can_invest & (tau_now >= 0) & (invested >= tau_now)

        if np.any(unlock_now):
            reward[unlock_now]   += r1_by_env[env_idx[unlock_now]] * remaining[unlock_now]
            remaining[unlock_now] = 0
            # they are done; env_idx/invested irrelevant after remaining=0

        # Agents that reached their depth without unlocking wait (at no time
        # cost) until no agent can invest, then switch together above.

    # Net-negative lifetimes are reported as zero reward.
    reward[reward < 0.0] = 0.0
    return reward

In [None]:
import numpy as np

def interact_hidden_path(
    depths,                    # (N,) int, per-agent exploration depth d_i
    T,                         # int, per-agent lifetime budget (steps/attempts)
    env_M,                     # (M_env,) int, fixed pool of environment difficulties M_e >= 1
    theta,                     # float in (0,1), per-step "right turn" probability
    rho,                       # float in [0,1], persistence: stay after failed attempt
    r_min=1.0, alpha=0.2, beta=1.2,  # parameters for R(M) = r_min + alpha * M**beta
    r_fail=0.0,
    rng=None                   # np.random.Generator (optional)
):
    """
    Vectorized agent-environment interaction for the active-reset hidden-path model.

    Each loop iteration is one attempt per active agent. An attempt explores to
    depth ``k = min(d_i, remaining_i)``; it succeeds with probability
    ``theta**M`` when the environment's difficulty ``M`` is at most ``k`` and
    cannot succeed otherwise. A success costs ``M`` steps and pays
    ``R(M) = r_min + alpha * M**beta``; a failure costs ``k`` steps and pays
    ``r_fail``. After a success the agent always moves to a new environment;
    after a failure it stays with probability ``rho``.

    Returns
    -------
    rewards : (N,) float
        Realized cumulative reward per agent (non-negative as long as
        ``r_min``, ``alpha`` and ``r_fail`` are non-negative).

    Raises
    ------
    ValueError
        If the simulation would run (``T > 0``) with any ``depths < 1`` or
        ``env_M < 1``; such attempts would cost zero steps and never finish.
    """
    if rng is None:
        rng = np.random.default_rng()

    N = depths.shape[0]
    M_env = env_M.shape[0]
    depths = depths.astype(int, copy=False)
    env_M = env_M.astype(int, copy=False)

    # A zero-step attempt never reduces the budget, so the loop below would
    # spin forever; reject such inputs instead of hanging.
    if int(T) > 0 and N > 0:
        if np.any(depths < 1):
            raise ValueError("depths must all be >= 1")
        if np.any(env_M < 1):
            raise ValueError("env_M must all be >= 1")

    # State
    remaining = np.full(N, int(T), dtype=int)
    rewards   = np.zeros(N, dtype=float)

    # Assign initial environments
    env_idx = rng.integers(0, M_env, size=N, endpoint=False)

    # Precompute theta^M for all envs (speeds up repeated masking)
    theta_pow_all = np.power(theta, env_M)

    # Main event loop (attempt-wise, fully vectorized across agents)
    while True:
        active = remaining > 0
        if not np.any(active):
            break

        # Effective attempt depth k_i = min(d_i, remaining_i)
        k = np.minimum(depths, remaining)

        # Per-attempt success probability: q = 1{M<=k} * theta^M
        M_now = env_M[env_idx]
        q = np.where(M_now <= k, theta_pow_all[env_idx], 0.0)

        # Draw successes
        success = (rng.random(N) < q) & active
        fail = (~success) & active

        # Steps spent this attempt: M on success, k on failure, 0 if inactive
        steps_spent = np.zeros(N, dtype=int)
        steps_spent[success] = M_now[success]
        steps_spent[fail] = k[fail]
        remaining -= steps_spent

        # Payoffs: R(M) for a success, r_fail for a failure
        rewards[success] += r_min + alpha * np.power(M_now[success], beta)
        rewards[fail] += r_fail

        # Failing agents stay with probability rho. Using >= makes the
        # boundaries exact: rho=0 always switches, rho=1 never does.
        switch = fail & (rng.random(N) >= rho)

        # Successful agents always move on; draw new environments only for
        # agents that still have budget left.
        need_new_env = (switch | success) & (remaining > 0)

        if np.any(need_new_env):
            env_idx[need_new_env] = rng.integers(0, M_env, size=need_new_env.sum())

    return rewards
