In [None]:

import hashlib
import math
import uuid
from typing import Dict, Optional

import pandas as pd

# --- Core AB assignment ---

def _validate_variants(variants: Dict[str, float]) -> None:
    """Validate variant weights without normalization."""
    if not variants:
        raise ValueError("Variants dictionary cannot be empty")

    total = 0.0
    for name, weight in variants.items():
        if weight < 0:
            raise ValueError(f"Variant '{name}' has negative weight: {weight}")
        total += weight

    if total <= 0:
        raise ValueError("Sum of variant weights must be > 0")

    if total > 1.0:
        raise ValueError(f"Sum of variant weights ({total}) exceeds 1.0")


def _hash_to_unit_interval(key: str) -> float:
    """Hash a key to a float in [0, 1)."""
    h = hashlib.sha256(key.encode("utf-8")).hexdigest()
    n = int(h[:15], 16)
    return n / (16 ** 15)


def assign_variant(
    entity_id: str,
    experiment_id: str,
    variants: Dict[str, float],
    *,
    salt: str = "global_ab_salt",
    layer_id: str = "default",
) -> Optional[str]:
    """
    Deterministically pick a variant for one entity.

    The salt, layer, experiment and entity identifiers are joined into a
    single key, which is hashed to a position in [0, 1). Variants are laid
    out along that interval in name order, each occupying a slice as wide as
    its weight; the entity receives the variant whose slice contains its
    position.

    Returns the variant name, or None when the position lies beyond the
    combined weight of all variants (the entity is outside the experiment).
    """
    _validate_variants(variants)

    hash_key = f"{salt}|{layer_id}|{experiment_id}|{entity_id}"
    position = _hash_to_unit_interval(hash_key)

    upper_bound = 0.0
    for label, share in sorted(variants.items()):
        # Zero-weight variants occupy no slice and can never be selected.
        if share != 0:
            upper_bound += share
            if position < upper_bound:
                return label

    # Position is past the last slice: not in experiment.
    return None


# --- DataFrame in -> DataFrame out ---

def assign_variants_df(
    df: pd.DataFrame,
    id_col: str,
    experiment_id: str,
    variants: Dict[str, float],
    *,
    salt: str = "global_ab_salt",
    layer_id: str = "default",
    variant_col: str = "variant"
) -> pd.DataFrame:
    """
    Deterministically assign variants for each row in df based on id_col.

    Each identifier is converted to a string, combined with the salt, layer
    and experiment identifiers, and hashed to a position in [0, 1). Variants
    occupy consecutive slices of that interval in name order, each as wide
    as its weight.

    Args:
        df: Input frame; it is not modified.
        id_col: Name of the column holding entity identifiers.
        experiment_id: Identifier mixed into the hash key.
        variants: Mapping of variant name to weight.
        salt: Salt mixed into the hash key.
        layer_id: Layer identifier mixed into the hash key.
        variant_col: Name of the column that receives the assignment.

    Returns:
        A copy of df with an extra column `variant_col`. Entities outside
        experiment coverage have None in that column.

    Raises:
        ValueError: If id_col is not a column of df, or the variant weights
            fail validation.
    """
    if id_col not in df.columns:
        raise ValueError(f"id_col '{id_col}' not found in DataFrame")

    _validate_variants(variants)

    # Everything below depends only on the arguments, not on the row, so it
    # is computed once instead of once per row.
    key_prefix = f"{salt}|{layer_id}|{experiment_id}|"
    buckets = []  # (cumulative upper bound, variant name), in name order
    cumulative = 0.0
    for name, weight in sorted(variants.items()):
        if weight == 0:
            continue
        cumulative += weight
        buckets.append((cumulative, name))

    def _row_assign(entity_id: str) -> Optional[str]:
        u = _hash_to_unit_interval(f"{key_prefix}{entity_id}")
        for upper_bound, name in buckets:
            if u < upper_bound:
                return name
        return None  # Not in experiment

    out = df.copy()
    out[variant_col] = df[id_col].astype(str).map(_row_assign)
    return out

In [None]:
# Demo population: 100 users identified by random UUID strings.
demo = pd.DataFrame({"user_id": [str(uuid.uuid4()) for _ in range(100)]})

# The weights sum to 0.4, so part of the hash space maps to no variant and
# those users get None in the output column.
variants = dict(control=0.2, treatment=0.2)
assigned = assign_variants_df(
    demo,
    id_col="user_id",
    experiment_id="exp_signup_button_color_v1",
    variants=variants,
    salt="global_ab_salt",
    layer_id="default",
    variant_col="treatment",
)

print(assigned.head())
# value_counts() skips missing values by default, so only users that landed
# in a variant are counted here.
print(assigned["treatment"].value_counts())


The hash key defines your “random universe,” and three fields — `experiment_id`, `salt`, and `layer_id` — control *which* universe you’re in.

You effectively hash:

```text
key = salt | layer_id | experiment_id | entity_id
```

Each piece solves a different problem.

---

### 1. `experiment_id`: isolate experiments + support reruns

Why include it?

* **Independence between experiments.**
  If you *didn’t* use `experiment_id`, the same user would always map to the same bucket across *all* experiments. That means:

  * If a user is in “treatment” once, they’d be in “treatment” everywhere → correlated assignments → biased results when you compare across tests.
* **Stable within a single experiment.**
  Given fixed `experiment_id`, a user keeps their bucket forever (idempotent).
* **Versioning / reruns.**
  Want a fresh randomization? Use `experiment_id="exp_checkout_v2"` and you get a new random split without touching code.

Rule of thumb: **new logical test or new randomization → new `experiment_id`.**

---

### 2. `salt`: prevent gaming + cross-system collisions

Why include it?

* **Security / anti-gaming.**
  Without a secret salt, savvy users/devs could guess:

  * “If my user_id ends with X I get treatment, so I’ll farm accounts with that pattern.”
* **Isolation from other uses of hashing.**
  You might use SHA256 elsewhere; `salt` ensures the A/B assignment space is unique.
* **Safe to share IDs.**
  You can expose experiment names & variant labels in logs/BI without making it trivial to reverse or shape assignments.

Rule of thumb: **keep `salt` server-side, same across platform; rotate only with a migration plan** (rotation changes assignments).

---

### 3. `layer_id`: control mutual exclusivity & traffic layers

Think of `layer_id` as “which slot of the product UI this experiment lives in”.

Why include it?

* **Mutual exclusivity.**
  Suppose you have many experiments on the home screen. You don’t want one user in 5 conflicting treatments.

  * Put them in the same layer (e.g. `layer_id="home_feed"`), then hash each user once per layer to decide **which single experiment** (or variant) that user sees.
* **Independent surfaces.**
  Experiments on unrelated areas (e.g. `"search_page"` vs `"pricing_page"`) should not compete:

  * Different `layer_id` → independent assignments, even for the same user.

Rule of thumb:

* Use **one layer per logical surface or exclusivity group**.
* Default `"default"` is fine if you don’t yet run overlapping tests.

---

So:

* `experiment_id` → “which experiment”.
* `layer_id` → “which exclusivity sandbox”.
* `salt` → “make the whole thing safe & non-predictable”.

Together they give you: deterministic, debuggable, replayable, non-correlated, and non-gameable assignments — i.e. what modern A/B platforms try to guarantee.


In [None]:
from causalis.scenarios.rct.rct_design import assign_variants_df
import pandas as pd
import uuid

In [None]:
# Demo population: 100 users identified by random UUID strings.
demo = pd.DataFrame({"user_id": [str(uuid.uuid4()) for _ in range(100)]})

# The weights sum to 0.4, i.e. less than the full traffic.
variants = dict(control=0.2, treatment=0.2)
assigned = assign_variants_df(
    demo,
    id_col="user_id",
    experiment_id="exp_signup_button_color_v1",
    variants=variants,
    salt="global_ab_salt",
    layer_id="default",
    variant_col="treatment",
)

print(assigned.head())
# value_counts() skips missing values by default, so rows without an
# assigned variant (if any) are not counted here.
print(assigned["treatment"].value_counts())


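How `layer_id` is intended to be used in a full platform:

You define a “layer router”: a per-layer hash of the user that decides which experiment in that layer (if any) the user enters.

Example: the router for `layer_id="home_feed"` decides which one of several experiments (or none) a user enters.

Those experiments share the same layer to enforce mutual exclusivity. Note that the `assign_variants_df` defined earlier in this notebook only mixes `layer_id` into the hash key; it does not route users between experiments, so mutual exclusivity needs this extra routing step on top.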

# SRM

In [None]:
# from __future__ import annotations
#
# from dataclasses import dataclass
# from typing import Dict, Hashable, Iterable, Union
#
# import numpy as np
# import pandas as pd
#
# try:
#     from scipy.stats import chi2
# except ImportError as e:  # pragma: no cover
#     chi2 = None
#     _scipy_import_error = e
# else:
#     _scipy_import_error = None
#
#
# Number = Union[int, float]
#
#
# @dataclass
# class SRMResult:
#     """
#     Result of a Sample Ratio Mismatch (SRM) check.
#     """
#     chi2: float
#     df: int
#     p_value: float
#     expected: Dict[Hashable, float]
#     observed: Dict[Hashable, int]
#     alpha: float
#     is_srm: bool
#     warning: str | None = None
#
#     def __repr__(self) -> str:
#         status = "SRM DETECTED" if self.is_srm else "no SRM"
#         return (
#             f"SRMResult(status={status}, p_value={self.p_value:.3e}, "
#             f"chi2={self.chi2:.4f}, df={self.df})"
#         )
#
#
# def check_srm(
#     assignments: Union[Iterable[Hashable], pd.Series],
#     target_allocation: Dict[Hashable, Number],
#     alpha: float = 1e-3,
#     min_expected: float = 5.0,
#     strict_variants: bool = True,
# ) -> SRMResult:
#     """
#     Check Sample Ratio Mismatch (SRM) for an RCT via chi-square goodness-of-fit test.
#
#     Parameters
#     ----------
#     assignments:
#         Iterable of assigned variant labels for each unit (user_id, session_id, etc.).
#         E.g. Series of ["control", "treatment", ...].
#
#     target_allocation:
#         Mapping {variant: p} describing intended allocation as PROBABILITIES.
#         - Each p must be > 0.
#         - Sum of all p must be 1.0 (within numerical tolerance).
#
#         Examples:
#             {"control": 0.5, "treatment": 0.5}
#             {"A": 0.2, "B": 0.3, "C": 0.5}
#
#     alpha:
#         Significance level. Use strict values like 1e-3 or 1e-4 in production.
#
#     min_expected:
#         If any expected count < min_expected, a warning is attached.
#
#     strict_variants:
#         - True: fail if observed variants differ from target keys.
#         - False: drop unknown variants and test only on declared ones.
#
#     Returns
#     -------
#     SRMResult
#     """
#     # --- Prepare data
#     s = pd.Series(list(assignments)).dropna()
#     if s.empty:
#         raise ValueError("No assignments provided for SRM check.")
#
#     if not target_allocation:
#         raise ValueError("target_allocation cannot be empty.")
#
#     # Validate probabilities
#     probs = np.array(list(target_allocation.values()), dtype=float)
#
#     if (probs <= 0).any():
#         raise ValueError("All target allocation probabilities must be > 0.")
#
#     total = float(probs.sum())
#     if not np.isclose(total, 1.0, rtol=1e-6, atol=1e-8):
#         raise ValueError(
#             f"target_allocation probabilities must sum to 1.0, got {total:.6f}."
#         )
#
#     variants = list(target_allocation.keys())
#     target_map = dict(zip(variants, probs))
#
#     # Observed counts
#     if strict_variants:
#         unexpected = set(s.unique()) - set(variants)
#         if unexpected:
#             raise ValueError(
#                 f"Found assignments to variants not in target_allocation: {unexpected}"
#             )
#
#     if not strict_variants:
#         s = s[s.isin(variants)]
#         if s.empty:
#             raise ValueError(
#                 "After filtering to target variants, no assignments remain."
#             )
#
#     observed_counts = s.value_counts().reindex(variants).fillna(0).astype(int)
#     n = int(observed_counts.sum())
#     if n == 0:
#         raise ValueError("Total sample size is zero after preprocessing.")
#
#     # Expected counts from probabilities * n
#     expected_counts = np.array(
#         [target_map[v] * n for v in variants],
#         dtype=float
#     )
#
#     # Chi-square statistic
#     with np.errstate(divide="ignore", invalid="ignore"):
#         chi2_components = (observed_counts.values - expected_counts) ** 2 / expected_counts
#     chi2_components = np.nan_to_num(chi2_components, nan=0.0, posinf=0.0)
#     chi2_stat = float(chi2_components.sum())
#
#     # Degrees of freedom
#     k = (expected_counts > 0).sum()
#     df = max(k - 1, 1)
#
#     if chi2 is None:
#         raise ImportError(
#             "scipy is required for p-value computation in check_srm(). "
#             f"Original error: {_scipy_import_error}"
#         )
#
#     p_value = float(chi2.sf(chi2_stat, df))
#
#     warning = None
#     if (expected_counts < min_expected).any():
#         warning = (
#             f"Some expected cell counts are < {min_expected:.1f}. "
#             "Chi-square approximation may be unreliable; "
#             "consider exact or simulation-based tests."
#         )
#
#     is_srm = p_value < alpha
#
#     return SRMResult(
#         chi2=chi2_stat,
#         df=df,
#         p_value=p_value,
#         expected={v: float(e) for v, e in zip(variants, expected_counts)},
#         observed={v: int(o) for v, o in zip(variants, observed_counts)},
#         alpha=alpha,
#         is_srm=is_srm,
#         warning=warning,
#     )


In [None]:
from causalis.scenarios.rct import check_srm
import pandas as pd

# Intended design: an even split between the two arms.
target = dict(control=0.5, treatment=0.5)

# Realized assignments: 1200 vs 800 instead of 1000 vs 1000
df_bad = pd.DataFrame({"user_id": range(2000)})
df_bad["variant"] = ["control"] * 1200 + ["treatment"] * 800

# Compare the realized split against the intended allocation.
res_bad = check_srm(
    assignments=df_bad["variant"],
    target_allocation=target,
    alpha=1e-3,
)

print(res_bad)
print("Observed:", res_bad.observed)
print("Expected:", res_bad.expected)
print("is_srm:", res_bad.is_srm)
print("warning:", res_bad.warning)


In [1]:
import pandas as pd

# Intended design: an even split between the two arms.
target = dict(control=0.5, treatment=0.5)

# Realized assignments: 1200 vs 800 instead of 1000 vs 1000
df_bad = pd.DataFrame({"user_id": range(2000)})
df_bad["variant"] = ["control"] * 1200 + ["treatment"] * 800

# Compare the realized split against the intended allocation.
res_bad = check_srm(
    assignments=df_bad["variant"],
    target_allocation=target,
    alpha=1e-3,
)

print(res_bad)
print("Observed:", res_bad.observed)
print("Expected:", res_bad.expected)
print("is_srm:", res_bad.is_srm)
print("warning:", res_bad.warning)


SRMResult(status=SRM DETECTED, p_value=3.744e-19, chi2=80.0000, df=1)
Observed: {'control': 1200, 'treatment': 800}
Expected: {'control': 1000.0, 'treatment': 1000.0}
is_srm: True
