amazon-science · thomaspinder · Sep 2, 2024 · Sep 2, 2024 · Sep 2, 2024
diff --git a/pyproject.toml b/pyproject.toml
@@ -97,6 +97,7 @@ exclude_lines = [
   "if __name__ == .__main__.:",
   "if TYPE_CHECKING:",
 ]
+show_missing = true
 
 [tool.black]
 line-length = 88

diff --git a/src/causal_validation/__about__.py b/src/causal_validation/__about__.py
@@ -1,3 +1,3 @@
-__version__ = "0.0.2"
+__version__ = "0.0.3"
 
 __all__ = ["__version__"]
diff --git a/src/causal_validation/config.py b/src/causal_validation/config.py
@@ -7,11 +7,12 @@
 
 import numpy as np
 
+from causal_validation.types import (
+    Number,
+    WeightTypes,
+)
 from causal_validation.weights import UniformWeights
 
-if tp.TYPE_CHECKING:
-    from causal_validation.types import WeightTypes
-
 
 @dataclass(kw_only=True, frozen=True)
 class WeightConfig:
@@ -23,8 +24,8 @@ class Config:
     n_control_units: int
     n_pre_intervention_timepoints: int
     n_post_intervention_timepoints: int
-    global_mean: float = 20.0
-    global_scale: float = 0.2
+    global_mean: Number = 20.0
+    global_scale: Number = 0.2
     start_date: dt.date = dt.date(year=2023, month=1, day=1)
     seed: int = 123
     weights_cfg: WeightConfig = field(default_factory=WeightConfig)

diff --git a/src/causal_validation/data.py b/src/causal_validation/data.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from copy import deepcopy
 from dataclasses import dataclass
 import datetime as dt
@@ -26,7 +28,11 @@ class Dataset:
     _start_date: dt.date
     counterfactual: tp.Optional[Float[np.ndarray, "M 1"]] = None
 
-    def to_df(self, index_start: str = "2023-01-01") -> pd.DataFrame:
+    # `index_start` is typed as str-or-date: the default is now a `dt.date`, while
+    # the previous signature (and default) used a date string.
+    def to_df(
+        self, index_start: tp.Union[str, dt.date] = dt.date(year=2023, month=1, day=1)
+    ) -> pd.DataFrame:
         inputs = np.vstack([self.Xtr, self.Xte])
         outputs = np.vstack([self.ytr, self.yte])
         data = np.hstack([outputs, inputs])
@@ -54,11 +58,13 @@ def n_timepoints(self) -> int:
         return self.n_post_intervention + self.n_pre_intervention
 
     @property
-    def control_units(self) -> Float[np.ndarray, "N+M 1"]:
+    def control_units(self) -> Float[np.ndarray, "{self.n_timepoints} {self.n_units}"]:
+        """Return `Xtr` stacked on top of `Xte` (rows are concatenated)."""
         return np.vstack([self.Xtr, self.Xte])
 
     @property
-    def treated_units(self) -> Float[np.ndarray, "N+M 1"]:
+    def treated_units(self) -> Float[np.ndarray, "{self.n_timepoints} 1"]:
+        """Return `ytr` stacked on top of `yte` (rows are concatenated)."""
         return np.vstack([self.ytr, self.yte])
 
     @property
@@ -94,24 +98,57 @@ def _get_columns(self) -> tp.List[str]:
         colnames = ["T"] + [f"C{i}" for i in range(self.n_units)]
         return colnames
 
-    def _get_index(self, start_date: str) -> pd.Series:
+    def _get_index(self, start_date: dt.date) -> pd.DatetimeIndex:
+        """Return one daily timestamp per timepoint, beginning at `start_date`."""
         return pd.date_range(start=start_date, freq="D", periods=self.n_timepoints)
 
     def _get_indicator(self) -> Integer[np.ndarray, "N 1"]:
+        """Return a column of 0s (pre-intervention rows) stacked on 1s (post)."""
         indicator = np.vstack(
             [
-                np.zeros(shape=(self.n_pre_intervention, 1)),
-                np.ones(shape=(self.n_post_intervention, 1)),
+                # Allocate as int64 directly instead of casting a float temporary.
+                np.zeros(shape=(self.n_pre_intervention, 1), dtype=np.int64),
+                np.ones(shape=(self.n_post_intervention, 1), dtype=np.int64),
             ]
         )
         return indicator
 
-    def inflate(self, inflation_vals: Float[np.ndarray, "M 1"]) -> "Dataset":
+    def inflate(self, inflation_vals: Float[np.ndarray, "M 1"]) -> Dataset:
+        """Return a copy whose post-intervention treated values are scaled.
+
+        The original (unscaled) post-intervention treated values are handed to
+        the new `Dataset` as its last constructor argument.
+        """
         Xtr, ytr = [deepcopy(i) for i in self.pre_intervention_obs]
         Xte, yte = [deepcopy(i) for i in self.post_intervention_obs]
         inflated_yte = yte * inflation_vals
         return Dataset(Xtr, Xte, ytr, inflated_yte, self._start_date, yte)
 
+    def __eq__(self, other: object) -> bool:
+        """Compare the four observation arrays of two datasets.
+
+        Two datasets are equal when `Xtr`, `Xte`, `ytr` and `yte` each have
+        identical shapes and element-wise close values (`np.allclose`). The
+        start date and counterfactual are not compared.
+
+        Returns `NotImplemented` for non-`Dataset` operands so Python can fall
+        back to its default comparison instead of raising `AttributeError`.
+        """
+        if not isinstance(other, Dataset):
+            return NotImplemented
+        array_pairs = (
+            (self.Xtr, other.Xtr),
+            (self.Xte, other.Xte),
+            (self.ytr, other.ytr),
+            (self.yte, other.yte),
+        )
+        # Check shapes first: `np.allclose` broadcasts, so mismatched shapes
+        # would either raise or compare equal by accident.
+        return all(
+            lhs.shape == rhs.shape and np.allclose(lhs, rhs)
+            for lhs, rhs in array_pairs
+        )
+
     def to_azcausal(self):
         time_index = np.arange(self.n_timepoints)
         data = self.to_df().assign(time=time_index).melt(id_vars=["time", "treated"])
@@ -128,3 +145,43 @@ def to_azcausal(self):
     @property
     def _slots(self) -> tp.Dict[str, int]:
         return {"n_units": self.n_units + 1, "n_timepoints": self.n_timepoints}
+
+    def drop_unit(self, idx: int) -> Dataset:
+        """Return a new dataset without the control unit in column `idx`.
+
+        Treated observations, start date and counterfactual are passed through
+        unchanged.
+        """
+        remaining_pre, remaining_post = (
+            np.delete(block, [idx], axis=1) for block in (self.Xtr, self.Xte)
+        )
+        return Dataset(
+            remaining_pre,
+            remaining_post,
+            self.ytr,
+            self.yte,
+            self._start_date,
+            self.counterfactual,
+        )
+
+    def to_placebo_data(self, to_treat_idx: int) -> Dataset:
+        """Promote control unit `to_treat_idx` to the treated slot.
+
+        The chosen column is removed from the controls and its pre- and
+        post-intervention values become `ytr` and `yte` of the returned dataset.
+        """
+        placebo_ytr = self.Xtr[:, to_treat_idx].reshape(-1, 1)
+        placebo_yte = self.Xte[:, to_treat_idx].reshape(-1, 1)
+        return reassign_treatment(
+            self.drop_unit(to_treat_idx), placebo_ytr, placebo_yte
+        )
+
+
+def reassign_treatment(
+    data: Dataset, ytr: Float[np.ndarray, "N 1"], yte: Float[np.ndarray, "M 1"]
+) -> Dataset:
+    """Build a dataset from `data`'s controls with `ytr`/`yte` as treated values.
+
+    The start date and counterfactual of `data` are carried over as-is.
+    """
+    return Dataset(data.Xtr, data.Xte, ytr, yte, data._start_date, data.counterfactual)
diff --git a/src/causal_validation/types.py b/src/causal_validation/types.py
@@ -9,3 +9,4 @@
 WeightTypes = tp.Literal["uniform", "non-uniform"]
 InterventionTypes = tp.Literal["pre-intervention", "post-intervention", "both"]
 RandomVariable = tp.Union[rv_continuous, rv_discrete]
+Number = tp.Union[float, int]
diff --git a/src/causal_validation/weights.py b/src/causal_validation/weights.py
@@ -16,10 +16,10 @@
 class AbstractWeights(BaseObject):
     name: str = "Abstract Weights"
 
-    def _get_weights(self, obs: Float[np.ndarray, "N D"]) -> Float[np.ndarray, " D"]:
+    def _get_weights(self, obs: Float[np.ndarray, "N D"]) -> Float[np.ndarray, "D 1"]:
         raise NotImplementedError("Please implement `_get_weights` in all subclasses.")
 
-    def get_weights(self, obs: Float[np.ndarray, "N D"]) -> Float[np.ndarray, " D"]:
+    def get_weights(self, obs: Float[np.ndarray, "N D"]) -> Float[np.ndarray, "D 1"]:
         weights = self._get_weights(obs)
 
         np.testing.assert_almost_equal(
@@ -42,7 +42,8 @@ def weight_obs(self, obs: Float[np.ndarray, "N D"]) -> Float[np.ndarray, "N 1"]:
 class UniformWeights(AbstractWeights):
     name: str = "Uniform Weights"
 
-    def _get_weights(self, obs: Float[np.ndarray, "N D"]) -> Float[np.ndarray, " D"]:
+    def _get_weights(self, obs: Float[np.ndarray, "N D"]) -> Float[np.ndarray, "D 1"]:
+        """Return a (D, 1) column in which every entry is 1 / D, D = obs.shape[1]."""
         n_units = obs.shape[1]
         return np.repeat(1.0 / n_units, repeats=n_units).reshape(-1, 1)
 

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,6 +1,11 @@
 from hypothesis import settings
+from jaxtyping import install_import_hook
 
 settings.register_profile(
     "causal_validation", database=None, max_examples=10, deadline=None
 )
 settings.load_profile("causal_validation")
+# NOTE(review): the hook only instruments modules imported inside this `with`
+# block - confirm the package import pulls in every submodule the tests rely on.
+with install_import_hook("causal_validation", "beartype.beartype"):
+    import causal_validation  # noqa: F401
diff --git a/tests/test_causal_validation/test_amzn_synthetic_causal_data_gen.py b/tests/test_causal_validation/test_amzn_synthetic_causal_data_gen.py
diff --git a/tests/test_causal_validation/test_data.py b/tests/test_causal_validation/test_data.py
@@ -1,3 +1,5 @@
+from copy import deepcopy
+
 from azcausal.estimators.panel.did import DID
 from hypothesis import (
     given,
@@ -7,8 +9,12 @@
 import numpy as np
 import pandas as pd
 from pandas.core.indexes.datetimes import DatetimeIndex
+import pytest
 
-from causal_validation.data import Dataset
+from causal_validation.data import (
+    Dataset,
+    reassign_treatment,
+)
 from causal_validation.testing import (
     TestConstants,
     simulate_data,
@@ -166,3 +172,92 @@ def test_get_index(n_post_treatment: int, n_pre_treatment: int, idx: Interventio
         assert len(idx_vals) == n_post_treatment
     elif idx == "pre-intervention":
         assert len(idx_vals) == n_pre_treatment
+
+
+@pytest.mark.parametrize("n_pre, n_post, n_control", [(60, 30, 10), (60, 30, 20)])
+def test_drop_unit(n_pre: int, n_post: int, n_control: int):
+    """Dropping any control unit leaves the expected array shapes."""
+    constants = TestConstants(
+        N_POST_TREATMENT=n_post,
+        N_PRE_TREATMENT=n_pre,
+        N_CONTROL=n_control,
+    )
+    data = simulate_data(0.0, DEFAULT_SEED, constants=constants)
+    expected_shapes = {
+        "Xtr": (n_pre, n_control - 1),
+        "Xte": (n_post, n_control - 1),
+        "ytr": (n_pre, 1),
+        "yte": (n_post, 1),
+    }
+
+    for unit in range(n_control):
+        reduced_data = data.drop_unit(unit)
+        for attr, shape in expected_shapes.items():
+            assert getattr(reduced_data, attr).shape == shape
+
+
+@pytest.mark.parametrize("n_pre, n_post, n_control", [(60, 30, 10), (60, 30, 20)])
+def test_to_placebo(n_pre: int, n_post: int, n_control: int):
+    """Placebo data has one fewer control and differs from the source data."""
+    constants = TestConstants(
+        N_POST_TREATMENT=n_post,
+        N_PRE_TREATMENT=n_pre,
+        N_CONTROL=n_control,
+    )
+    data = simulate_data(0.0, DEFAULT_SEED, constants=constants)
+    expected_shapes = {
+        "Xtr": (n_pre, n_control - 1),
+        "Xte": (n_post, n_control - 1),
+        "ytr": (n_pre, 1),
+        "yte": (n_post, 1),
+    }
+
+    for unit in range(n_control):
+        placebo_data = data.to_placebo_data(unit)
+        for attr, shape in expected_shapes.items():
+            assert getattr(placebo_data, attr).shape == shape
+        assert data != placebo_data
+
+
+@given(
+    n_control=st.integers(min_value=2, max_value=50),
+    n_pre_treatment=st.integers(min_value=10, max_value=50),
+    n_post_treatment=st.integers(min_value=10, max_value=50),
+    global_mean=st.floats(
+        min_value=-5.0, max_value=5.0, allow_infinity=False, allow_nan=False
+    ),
+)
+@settings(max_examples=10)
+def test_eq(
+    n_control: int, n_pre_treatment: int, n_post_treatment: int, global_mean: float
+):
+    """A deep copy compares equal; dropping any control unit breaks equality."""
+    constants = TestConstants(
+        N_POST_TREATMENT=n_post_treatment,
+        N_PRE_TREATMENT=n_pre_treatment,
+        N_CONTROL=n_control,
+    )
+    data = simulate_data(global_mean, DEFAULT_SEED, constants=constants)
+    assert data == deepcopy(data)
+
+    # Shape mismatch
+    for unit in range(n_control):
+        assert data != data.drop_unit(unit)
+
+
+@pytest.mark.parametrize("n_pre, n_post, n_control", [(60, 30, 10), (60, 30, 20)])
+def test_reassign_treatment(n_pre: int, n_post: int, n_control: int):
+    """Reassigned treated values are stored verbatim and change equality."""
+    constants = TestConstants(
+        N_POST_TREATMENT=n_post,
+        N_PRE_TREATMENT=n_pre,
+        N_CONTROL=n_control,
+    )
+    data = simulate_data(0.0, DEFAULT_SEED, constants=constants)
+    ones_ytr = np.ones(shape=(n_pre, 1))
+    ones_yte = np.ones(shape=(n_post, 1))
+
+    reassigned_data = reassign_treatment(data, ones_ytr, ones_yte)
+    assert data != reassigned_data
+    np.testing.assert_equal(reassigned_data.ytr, ones_ytr)
+    np.testing.assert_equal(reassigned_data.yte, ones_yte)