Coerce model dimension dtypes to numeric (#575)

calliope-project · Feb 29, 2024 · 41f9060 · 41f9060
1 parent 0240bcc
commit 41f9060
Show file tree

Hide file tree

Showing 4 changed files with 101 additions and 6 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -22,6 +22,9 @@ Both the index and the values of the timeseries (both being date strings) should
 
 |changed| `inbuilt` math -> `pre-defined` math and `custom` math -> `pre-defined` math in the documentation.
 
+|fixed| Dimensions with numeric data can be defined in tabular data _or_ YAML and will appear as numeric in the processed Calliope model input dataset.
+If all dimension data can be coerced to a numeric data type (e.g. `["10", 100, "-1"]`), then it _will_ be coerced (e.g., `[10, 100, -1]`).
+
 ## 0.7.0.dev2 (2024-01-26)
 
 v0.7 includes a major change to how Calliope internally operates.

diff --git a/docs/creating/data_sources.md b/docs/creating/data_sources.md
@@ -400,6 +400,11 @@ E.g.,
     data_sources:
       ...
     ```
+6. We process dimension data after loading it in, according to a limited set of heuristics:
+    1. We assume any dimension with the suffix `steps` (e.g., `timesteps`, `monthsteps`) is timeseries data, and attempt to convert the data type of the dimension values accordingly.
+    2. We will attempt to convert dimension data to numeric values.
+    Therefore, dimensions with the data `[1, 2]`, `["1", "2"]`, `[1, "2"]`, and `["1.0", 2.0]` will all be converted to having a numeric data type (integer or float).
+    `["foo", "1"]` and `["foo", 1]` will _not_ be converted, as not all dimension data entries are convertible to numeric data types.
 
 ### Data you _cannot_ load in tabular format
 

diff --git a/src/calliope/preprocess/model_data.py b/src/calliope/preprocess/model_data.py
@@ -695,18 +695,23 @@ def _links_to_node_format(self, active_node_dict: AttrDict) -> AttrDict:
 
         return link_tech_dict
 
-    def _add_to_dataset(self, to_add: xr.Dataset, id: str):
+    def _add_to_dataset(self, to_add: xr.Dataset, id_: str):
         """Add new data to the central class dataset.
 
         Before being added, dimension data that can be wholly coerced to numeric is cast to a numeric dtype (see `_update_numeric_dims`), and then any dimensions with the `steps` suffix will be cast to datetime dtype.
 
         Args:
             to_add (xr.Dataset): Dataset to merge into the central dataset.
-            id (str): ID of dataset being added, to use in log messages
+            id_ (str): ID of dataset being added, to use in log messages
         """
-        to_add = time.timeseries_to_datetime(to_add, self.config["time_format"], id)
+        to_add_numeric_dims = self._update_numeric_dims(to_add, id_)
+        to_add_numeric_ts_dims = time.timeseries_to_datetime(
+            to_add_numeric_dims, self.config["time_format"], id_
+        )
         self.dataset = xr.merge(
-            [to_add, self.dataset], combine_attrs="no_conflicts", compat="override"
+            [to_add_numeric_ts_dims, self.dataset],
+            combine_attrs="no_conflicts",
+            compat="override",
         ).fillna(self.dataset)
 
     def _log_param_updates(self, param_name: str, param_da: xr.DataArray):
@@ -747,6 +752,33 @@ def _update_one_way_links(node_from_data: dict, node_to_data: dict):
         node_from_data.pop("carrier_out")  # cannot import carriers at the `from` node
         node_to_data.pop("carrier_in")  # cannot export carrier at the `to` node
 
+    @staticmethod
+    def _update_numeric_dims(ds: xr.Dataset, id_: str) -> xr.Dataset:
+        """Try coercing all dimension data of the input dataset to a numeric data type.
+
+        Any dimensions where _all_ its data is potentially numeric will be returned with all data coerced to numeric.
+        All other dimensions (including datetime and timedelta dimensions) will be returned as they were in the input dataset.
+        No changes are made to data variables in the dataset.
+
+        Coordinates are assigned on `ds` itself, so the input dataset is updated in place as well as being returned.
+
+        Args:
+            ds (xr.Dataset): Dataset possibly containing numeric dimensions.
+            id_ (str): Identifier for `ds` to use in logging.
+
+        Returns:
+            xr.Dataset: Input `ds` with numeric coordinates.
+        """
+
+        for dim_name in ds.dims:
+            dim_index = ds.coords[dim_name].to_index()
+            # `pd.to_numeric` does not fail on datetime/timedelta data; it reinterprets it as int64.
+            # Skip those dimensions so that already-parsed time data is left untouched.
+            if dim_index.dtype.kind in ("m", "M"):
+                continue
+            try:
+                numeric_index = pd.to_numeric(dim_index)
+            except (ValueError, TypeError):
+                # At least one entry cannot be parsed as a number: leave the dimension as it is.
+                continue
+            ds.coords[dim_name] = numeric_index
+            LOGGER.debug(
+                f"{id_} | Updating `{dim_name}` dimension index values to numeric type."
+            )
+
+        return ds
+
     def _raise_error_on_transmission_tech_def(
         self, tech_def_dict: AttrDict, node_name: str
     ):

diff --git a/tests/test_preprocess_model_data.py b/tests/test_preprocess_model_data.py
@@ -702,12 +702,67 @@ def test_add_to_dataset_no_timeseries(
         new_param = simple_da.copy().to_dataset(name="non_ts_data")
         model_data_factory._add_to_dataset(new_param, "foo")
 
-        assert "foo | Updating" not in my_caplog.text
-        assert "datetime format" not in my_caplog.text
+        assert "dimension index values to datetime format" not in my_caplog.text
         # make sure nothing has changed in the array
         assert "non_ts_data" in model_data_factory.dataset
         assert model_data_factory.dataset["non_ts_data"].equals(simple_da)
 
+    @pytest.mark.parametrize(
+        ["data", "kind"],
+        [
+            ([1, 2], "i"),
+            (["1", "2"], "i"),
+            (["1", 2], "i"),
+            ([1, "2"], "i"),
+            ([1.0, 2.0], "f"),
+            (["1.0", "2.0"], "f"),
+            ([1, "2.0"], "f"),
+            (["1", 2.0], "f"),
+        ],
+    )
+    def test_update_numeric_dims(
+        self, my_caplog, model_data_factory: ModelDataFactory, data, kind
+    ):
+        """Dimension data made up only of numbers / numeric strings is coerced to the expected dtype kind and the update is logged."""
+        new_idx = pd.Index(data, name="bar")
+        new_param = pd.DataFrame({"my_data": [True, False]}, index=new_idx).to_xarray()
+        updated_ds = model_data_factory._update_numeric_dims(new_param, "foo")
+
+        assert (
+            "foo | Updating `bar` dimension index values to numeric type"
+            in my_caplog.text
+        )
+        assert updated_ds.coords["bar"].dtype.kind == kind
+
+    @pytest.mark.parametrize(["data", "kind"], [(["1", 2], "i"), ([1.0, "2.0"], "f")])
+    def test_update_numeric_dims_in_model_data(
+        self, my_caplog, model_data_factory: ModelDataFactory, data, kind
+    ):
+        """Numeric coercion of dimension data is applied (and logged) when data is added via `_add_to_dataset`."""
+        new_idx = pd.Index(data, name="bar")
+        new_param = pd.DataFrame({"num_data": [True, False]}, index=new_idx).to_xarray()
+        model_data_factory._add_to_dataset(new_param, "foo")
+
+        assert (
+            "foo | Updating `bar` dimension index values to numeric type"
+            in my_caplog.text
+        )
+        assert model_data_factory.dataset.coords["bar"].dtype.kind == kind
+
+    @pytest.mark.parametrize(
+        "data", [["foo", 2], [1.0, "foo"], ["foo", "bar"], ["Y1", "Y2"]]
+    )
+    def test_update_numeric_dims_no_update(
+        self, my_caplog, model_data_factory: ModelDataFactory, data
+    ):
+        """Dimension data with at least one non-numeric entry is not coerced and no update is logged."""
+        new_idx = pd.Index(data, name="bar")
+        new_param = pd.DataFrame({"ts_data": [True, False]}, index=new_idx).to_xarray()
+        updated_ds = model_data_factory._update_numeric_dims(new_param, "foo")
+
+        assert (
+            "foo | Updating `bar` dimension index values to numeric type"
+            not in my_caplog.text
+        )
+        assert updated_ds.coords["bar"].dtype.kind not in ["f", "i"]
+
     @pytest.mark.parametrize(
         ["coords", "new_coords"],
         [(["foobar", "baz"], ["baz"]), (["bazfoo", "baz"], ["bazfoo", "baz"])],