Skip to content

Commit

Permalink
Coerce model dimension dtypes to numeric (#575)
Browse files Browse the repository at this point in the history
  • Loading branch information
brynpickering committed Feb 29, 2024
1 parent 0240bcc commit 41f9060
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 6 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ Both the index and the values of the timeseries (both being date strings) should

|changed| `inbuilt` math -> `pre-defined` math and `custom` math -> `user-defined` math in the documentation.

|fixed| Dimensions with numeric data can be defined in tabular data _or_ YAML and will appear as numeric in the processed Calliope model input dataset.
If all dimension data can be coerced to a numeric data type (e.g. `["10", 100, "-1"]`), then it _will_ be coerced (e.g., `[10, 100, -1]`).

## 0.7.0.dev2 (2024-01-26)

v0.7 includes a major change to how Calliope internally operates.
Expand Down
5 changes: 5 additions & 0 deletions docs/creating/data_sources.md
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,11 @@ E.g.,
data_sources:
...
```
6. After loading dimension data, we process it according to a limited set of heuristics:
1. We assume any dimension with the suffix `steps` (e.g., `timesteps`, `monthsteps`) is timeseries data, and attempt to convert the data type of the dimension values accordingly.
2. We will attempt to convert dimension data to numeric values.
Therefore, dimensions with the data `[1, 2]`, `["1", "2"]`, `[1, "2"]`, and `["1.0", 2.0]` will all be converted to having a numeric data type (integer or float).
`["foo", "1"]` and `["foo", 1]` will _not_ be converted, as not all dimension data entries are convertible to numeric data types.

### Data you _cannot_ load in tabular format

Expand Down
40 changes: 36 additions & 4 deletions src/calliope/preprocess/model_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -695,18 +695,23 @@ def _links_to_node_format(self, active_node_dict: AttrDict) -> AttrDict:

return link_tech_dict

def _add_to_dataset(self, to_add: xr.Dataset, id_: str):
    """Add new data to the central class dataset.

    Before being added, dimension values are coerced to a numeric dtype wherever
    every value of the dimension allows it, and any dimensions with the `steps`
    suffix will be cast to datetime dtype.

    Args:
        to_add (xr.Dataset): Dataset to merge into the central dataset.
        id_ (str): ID of dataset being added, to use in log messages
    """
    # Numeric coercion runs first; the timeseries conversion then operates on its output.
    to_add_numeric_dims = self._update_numeric_dims(to_add, id_)
    to_add_numeric_ts_dims = time.timeseries_to_datetime(
        to_add_numeric_dims, self.config["time_format"], id_
    )
    # The new data is listed first so that, with `compat="override"`, its variables
    # are the ones picked on a name clash; `fillna` then fills any remaining gaps
    # from the dataset as it was before the merge.
    self.dataset = xr.merge(
        [to_add_numeric_ts_dims, self.dataset],
        combine_attrs="no_conflicts",
        compat="override",
    ).fillna(self.dataset)

def _log_param_updates(self, param_name: str, param_da: xr.DataArray):
Expand Down Expand Up @@ -747,6 +752,33 @@ def _update_one_way_links(node_from_data: dict, node_to_data: dict):
node_from_data.pop("carrier_out") # cannot import carriers at the `from` node
node_to_data.pop("carrier_in") # cannot export carrier at the `to` node

@staticmethod
def _update_numeric_dims(ds: xr.Dataset, id_: str) -> xr.Dataset:
    """Try coercing all dimension data of the input dataset to a numeric data type.

    Any dimension where _all_ its data is potentially numeric has its index
    replaced by the numeric equivalent (e.g. `["1", 2]` becomes `[1, 2]`).
    All other dimensions are left as they were in the input dataset.
    Dimensions whose index already has a datetime or timedelta dtype are left
    untouched: `pd.to_numeric` would otherwise turn them into integer
    nanosecond counts.
    No changes are made to data variables in the dataset.

    Args:
        ds (xr.Dataset): Dataset possibly containing numeric dimensions.
            Its coordinates are updated in place.
        id_ (str): Identifier for `ds` to use in logging.

    Returns:
        xr.Dataset: Input `ds` with numeric coordinates.
    """
    # Snapshot the dimension names so that reassigning coordinates inside the
    # loop cannot interfere with the iteration.
    for dim_name in list(ds.dims):
        index = ds.coords[dim_name].to_index()

        # Datetime-like ("M") and timedelta-like ("m") indexes are always
        # "convertible" by pandas, but only as raw int64 values.
        if index.dtype.kind in {"m", "M"}:
            continue

        try:
            numeric_index = pd.to_numeric(index)
        except (ValueError, TypeError):
            # ValueError: at least one entry is a non-numeric string.
            # TypeError: entries are objects pandas cannot parse at all (e.g. tuples).
            continue

        ds.coords[dim_name] = numeric_index
        LOGGER.debug(
            f"{id_} | Updating `{dim_name}` dimension index values to numeric type."
        )

    return ds

def _raise_error_on_transmission_tech_def(
self, tech_def_dict: AttrDict, node_name: str
):
Expand Down
59 changes: 57 additions & 2 deletions tests/test_preprocess_model_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -702,12 +702,67 @@ def test_add_to_dataset_no_timeseries(
new_param = simple_da.copy().to_dataset(name="non_ts_data")
model_data_factory._add_to_dataset(new_param, "foo")

assert "foo | Updating" not in my_caplog.text
assert "datetime format" not in my_caplog.text
assert "dimension index values to datetime format" not in my_caplog.text
# make sure nothing has changed in the array
assert "non_ts_data" in model_data_factory.dataset
assert model_data_factory.dataset["non_ts_data"].equals(simple_da)

@pytest.mark.parametrize(
    ("data", "kind"),
    [
        ([1, 2], "i"),
        (["1", "2"], "i"),
        (["1", 2], "i"),
        ([1, "2"], "i"),
        ([1.0, 2.0], "f"),
        (["1.0", "2.0"], "f"),
        ([1, "2.0"], "f"),
        (["1", 2.0], "f"),
    ],
)
def test_update_numeric_dims(
    self, my_caplog, model_data_factory: ModelDataFactory, data, kind
):
    """Fully numeric-compatible dimension values are coerced and the change is logged."""
    frame = pd.DataFrame(
        {"my_data": [True, False]}, index=pd.Index(data, name="bar")
    )
    coerced = model_data_factory._update_numeric_dims(frame.to_xarray(), "foo")

    expected_log = "foo | Updating `bar` dimension index values to numeric type"
    assert expected_log in my_caplog.text
    # "i" -> integer index, "f" -> float index
    assert coerced.coords["bar"].dtype.kind == kind

@pytest.mark.parametrize(("data", "kind"), [(["1", 2], "i"), ([1.0, "2.0"], "f")])
def test_update_numeric_dims_in_model_data(
    self, my_caplog, model_data_factory: ModelDataFactory, data, kind
):
    """Numeric coercion is applied when data is added via `_add_to_dataset`."""
    frame = pd.DataFrame(
        {"num_data": [True, False]}, index=pd.Index(data, name="bar")
    )
    model_data_factory._add_to_dataset(frame.to_xarray(), "foo")

    expected_log = "foo | Updating `bar` dimension index values to numeric type"
    assert expected_log in my_caplog.text
    # The coerced dtype must be visible on the factory's central dataset.
    assert model_data_factory.dataset.coords["bar"].dtype.kind == kind

@pytest.mark.parametrize(
    "data", [["foo", 2], [1.0, "foo"], ["foo", "bar"], ["Y1", "Y2"]]
)
def test_update_numeric_dims_no_update(
    self, my_caplog, model_data_factory: ModelDataFactory, data
):
    """Dimensions with any non-numeric entry are left alone and nothing is logged."""
    frame = pd.DataFrame(
        {"ts_data": [True, False]}, index=pd.Index(data, name="bar")
    )
    untouched = model_data_factory._update_numeric_dims(frame.to_xarray(), "foo")

    unexpected_log = "foo | Updating `bar` dimension index values to numeric type"
    assert unexpected_log not in my_caplog.text
    # Neither float nor integer: the index kept its original dtype.
    assert untouched.coords["bar"].dtype.kind not in ["f", "i"]

@pytest.mark.parametrize(
["coords", "new_coords"],
[(["foobar", "baz"], ["baz"]), (["bazfoo", "baz"], ["bazfoo", "baz"])],
Expand Down

0 comments on commit 41f9060

Please sign in to comment.