
Feature/graph analysis identification #28

Merged

33 commits
449c44f
Merge branch 'feature/graph-analysis-loading' of github.com:ai4er-cdt…
Croydon-Brixton Feb 19, 2021
0e51cd6
exp: explore node identification and graph
Croydon-Brixton Feb 22, 2021
2f4f78c
feature: add convenience func to plot landcover
Croydon-Brixton Feb 22, 2021
1b757ea
Merge branch 'feature/graph-analysis' of github.com:ai4er-cdt/gtc-bio…
Croydon-Brixton Feb 23, 2021
03429a4
fix: add default values for polygonize
Croydon-Brixton Feb 23, 2021
b4a7f4b
Merge branch 'feature/graph-analysis-habitat' of github.com:ai4er-cdt…
Croydon-Brixton Feb 23, 2021
a376541
feature: add test data for node identification
Croydon-Brixton Feb 24, 2021
29d0340
feature: add utils for creating GeoGraph test data
Croydon-Brixton Feb 24, 2021
ec51cc8
feature: add polygon utils for overlap computations
Croydon-Brixton Feb 24, 2021
37a5f9a
feature: add node identification functionality
Croydon-Brixton Feb 24, 2021
cdf5e10
refactor: remove deprecated graph_tools
Croydon-Brixton Feb 24, 2021
d905ebd
feature: add node identification method to GeoGraph
Croydon-Brixton Feb 24, 2021
8f40d9e
feature: add test data for node identification
Croydon-Brixton Feb 24, 2021
75081b5
exp: add node identification demo and dev notebook
Croydon-Brixton Feb 24, 2021
8d585b1
lint: fix lint by turning unnecessary list into generator
Croydon-Brixton Feb 24, 2021
4cda25c
Merge branch 'feature/graph-analysis' of github.com:ai4er-cdt/gtc-bio…
Croydon-Brixton Feb 26, 2021
d9586e7
exp: simplified notebooks
Croydon-Brixton Feb 26, 2021
776d67c
Merge branch 'feature/graph-analysis' of github.com:ai4er-cdt/gtc-bio…
Croydon-Brixton Feb 28, 2021
878e3c0
refactor: refactor identify node method for use with underlying geopa…
Croydon-Brixton Mar 1, 2021
d223923
feature: add bulk node identification for new dataframe interface
Croydon-Brixton Mar 1, 2021
79bd345
feature: add node_id sorting for node identification
Croydon-Brixton Mar 1, 2021
acc7874
fix: fix self referential nodes in add_habitat
Croydon-Brixton Mar 1, 2021
2d8b943
Apply suggestions from code review
Croydon-Brixton Mar 3, 2021
6803470
refactor: rename poly -> polygon
Croydon-Brixton Mar 3, 2021
a8af146
refactor: create property geometry in favor of _geometry, add module …
Croydon-Brixton Mar 3, 2021
52e5378
fix: correct polygon_utils type hints
Croydon-Brixton Mar 3, 2021
636bce4
refactor: isort
Croydon-Brixton Mar 3, 2021
b1197bf
cleanup: remove old identify_nodes function
Croydon-Brixton Mar 3, 2021
308a7e9
fix: Rtree indexing and minor improvements
herbiebradley Mar 3, 2021
359c710
lint: black
herbiebradley Mar 3, 2021
7a99470
fix: add_habitat bug and perf. improvement
herbiebradley Mar 3, 2021
1d54eaa
fix: inaccurate comment
herbiebradley Mar 3, 2021
29beff4
fix: return List from `identify_node`
herbiebradley Mar 3, 2021
1,409 changes: 1,409 additions & 0 deletions notebooks/exploratory/svm-10-optimise-identify-node.ipynb

Large diffs are not rendered by default.

1,976 changes: 1,976 additions & 0 deletions notebooks/exploratory/svm-7-getting-to-know-graph.ipynb

Large diffs are not rendered by default.

2,488 changes: 2,488 additions & 0 deletions notebooks/exploratory/svm-8-generate-test-cases.ipynb

Large diffs are not rendered by default.

77 changes: 76 additions & 1 deletion src/data_loading/landcover_plot_utils.py
@@ -1,7 +1,12 @@
"""A collection of utility functions for plotting landcover datasets"""
"""A collection of utility functions for plotting landcover datasets."""
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.patches import Patch
from numba import njit

from src.constants import ESA_LANDCOVER_DIR


@@ -71,3 +76,73 @@ def classes_to_rgb(
            rgb_data[i, j, ...] = class_to_rgb[data[i, j]]

    return rgb_data


def plot_landcover(
    data: np.ndarray,
    ax: Optional[plt.Axes] = None,
    landcover_class_df: pd.DataFrame = ESA_CCI_CLASSES,
    with_legend: bool = True,
) -> None:
    """
    Plot an array of landcover data with colors and legend.

    Convenience function to plot landcover data with the labels and RGB values in
    the given `landcover_class_df`. The format of that dataframe should be:
    - Rows indexed by landcover class (int)
    - A column whose name contains the substring `label` must exist for plotting
      with legend
    - Columns with names `R`, `G`, `B` must exist with integers corresponding to RGB
      values for the landcover classes.

    Args:
        data (np.ndarray): The landcover data to plot. Values should be the landcover
            classes.
        ax (Optional[plt.Axes], optional): matplotlib.Axes object to add the plot to
            an existing canvas. Defaults to None.
        landcover_class_df (Optional[pd.DataFrame], optional): A dataframe containing
            the RGB values to color each class with and a class label for each class.
            Defaults to ESA_CCI_CLASSES.
        with_legend (bool, optional): whether to add legend with class labels.
            Defaults to True.
    """
    # Remove unused dimensions
    data = data.squeeze()
    # Check right format
    assert data.ndim == 2, "`data` must be 2 dimensional"

    # Create canvas to plot onto, if needed
    if not ax:
        _, ax = plt.subplots(figsize=(15, 15))

    # Plot image
    ax.imshow(
        classes_to_rgb(data, class_to_rgb=_class_rgb_array_from_df(landcover_class_df))
    )

    # Remove axes
    ax.set_yticks([])
    ax.set_xticks([])

    # Create legend
    if with_legend:
        # Automatically identify label column name
        label_colname = landcover_class_df.columns[
            landcover_class_df.columns.str.contains("label", case=False)
        ][0]

        # Filter only the landcover classes that appear in the image
        present_classes = np.unique(data)

        # Generate legend
        legend_elements = []
        for _, row in landcover_class_df.loc[present_classes].iterrows():
            class_name = row[label_colname]
            class_rgba = row[["R", "G", "B"]].values / 255.0  # plt wants values in 0-1
            legend_elements.append(Patch(label=class_name, facecolor=class_rgba))
        ax.legend(
            handles=legend_elements,
            bbox_to_anchor=(1.01, 1),  # places legend to the right of the image
            loc="upper left",  # anchor for the bbox_to_anchor statement
            prop={"size": 18},  # fontsize in legend
        )
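
For orientation, a minimal usage sketch of the new function (not part of the diff; the class table below is made up and merely follows the documented format, standing in for ESA_CCI_CLASSES):

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from src.data_loading.landcover_plot_utils import plot_landcover

# Hypothetical class table: rows indexed by class id, a column containing
# "label", and integer R/G/B columns, as the docstring requires.
class_df = pd.DataFrame(
    {
        "label": ["water", "forest", "urban"],
        "R": [0, 34, 128],
        "G": [70, 139, 128],
        "B": [255, 34, 128],
    },
    index=[10, 50, 190],
)
data = np.array([[10, 10, 50], [50, 190, 10]])  # tiny raster of class values
plot_landcover(data, landcover_class_df=class_df)
plt.show()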
10 changes: 5 additions & 5 deletions src/data_loading/rasterio_utils.py
@@ -115,9 +115,9 @@ def read_from_lat_lon(

def polygonise(
    data_array: np.ndarray,
    mask: Optional[np.ndarray],
    transform: affine.Affine,
    crs: Optional[str],
    mask: Optional[np.ndarray] = None,
    transform: affine.Affine = affine.identity,
    crs: Optional[str] = None,
    connectivity: int = 4,
    apply_buffer: bool = True,
):
@@ -149,12 +149,12 @@ def polygonise(
    Returns:
        gpd.GeoDataFrame: GeoDataFrame containing polygon objects.
    """
    poly_generator = shapes(
    polygon_generator = shapes(
        data_array, mask=mask, connectivity=connectivity, transform=transform
    )
    results = list(
        {"properties": {"class_label": int(val)}, "geometry": shape}
        for shape, val in poly_generator
        for shape, val in polygon_generator
    )
    df = gpd.GeoDataFrame.from_features(results, crs=crs)
    if apply_buffer:
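With the new defaults, the simplest call is self-contained; a small sketch (not from the PR, array values made up) of what that enables:

import numpy as np

from src.data_loading.rasterio_utils import polygonise

# A tiny two-class raster. No mask, transform or CRS is needed any more;
# the identity transform keeps polygon coordinates in pixel space.
data = np.array([[1, 1, 2],
                 [1, 2, 2]], dtype=np.int32)
gdf = polygonise(data)
print(gdf[["class_label", "geometry"]])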
2 changes: 1 addition & 1 deletion src/data_loading/test_data.py
@@ -43,7 +43,7 @@ def get_square_row(
    square_coords = [(0, 0), (1, 0), (1, 1), (0, 1), (0, 0)]
    # scaling up by square_side_len
    square_coords = [
        tuple([coord * square_side_len for coord in coords]) for coords in square_coords
        tuple(coord * square_side_len for coord in coords) for coords in square_coords
    ]
    # setting to origin coords
    square_coords = [[sum(x) for x in zip(coords, origin)] for coords in square_coords]
54 changes: 54 additions & 0 deletions src/models/binary_graph_operations.py
@@ -0,0 +1,54 @@
"""Contains tools for binary operations between GeoGraph objects."""
from numpy import ndarray
from src.models.polygon_utils import (
connect_with_interior_bulk,
connect_with_interior_or_edge_bulk,
connect_with_interior_or_edge_or_corner_bulk,
)

# For switching identifiction mode in `identify_node`
_BULK_SPATIAL_IDENTIFICATION_FUNCTION = {
"corner": connect_with_interior_or_edge_or_corner_bulk,
"edge": connect_with_interior_or_edge_bulk,
"interior": connect_with_interior_bulk,
}


def identify_node(node: dict, other_graph: "GeoGraph", mode: str = "corner") -> ndarray:
    """
    Return list of all node ids in `other_graph` which identify with the given `node`.

    Args:
        node (dict): The node for which to find nodes in `other_graph` that can be
            identified with `node`.
        other_graph (GeoGraph): The GeoGraph object in which to search for
            identifications.
        mode (str, optional): Must be one of `corner`, `edge` or `interior`. Defaults
            to "corner". The different modes correspond to different rules for
            identification:
            - corner: Polygons of the same `class_label` which overlap, or touch in
              their edges or corners, will be identified with each other. (fastest)
            - edge: Polygons of the same `class_label` which overlap or touch in their
              edges will be identified with each other.
            - interior: Polygons of the same `class_label` which overlap will be
              identified with each other. Touching corners or edges are not counted.

    Returns:
        np.ndarray: List of node ids in `other_graph` which identify with `node`.
    """
    # Mode switch
    assert mode in ["corner", "edge", "interior"]
    have_valid_overlap = _BULK_SPATIAL_IDENTIFICATION_FUNCTION[mode]

    # Get potential candidates for overlap
    candidate_ids = other_graph.rtree.query(node["geometry"], sort=True)
    # Filter candidates according to the same class label
    candidate_ids = candidate_ids[
        other_graph.class_label[candidate_ids] == node["class_label"]
    ]
    # Filter candidates according to correct spatial overlap
    candidate_ids = candidate_ids[
        have_valid_overlap(node["geometry"], other_graph.geometry[candidate_ids])
    ]

Member:

Intuitively it should be slightly faster to combine these two filters into one, right?

Collaborator (Author):

Yes, I also think that if numpy does short-circuit evaluation on these things this should be faster. @herbiebradley I don't know if it does, do you?

Concretely: do you know how numpy handles this type of case? Take `select_from_array[np.logical_or(condition_array1, condition_array2)]`. Does it first evaluate both `condition_array1` and `condition_array2` in the slice `[...]` and then `or` the conditions (in which case it would probably be slower, because we would calculate the geometry overlaps for shapes which won't agree in class label)? Or does it calculate the first element of `condition_array1` and then short-circuit to decide whether that element of `condition_array2` even needs to be calculated (in which case I think it should be slightly faster)?

Member:

I'm not sure about `np.logical_or`, but if you use base Python `or` and a generator to select, then it should short-circuit and may be faster. There are some interesting timing results on selecting from boolean arrays here: https://stackoverflow.com/questions/58422690/filtering-a-numpy-array-what-is-the-best-approach
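
For reference, there is no element-wise short-circuiting to be had from np.logical_or or np.logical_and: Python evaluates both call arguments in full before the function runs. A minimal standalone check (not part of this PR) demonstrating this, alongside the sequential-filter pattern `identify_node` uses instead:

import numpy as np

values = np.arange(6)

def cheap_mask(x):
    print("cheap mask evaluated")
    return x % 2 == 0

def expensive_mask(x):
    print("expensive mask evaluated")
    return x > 3

# Both prints fire: the two masks are computed in full before
# np.logical_and is ever applied, so nothing short-circuits.
combined = values[np.logical_and(cheap_mask(values), expensive_mask(values))]

# Sequential filtering, as in `identify_node` above, runs the expensive
# test only on the survivors of the cheap one.
survivors = values[cheap_mask(values)]
sequential = survivors[expensive_mask(survivors)]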

    return candidate_ids
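
The three modes correspond to progressively stricter spatial relations. The PR's connect_with_*_bulk helpers live in src.models.polygon_utils and are not shown in this diff; the following standalone shapely sketch merely illustrates the distinction the docstring describes, using the DE-9IM relation string:

from shapely.geometry import box

a = box(0, 0, 1, 1)
cases = {
    "corner-touch": box(1, 1, 2, 2),  # shares only the point (1, 1)
    "edge-touch": box(1, 0, 2, 1),  # shares the segment x = 1
    "overlap": box(0.5, 0, 1.5, 1),  # interiors intersect
}
for name, b in cases.items():
    matrix = a.relate(b)  # DE-9IM matrix as a 9-character string
    interior = matrix[0] != "F"  # interiors intersect
    edge = interior or matrix[4] == "1"  # ...or boundaries share a 1-D edge
    corner = a.intersects(b)  # any contact at all counts
    print(f"{name}: interior={interior}, edge={edge}, corner={corner}")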
73 changes: 58 additions & 15 deletions src/models/geograph.py
@@ -3,6 +3,8 @@

See https://networkx.org/documentation/stable/index.html for graph operations.
"""
from __future__ import annotations

import bz2
import gzip
import os
@@ -20,7 +22,8 @@
import rasterio
import shapely
from shapely.prepared import prep
from src.data_loading.rasterio_utils import polygonise
from src.data_loading import rasterio_utils
from src.models import binary_graph_operations
from tqdm import tqdm

VALID_EXTENSIONS = (
@@ -52,7 +55,7 @@ class GeoGraph:
    def __init__(
        self,
        data,
        crs: Union[str, pyproj.CRS],
        crs: Optional[Union[str, pyproj.CRS]] = None,
        graph_save_path: Optional[Union[str, os.PathLike]] = None,
        raster_save_path: Optional[Union[str, os.PathLike]] = None,
        columns_to_rename: Optional[Dict[str, str]] = None,
@@ -116,7 +119,7 @@ def __init__(
        super().__init__()
        self.graph = nx.Graph()
        self._habitats: Dict[str, Habitat] = {}
        self._crs: Union[str, pyproj.CRS] = crs
        self._crs: Optional[Union[str, pyproj.CRS]] = crs
        self._columns_to_rename: Optional[Dict[str, str]] = columns_to_rename
        self._tolerance: float = tolerance

@@ -193,6 +196,22 @@ def crs(self):
"""Return crs of dataframe."""
return self.df.crs

@property
def class_label(self):
"""Return class label of nodes directly from underlying numpy array.

Note: Uses `iloc` type indexing.
"""
return self.df.class_label.values

@property
def geometry(self):
"""Return geometry of nodes from underlying numpy array.

Note: Uses `iloc` type indexing.
"""
return self.df.geometry.values

    def _load_from_vector_path(
        self,
        vector_path: pathlib.Path,
@@ -269,12 +288,13 @@ def _load_from_raster(
        Returns:
            gpd.GeoDataFrame: The dataframe containing polygon objects.
        """
        vector_df = polygonise(data_array=data_array, **raster_kwargs)
        vector_df = rasterio_utils.polygonise(data_array=data_array, **raster_kwargs)
        if save_path is not None:
            if save_path.suffix == ".gpkg":
                vector_df.to_file(save_path, driver="GPKG")
            else:
                vector_df.to_file(save_path, driver="ESRI Shapefile")
            save_path.chmod(0o664)
        return self._load_from_dataframe(vector_df, tolerance=self._tolerance)

    def _load_from_graph_path(self, graph_path: pathlib.Path) -> gpd.GeoDataFrame:
@@ -319,6 +339,7 @@ def _save_graph(self, save_path: pathlib.Path) -> None:
        else:
            with open(save_path, "wb") as file:
                pickle.dump(data, file)
        save_path.chmod(0o664)

    def _load_from_dataframe(
        self,
@@ -403,6 +424,8 @@ def _load_from_dataframe(
                if polygon_id != neighbour_id:
                    self.graph.add_edge(polygon_id, neighbour_id)

        # add index name
        df.index.name = "node_index"
        return df

    def merge_nodes(
@@ -502,17 +525,26 @@ def add_habitat(
        # Remove all edges in the graph, then at the end we only have edges
        # between nodes less than `max_travel_distance` apart
        hgraph.clear_edges()
        # Get dict to convert between iloc indexes and loc indexes
        # These are different only if nodes have been removed from the df
        idx_dict: Dict[int, int] = dict(zip(range(len(self.df)), self.df.index.values))
        # Get lists of polygons and buff polygons to avoid repeatedly querying
        # the dataframe
        polygons: List[shapely.Polygon] = self.df.geometry.tolist()
        # the dataframe. These lists accept loc indexes
        polygons: Dict[int, shapely.Polygon] = self.df.geometry.to_dict()
        if max_travel_distance > 0:
            # Vectorised buffer on the entire df to calculate the expanded polygons
            # used to get intersections.
            buff_polygons = self.df.geometry.buffer(max_travel_distance).tolist()
            buff_polygons = self.df.geometry.buffer(max_travel_distance).to_dict()
        # Remove non-habitat nodes from habitat graph
        invalid_idx = set(
            self.df.loc[~self.df["class_label"].isin(set(valid_classes))].index
        )
        # np.where is very fast here and gets the iloc based indexes
        # Combining it with the set comprehension reduces time by an order of
        # magnitude compared to `set(self.df.loc()`
        invalid_idx = {
            idx_dict[i]
            for i in np.where(~self.df["class_label"].isin(set(valid_classes)).values)[
                0
            ]
        }
        hgraph.remove_nodes_from(invalid_idx)

        for node in tqdm(
@@ -527,16 +559,20 @@
            buff_poly = prep(polygon)
            # Query rtree for all polygons within `max_travel_distance` of the original
            for nbr in self.rtree.intersection(buff_poly_bounds):
                # Necessary to correct for the rtree returning iloc indexes
                nbr = idx_dict[nbr]
                # If a node is not a habitat class node, don't add the edge
                if nbr in invalid_idx:
                if nbr == node or nbr in invalid_idx:
                    continue
                # Otherwise add the edge with distance attribute
                nbr_polygon = polygons[nbr]
                if not hgraph.has_edge(node, nbr) and buff_poly.intersects(nbr_polygon):
                    if add_distance:
                        hgraph.add_edge(
                            node, nbr, distance=polygon.distance(nbr_polygon)
                        )
                        if max_travel_distance == 0:
                            dist = 0.0
                        else:
                            dist = polygon.distance(nbr_polygon)
                        hgraph.add_edge(node, nbr, distance=dist)
                    else:
                        hgraph.add_edge(node, nbr)
        # Add habitat to habitats dict
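
To see why the iloc-to-loc translation above is needed, a toy illustration (made-up dataframe) of how positional and label indexes diverge once rows have been dropped:

import pandas as pd

df = pd.DataFrame({"class_label": [1, 2, 1]})  # loc labels 0, 1, 2
df = df.drop(index=1)  # positional (iloc) and label (loc) indexes now differ

# Same construction as in add_habitat: position -> label
idx_dict = dict(zip(range(len(df)), df.index.values))
print(idx_dict)  # {0: 0, 1: 2}; rtree and np.where return positions,
                 # while graph nodes are keyed by labels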
@@ -586,6 +622,13 @@ def get_graph_components(
            graph components.
        """
        components: List[set] = list(nx.connected_components(graph))
        geom = [self.df.geometry.iloc[list(comp)].unary_union for comp in components]
        geom = [self.df.geometry.loc[comp].unary_union for comp in components]
        gdf = gpd.GeoDataFrame({"geometry": geom}, crs=self.df.crs)
        return gdf, components

    def identify_node(
        self, node_id: int, other_graph: GeoGraph, mode: str
    ) -> List[int]:
        return binary_graph_operations.identify_node(
            self.df.loc[node_id], other_graph=other_graph, mode=mode
        )
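
Taken together with binary_graph_operations above, the method supports cross-graph comparisons along these lines (a hypothetical sketch: the file names, node id and GeoDataFrame-based construction are assumptions, not taken from this diff):

import geopandas as gpd

from src.models.geograph import GeoGraph

# Two GeoGraphs of the same landscape at different times (made-up files)
graph_2000 = GeoGraph(gpd.read_file("landcover_2000.gpkg"))
graph_2020 = GeoGraph(gpd.read_file("landcover_2020.gpkg"))

# Node ids in graph_2020 identified with node 0 of graph_2000, using the
# strictest rule: interiors must genuinely overlap.
matches = graph_2000.identify_node(0, other_graph=graph_2020, mode="interior")
print(matches)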