Merge pull request #28 from ai4er-cdt/feature/graph-analysis-identifi…

…cation Feature/graph analysis identification
ai4er-cdt · Mar 3, 2021 · 7d937c4 · 7d937c4
2 parents de5470b + 29beff4
commit 7d937c4
Show file tree

Hide file tree

Showing 28 changed files with 6,465 additions and 103 deletions.
diff --git a/notebooks/exploratory/svm-10-optimise-identify-node.ipynb b/notebooks/exploratory/svm-10-optimise-identify-node.ipynb
diff --git a/notebooks/exploratory/svm-7-getting-to-know-graph.ipynb b/notebooks/exploratory/svm-7-getting-to-know-graph.ipynb
diff --git a/notebooks/exploratory/svm-8-generate-test-cases.ipynb b/notebooks/exploratory/svm-8-generate-test-cases.ipynb
diff --git a/src/data_loading/landcover_plot_utils.py b/src/data_loading/landcover_plot_utils.py
@@ -1,7 +1,12 @@
-"""A collection of utility functions for plotting landcover datasets"""
+"""A collection of utility functions for plotting landcover datasets."""
+from typing import Optional
+
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+from matplotlib.patches import Patch
 from numba import njit
+
 from src.constants import ESA_LANDCOVER_DIR
 
 
@@ -71,3 +76,73 @@ def classes_to_rgb(
             rgb_data[i, j, ...] = class_to_rgb[data[i, j]]
 
     return rgb_data
+
+
+def plot_landcover(
+    data: np.ndarray,
+    ax: Optional[plt.Axes] = None,
+    landcover_class_df: pd.DataFrame = ESA_CCI_CLASSES,
+    with_legend: bool = True,
+) -> None:
+    """
+    Plot array with landcover data with colors and legend.
+
+    Convenience function to plot landcover data with labels and RGB values in the
+    data`lddatar_class_df`. The format of that dataframe should be:
+        - Rows indexed by landcover class (int)
+        - Cdatacontaining the substring `label` must exist for plotting with legend
+        - Columns with names `R`, `G`, `B` must exist with integers corresponding to RGB
+          values for the landcover classes.
+
+    Args:
+        data (np.ndarray): The landcover data to plot. Values should be the landcover
+            classes.
+        ax (Optional[pldatas], optional): matplotlib.Axes object to add the plot to an
+            existing canvas. Defaults to None.
+        landcover_class_df (Optional[pd.DataFrame], optional): A dataframe containing
+            the RGB values to color each class with and a class label for each class.
+            Defaults to ESA_CCI_CLASSES.
+        with_legend (bool, optional): whether to add legend with class labels.
+            Defaults to True.
+    """
+
+    # Remove unused dimensions
+    data = data.squeeze()
+    # Check right format
+    assert data.ndim == 2, "`image` must be 2 dimensional"
+
+    # Create canvas to plot onto, if needed
+    if not ax:
+        _, ax = plt.subplots(figsize=(15, 15))
+
+    # Plot image
+    ax.imshow(
+        classes_to_rgb(data, class_to_rgb=_class_rgb_array_from_df(landcover_class_df))
+    )
+
+    # Remove axes
+    ax.set_yticks([])
+    ax.set_xticks([])
+
+    # Create legend
+    if with_legend:
+        # Automatically identify label column name
+        label_colname = landcover_class_df.columns[
+            landcover_class_df.columns.str.contains("label", case=False)
+        ][0]
+
+        # Filter only the landcover classes that appear in the image
+        present_classes = np.unique(data)
+
+        # Generate legend
+        legend_elements = []
+        for _, row in landcover_class_df.loc[present_classes].iterrows():
+            class_name = row[label_colname]
+            class_rgba = row[["R", "G", "B"]].values / 255.0  # plt wants values in 0-1
+            legend_elements.append(Patch(label=class_name, facecolor=class_rgba))
+        ax.legend(
+            handles=legend_elements,
+            bbox_to_anchor=(1.01, 1),  # places legend to the right of the image
+            loc="upper left",  # anchor for the bbox_to_anchor statement
+            prop={"size": 18},  # fontsize in legend
+        )
diff --git a/src/data_loading/rasterio_utils.py b/src/data_loading/rasterio_utils.py
@@ -115,9 +115,9 @@ def read_from_lat_lon(
 
 def polygonise(
     data_array: np.ndarray,
-    mask: Optional[np.ndarray],
-    transform: affine.Affine,
-    crs: Optional[str],
+    mask: Optional[np.ndarray] = None,
+    transform: affine.Affine = affine.identity,
+    crs: Optional[str] = None,
     connectivity: int = 4,
     apply_buffer: bool = True,
 ):
@@ -149,12 +149,12 @@ def polygonise(
     Returns:
         gpd.GeoDataFrame: GeoDataFrame containing polygon objects.
     """
-    poly_generator = shapes(
+    polygon_generator = shapes(
         data_array, mask=mask, connectivity=connectivity, transform=transform
     )
     results = list(
         {"properties": {"class_label": int(val)}, "geometry": shape}
-        for shape, val in poly_generator
+        for shape, val in polygon_generator
     )
     df = gpd.GeoDataFrame.from_features(results, crs=crs)
     if apply_buffer:

diff --git a/src/data_loading/test_data.py b/src/data_loading/test_data.py
@@ -43,7 +43,7 @@ def get_square_row(
     square_coords = [(0, 0), (1, 0), (1, 1), (0, 1), (0, 0)]
     # scaling up by square_side_len
     square_coords = [
-        tuple([coord * square_side_len for coord in coords]) for coords in square_coords
+        tuple(coord * square_side_len for coord in coords) for coords in square_coords
     ]
     # setting to origin coords
     square_coords = [[sum(x) for x in zip(coords, origin)] for coords in square_coords]

diff --git a/src/models/binary_graph_operations.py b/src/models/binary_graph_operations.py
@@ -0,0 +1,54 @@
+"""Contains tools for binary operations between GeoGraph objects."""
+from numpy import ndarray
+from src.models.polygon_utils import (
+    connect_with_interior_bulk,
+    connect_with_interior_or_edge_bulk,
+    connect_with_interior_or_edge_or_corner_bulk,
+)
+
+# For switching identifiction mode in `identify_node`
+_BULK_SPATIAL_IDENTIFICATION_FUNCTION = {
+    "corner": connect_with_interior_or_edge_or_corner_bulk,
+    "edge": connect_with_interior_or_edge_bulk,
+    "interior": connect_with_interior_bulk,
+}
+
+
+def identify_node(node: dict, other_graph: "GeoGraph", mode: str = "corner") -> ndarray:
+    """
+    Return list of all node ids in `other_graph` which identify with the given `node`.
+
+    Args:
+        node (dict): The node for which to find nodes in `other_graphs` that can be
+            identified with `node`.
+        other_graph (GeoGraph): The GeoGraph object in which to search for
+            identifications
+        mode (str, optional): Must be one of `corner`, `edge` or `interior`. Defaults
+            to "corner".
+            The different modes correspond to different rules for identification:
+            - corner: Polygons of the same `class_label` which overlap, touch in their
+                edges or corners will be identified with each other. (fastest)
+            - edge: Polygons of the same `class_label` which overlap or touch in their
+                edges will be identified with each other.
+            - interior: Polygons of the same `class_label` which overlap will be
+                identified with each other. Touching corners or edges are not counted.
+
+    Returns:
+        np.ndarray: List of node ids in `other_graph` which identify with `node`.
+    """
+    # Mode switch
+    assert mode in ["corner", "edge", "interior"]
+    have_valid_overlap = _BULK_SPATIAL_IDENTIFICATION_FUNCTION[mode]
+
+    # Get potential candidates for overlap
+    candidate_ids = other_graph.rtree.query(node["geometry"], sort=True)
+    # Filter candidates according to the same class label
+    candidate_ids = candidate_ids[
+        other_graph.class_label[candidate_ids] == node["class_label"]
+    ]
+    # Filter candidates accroding to correct spatial overlap
+    candidate_ids = candidate_ids[
+        have_valid_overlap(node["geometry"], other_graph.geometry[candidate_ids])
+    ]
+
+    return candidate_ids
diff --git a/src/models/geograph.py b/src/models/geograph.py
@@ -3,6 +3,8 @@
 
 See https://networkx.org/documentation/stable/index.html for graph operations.
 """
+from __future__ import annotations
+
 import bz2
 import gzip
 import os
@@ -20,7 +22,8 @@
 import rasterio
 import shapely
 from shapely.prepared import prep
-from src.data_loading.rasterio_utils import polygonise
+from src.data_loading import rasterio_utils
+from src.models import binary_graph_operations
 from tqdm import tqdm
 
 VALID_EXTENSIONS = (
@@ -52,7 +55,7 @@ class GeoGraph:
     def __init__(
         self,
         data,
-        crs: Union[str, pyproj.CRS],
+        crs: Optional[Union[str, pyproj.CRS]] = None,
         graph_save_path: Optional[Union[str, os.PathLike]] = None,
         raster_save_path: Optional[Union[str, os.PathLike]] = None,
         columns_to_rename: Optional[Dict[str, str]] = None,
@@ -116,7 +119,7 @@ def __init__(
         super().__init__()
         self.graph = nx.Graph()
         self._habitats: Dict[str, Habitat] = {}
-        self._crs: Union[str, pyproj.CRS] = crs
+        self._crs: Optional[Union[str, pyproj.CRS]] = crs
         self._columns_to_rename: Optional[Dict[str, str]] = columns_to_rename
         self._tolerance: float = tolerance
 
@@ -193,6 +196,22 @@ def crs(self):
         """Return crs of dataframe."""
         return self.df.crs
 
+    @property
+    def class_label(self):
+        """Return class label of nodes directly from underlying numpy array.
+
+        Note: Uses `iloc` type indexing.
+        """
+        return self.df.class_label.values
+
+    @property
+    def geometry(self):
+        """Return geometry of nodes from underlying numpy array.
+
+        Note: Uses `iloc` type indexing.
+        """
+        return self.df.geometry.values
+
     def _load_from_vector_path(
         self,
         vector_path: pathlib.Path,
@@ -269,12 +288,13 @@ def _load_from_raster(
         Returns:
             gpd.GeoDataFrame: The dataframe containing polygon objects.
         """
-        vector_df = polygonise(data_array=data_array, **raster_kwargs)
+        vector_df = rasterio_utils.polygonise(data_array=data_array, **raster_kwargs)
         if save_path is not None:
             if save_path.suffix == ".gpkg":
                 vector_df.to_file(save_path, driver="GPKG")
             else:
                 vector_df.to_file(save_path, driver="ESRI Shapefile")
+            save_path.chmod(0o664)
         return self._load_from_dataframe(vector_df, tolerance=self._tolerance)
 
     def _load_from_graph_path(self, graph_path: pathlib.Path) -> gpd.GeoDataFrame:
@@ -319,6 +339,7 @@ def _save_graph(self, save_path: pathlib.Path) -> None:
         else:
             with open(save_path, "wb") as file:
                 pickle.dump(data, file)
+        save_path.chmod(0o664)
 
     def _load_from_dataframe(
         self,
@@ -403,6 +424,8 @@ def _load_from_dataframe(
                 if polygon_id != neighbour_id:
                     self.graph.add_edge(polygon_id, neighbour_id)
 
+        # add index name
+        df.index.name = "node_index"
         return df
 
     def merge_nodes(
@@ -502,17 +525,26 @@ def add_habitat(
         # Remove all edges in the graph, then at the end we only have edges
         # between nodes less than `max_travel_distance` apart
         hgraph.clear_edges()
-        # Get lists of polygons and buff polygons to avoid repeatedly querying
-        # the dataframe
-        polygons: List[shapely.Polygon] = self.df.geometry.tolist()
+        # Get dict to convert between iloc indexes and loc indexes
+        # These are different only if nodes have been removed from the df
+        idx_dict: Dict[int, int] = dict(zip(range(len(self.df)), self.df.index.values))
+        # Get dicts of polygons and buff polygons to avoid repeatedly querying
+        # the dataframe. These dicts accept loc indexes
+        polygons: Dict[int, shapely.Polygon] = self.df.geometry.to_dict()
         if max_travel_distance > 0:
             # Vectorised buffer on the entire df to calculate the expanded polygons
             # used to get intersections.
-            buff_polygons = self.df.geometry.buffer(max_travel_distance).tolist()
+            buff_polygons = self.df.geometry.buffer(max_travel_distance).to_dict()
         # Remove non-habitat nodes from habitat graph
-        invalid_idx = set(
-            self.df.loc[~self.df["class_label"].isin(set(valid_classes))].index
-        )
+        # np.where is very fast here and gets the iloc based indexes
+        # Combining it with the set comprehension reduces time by an order of
+        # magnitude compared to `set(self.df.loc()`
+        invalid_idx = {
+            idx_dict[i]
+            for i in np.where(~self.df["class_label"].isin(set(valid_classes)).values)[
+                0
+            ]
+        }
         hgraph.remove_nodes_from(invalid_idx)
 
         for node in tqdm(
@@ -527,16 +559,20 @@ def add_habitat(
                 buff_poly = prep(polygon)
             # Query rtree for all polygons within `max_travel_distance` of the original
             for nbr in self.rtree.intersection(buff_poly_bounds):
+                # Necessary to correct for the rtree returning iloc indexes
+                nbr = idx_dict[nbr]
                 # If a node is not a habitat class node, don't add the edge
-                if nbr in invalid_idx:
+                if nbr == node or nbr in invalid_idx:
                     continue
                 # Otherwise add the edge with distance attribute
                 nbr_polygon = polygons[nbr]
                 if not hgraph.has_edge(node, nbr) and buff_poly.intersects(nbr_polygon):
                     if add_distance:
-                        hgraph.add_edge(
-                            node, nbr, distance=polygon.distance(nbr_polygon)
-                        )
+                        if max_travel_distance == 0:
+                            dist = 0.0
+                        else:
+                            dist = polygon.distance(nbr_polygon)
+                        hgraph.add_edge(node, nbr, distance=dist)
                     else:
                         hgraph.add_edge(node, nbr)
         # Add habitat to habitats dict
@@ -586,6 +622,13 @@ def get_graph_components(
             graph components.
         """
         components: List[set] = list(nx.connected_components(graph))
-        geom = [self.df.geometry.iloc[list(comp)].unary_union for comp in components]
+        geom = [self.df.geometry.loc[comp].unary_union for comp in components]
         gdf = gpd.GeoDataFrame({"geometry": geom}, crs=self.df.crs)
         return gdf, components
+
+    def identify_node(
+        self, node_id: int, other_graph: GeoGraph, mode: str
+    ) -> List[int]:
+        return binary_graph_operations.identify_node(
+            self.df.loc[node_id], other_graph=other_graph, mode=mode
+        ).tolist()