Commit 660dc16: Added waterbody extract, added more flowline join utils
brendan-ward committed Dec 5, 2019 (1 parent: 014c1d5)
Showing 4 changed files with 168 additions and 54 deletions.
17 changes: 17 additions & 0 deletions nhdnet/geometry/points.py
@@ -4,6 +4,23 @@
from shapely.geometry import Point


def to2D(geometry):
"""Flatten a 3D point to 2D.
Parameters
----------
geometry : Point
Input 3D geometry
Returns
-------
Point
Output 2D geometry
"""

return Point(geometry.x, geometry.y)
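A quick usage sketch of the new helper (the coordinates are made up):

from shapely.geometry import Point

pt = Point(1501234.5, 1687432.1, 96.2)  # x, y, z in some planar CRS
flat = to2D(pt)
print(flat.has_z)  # False: the z value is dropped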


def create_points(df, x_column, y_column, crs):
"""Create a GeoDataFrame from pandas DataFrame
21 changes: 0 additions & 21 deletions nhdnet/io.py
@@ -188,29 +188,8 @@ def to_shp(df, path):
# Drop any records with missing geometries
df = df.loc[~df[geom_col].isnull()].copy()

# Convert data types to those supported by shapefile
for c in [c for c, t in df.dtypes.items() if t == "uint64"]:
df[c] = df[c].astype("float64")

df.to_file(path)

-### Original implementation, now slower than geopandas to_file()
-# geometry = df[geom_col].apply(mapping)
-# # fill missing data with None and convert to dict
-# props = df.drop(columns=[df._geometry_column_name])
-# props.replace({c: {np.nan: None} for c in prop_cols}, inplace=True)
-# props = props.apply(lambda row: row.to_dict(), axis=1)
-# # Convert features to JSON
-# features = DataFrame({"geometry": geometry, "properties": props})
-# features["type"] = "Feature"
-# features = features.apply(lambda row: row.to_dict(), axis=1)
-# schema = infer_schema(df)
-# with fiona.Env():
-# with fiona.open(
-# path, "w", driver="ESRI Shapefile", crs=df.crs, schema=schema
-# ) as writer:
-# writer.writerecords(features)
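For context on the uint64 cast retained above: the ESRI Shapefile format has no unsigned 64-bit integer field type, so columns such as NHDPlusID must be converted before writing. A minimal sketch of the same cast on a made-up frame (NHDPlusID values are well below 2**53, so the float64 cast is lossless):

import pandas as pd

df = pd.DataFrame({"NHDPlusID": pd.Series([55000800000001], dtype="uint64")})
for c in [c for c, t in df.dtypes.items() if t == "uint64"]:
    df[c] = df[c].astype("float64")
print(df.dtypes["NHDPlusID"])  # float64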


def serialize_sindex(df, path):
"""Serialize the bounding coordinates necessary to recreate a spatial index
66 changes: 57 additions & 9 deletions nhdnet/nhd/extract.py
@@ -23,20 +23,24 @@
import geopandas as gp
import pandas as pd
from shapely.geometry import MultiLineString
-from nhdnet.geometry.lines import to2D, calculate_sinuosity
+from nhdnet.geometry.lines import to2D as line2D, calculate_sinuosity
+from nhdnet.geometry.polygons import to2D as poly2D
from nhdnet.nhd.joins import index_joins, find_joins

FLOWLINE_COLS = ["NHDPlusID", "FlowDir", "FType", "GNIS_ID", "GNIS_Name", "geometry"]

# TODO: add elevation gradient info
VAA_COLS = ["NHDPlusID", "StreamOrde", "StreamCalc", "TotDASqKm"]

WATERBODY_COLS = ["NHDPlusID", "FType", "AreaSqKm", "geometry"]


def extract_flowlines(gdb_path, target_crs, extra_flowline_cols=[]):
"""
-Extracts data from NHDPlusHR data product.
-Extract flowlines, join to VAA table, and filter out any loops and coastlines.
-Extract joins between flowlines, and filter out any loops and coastlines.
+Extracts flowlines from the NHDPlusHR data product, joins them to the VAA
+table, and filters out coastlines.
+Extracts joins between flowlines, and filters out coastlines.
Parameters
----------
@@ -50,7 +54,7 @@ def extract_flowlines(gdb_path, target_crs, extra_flowline_cols=[]):
Returns
-------
-type of geopandas.GeoDataFrame
+tuple of (GeoDataFrame, DataFrame)
+(flowlines, joins)
"""

@@ -88,9 +92,6 @@ def extract_flowlines(gdb_path, target_crs, extra_flowline_cols=[]):
join_df.upstream = join_df.upstream.astype("uint64")
join_df.downstream = join_df.downstream.astype("uint64")

-# build join index
-joins = index_joins(join_df)

### Label loops for easier removal later, if we need to
# WARNING: loops may be very problematic from a network processing standpoint.
# Include with caution.
@@ -159,7 +160,7 @@ def extract_flowlines(gdb_path, target_crs, extra_flowline_cols=[]):

# Convert incoming data from XYZM to XY
print("Converting geometry to 2D")
-df.geometry = df.geometry.apply(to2D)
+df.geometry = df.geometry.apply(line2D)

print("projecting to target projection")
df = df.to_crs(target_crs)
@@ -177,3 +178,50 @@ def extract_flowlines(gdb_path, target_crs, extra_flowline_cols=[]):

return df, join_df
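A hedged usage sketch for the revised function; the geodatabase path is a placeholder, and passing an EPSG string for target_crs assumes a geopandas version that accepts one:

from nhdnet.nhd.extract import extract_flowlines

flowlines, joins = extract_flowlines(
    "data/NHDPLUS_H_0204_HU4_GDB.gdb",  # hypothetical HUC4 geodatabase path
    target_crs="EPSG:5070",             # CONUS Albers, a planar projection
)
print("{:,} flowlines, {:,} joins".format(len(flowlines), len(joins)))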


def extract_waterbodies(gdb_path, target_crs, exclude_ftypes=[], min_area=0):
"""Extract waterbodies from NHDPlusHR data product.
Parameters
----------
gdb_path : str
path to the NHD HUC4 Geodatabase
target_crs: GeoPandas CRS object
target CRS to project NHD to for analysis, like length calculations.
Must be a planar projection.
exclude_ftypes : list, optional (default: [])
list of FTypes to exclude.
min_area : float, optional (default: 0)
If provided, only waterbodies with AreaSqKm greater than or equal to this value are retained
Returns
-------
GeoDataFrame
"""
print("Reading waterbodies")
df = gp.read_file(gdb_path, layer="NHDWaterbody")[WATERBODY_COLS]
print("Read {:,} waterbodies".format(len(df)))

df = df.loc[
(df.AreaSqKm >= min_area) & (~df.FType.isin(exclude_ftypes))
].reset_index(drop=True)
print(
"Retained {:,} waterbodies after dropping those below size threshold or in exclude FTypes".format(
len(df)
)
)

print("Converting geometry to 2D")
df.geometry = df.geometry.apply(poly2D)

print("projecting to target projection")
df = df.to_crs(target_crs)

df.NHDPlusID = df.NHDPlusID.astype("uint64")
df.AreaSqKm = df.AreaSqKm.astype("float32")
df.FType = df.FType.astype("uint16")

### Add calculated fields
df["wbID"] = df.index.values.astype("uint32") + 1

return df
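A similar sketch for the new waterbody extract; the path is again a placeholder and the FType codes are illustrative only (check them against the NHD FType domain):

from nhdnet.nhd.extract import extract_waterbodies

waterbodies = extract_waterbodies(
    "data/NHDPLUS_H_0204_HU4_GDB.gdb",  # hypothetical HUC4 geodatabase path
    target_crs="EPSG:5070",
    exclude_ftypes=[361, 466],  # e.g. Playa and Swamp/Marsh, if unwanted
    min_area=0.01,              # drop waterbodies under 0.01 sq km
)
print("{:,} waterbodies retained".format(len(waterbodies)))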
118 changes: 94 additions & 24 deletions nhdnet/nhd/joins.py
@@ -1,4 +1,4 @@
def find_join(df, id, upstream="upstream", downstream="downstream"):
def find_join(df, id, downstream_col="downstream", upstream_col="upstream"):
"""Find the joins for a given segment id in a joins table.
Parameters
@@ -7,19 +7,19 @@ def find_join(df, id, upstream="upstream", downstream="downstream"):
data frame containing the joins
id : any
id to lookup in upstream or downstream columns
-upstream : str, optional (default "upstream")
-name of upstream column
-downstream : str, optional (default "downstream")
+downstream_col : str, optional (default "downstream")
name of downstream column
+upstream_col : str, optional (default "upstream")
+name of upstream column
Returns
-------
Joins that have the id as an upstream or downstream.
"""
-return df.loc[(df[upstream] == id) | (df[downstream] == id)]
+return df.loc[(df[upstream_col] == id) | (df[downstream_col] == id)]


def find_joins(df, ids, upstream="upstream", downstream="downstream"):
def find_joins(df, ids, downstream_col="downstream", upstream_col="upstream"):
"""Find the joins for a given segment id in a joins table.
Parameters
@@ -28,26 +28,19 @@ def find_joins(df, ids, upstream="upstream", downstream="downstream"):
data frame containing the joins
ids : list-like
ids to lookup in upstream or downstream columns
-upstream : str, optional (default "upstream")
-name of upstream column
-downstream : str, optional (default "downstream")
+downstream_col : str, optional (default "downstream")
name of downstream column
+upstream_col : str, optional (default "upstream")
+name of upstream column
Returns
-------
Joins that have the id as an upstream or downstream.
"""
-return df.loc[(df[upstream].isin(ids)) | (df[downstream].isin(ids))]
+return df.loc[(df[upstream_col].isin(ids)) | (df[downstream_col].isin(ids))]
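A toy joins table (ids invented) showing both lookups with the renamed keyword arguments:

import pandas as pd
from nhdnet.nhd.joins import find_join, find_joins

joins = pd.DataFrame({"upstream": [1, 2, 0], "downstream": [2, 3, 1]})

find_join(joins, 2)        # rows (1 -> 2) and (2 -> 3)
find_joins(joins, [1, 3])  # all three rows: 1 and 3 each appear in some join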

-# def find_singles(df, upstream="upstream", downstream="downstream"):
-# return df.loc[
-# ((df[upstream] == 0) & (df[downstream != 0]))
-# | ((df[downstream] == 0) & (df[upstream] != 0))
-# ]


def index_joins(df, upstream="upstream", downstream="downstream"):
def index_joins(df, downstream_col="downstream", upstream_col="upstream"):
"""Create an index of joins based on a given segment id.
Returns a dataframe indexed by id, listing the id of the next
segment upstream and the id of the next segment downstream.
@@ -61,22 +54,99 @@ def index_joins(df, upstream="upstream", downstream="downstream"):
----------
df : DataFrame
data frame containing the joins.
-upstream : str, optional (default "upstream")
-upstream column name
-downstream : str, optional (default "downstream")
+downstream_col : str, optional (default "downstream")
downstream column name
+upstream_col : str, optional (default "upstream")
+upstream column name
Returns
-------
DataFrame
"""

return (
-df.loc[df[downstream] != 0]
-.set_index(downstream)[[upstream]]
-.join(df.set_index(upstream)[[downstream]])
+df.loc[df[downstream_col] != 0]
+.set_index(downstream_col)[[upstream_col]]
+.join(df.set_index(upstream_col)[[downstream_col]])
.reset_index()
.drop_duplicates()
.set_index("index")
)
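Reusing the toy joins table from the sketch above, index_joins pivots it so each segment id lists its immediate upstream and downstream neighbors:

from nhdnet.nhd.joins import index_joins

index_joins(joins)
#        upstream  downstream
# index
# 2             1         3.0
# 3             2         NaN
# 1             0         2.0

The downstream column comes back as float because segment 3 has no downstream join, which yields NaN.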


def create_upstream_index(
df, downstream_col="downstream", upstream_col="upstream", exclude=None
):
"""Create an index of downstream ids to all their respective upstream ids.
This is so that network traversal can start from a downstream-most segment,
and then traverse upward for all segments that have that as a downstream segment.
Parameters
----------
df : DataFrame
Data frame containing the pairs of upstream_col and downstream_col that
represent the joins between segments.
downstream_col : str, optional (default "downstream")
Name of column containing downstream ids
upstream_col : str, optional (default "upstream")
Name of column containing upstream ids
exclude : list-like, optional (default None)
List-like containing segment ids to exclude from the list of upstreams.
For example, barriers that break the network should be in this list.
Otherwise, network traversal will operate from the downstream-most point
to all upstream-most points, which can be very large for some networks.
Returns
-------
dict
dictionary of downstream_id to the corresponding upstream_id(s)
"""

ix = (df[upstream_col] != 0) & (df[downstream_col] != 0)

if exclude is not None:
ix = ix & (~df[upstream_col].isin(exclude))

# NOTE: this looks backward but is correct for the way that grouping works.
return (
df.loc[ix, [downstream_col, upstream_col]]
.set_index(upstream_col)
.groupby(downstream_col)
.groups
)
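With the same toy table, the new create_upstream_index maps each downstream id to the upstream ids that flow into it (joins touching a 0 terminal are skipped):

from nhdnet.nhd.joins import create_upstream_index

create_upstream_index(joins)
# {2: [1], 3: [2]}  (values are pandas Index objects)

create_upstream_index(joins, exclude=[2])
# {2: [1]}  (segment 2 no longer appears as an upstream)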


def remove_joins(df, ids, downstream_col="downstream", upstream_col="upstream"):
"""Remove any joins to or from ids.
This sets any joins that terminate downstream to one of these ids to 0 in order to mark them
as new downstream terminals. A join that includes other downstream ids not in ids will be left as is.
Parameters
----------
df : DataFrame
Data frame containing the pairs of upstream_col and downstream_col that
represent the joins between segments.
ids : list-like
List of ids to remove from the joins
downstream_col : str, optional (default "downstream")
Name of column containing downstream ids
upstream_col : str, optional (default "upstream")
Name of column containing upstream ids
Returns
-------
DataFrame
joins with any joins to or from the given ids removed
"""
# TODO: fix new dangling terminals? Set to 0 first?
# join_df = join_df.loc[~join_df.upstream.isin(coastline_idx)].copy()

# set the downstream to 0 for any that join coastlines
# this will enable us to mark these as downstream terminals in
# the network analysis later
# join_df.loc[join_df.downstream.isin(coastline_idx), "downstream"] = 0

# drop any duplicates (above operation sets some joins to upstream and downstream of 0)
# join_df = join_df.drop_duplicates()

return df.loc[~(df[upstream_col].isin(ids) | (df[downstream_col].isin(ids)))].copy()
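Finally, remove_joins drops every join touching the given ids, for example treating segment 3 as removed (toy data again):

from nhdnet.nhd.joins import remove_joins

remove_joins(joins, [3])
#    upstream  downstream
# 0         1           2
# 2         0           1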
