Improved join utilities, split NHD waterbody multipolygons and retain only the first
brendan-ward · Dec 6, 2019 · 91ba92c · 91ba92c
1 parent 660dc16
commit 91ba92c
Show file tree

Hide file tree

Showing 2 changed files with 35 additions and 11 deletions.
diff --git a/nhdnet/nhd/extract.py b/nhdnet/nhd/extract.py
@@ -211,6 +211,11 @@ def extract_waterbodies(gdb_path, target_crs, exclude_ftypes=[], min_area=0):
         )
     )
 
+    # Convert multipolygons to polygons by keeping only the first part;
+    # the ones we checked that are true multipolygons are errors
+    idx = df.loc[df.geometry.type == "MultiPolygon"].index
+    df.loc[idx, "geometry"] = df.loc[idx].geometry.apply(lambda g: g[0])
+
     print("Converting geometry to 2D")
     df.geometry = df.geometry.apply(poly2D)
 

diff --git a/nhdnet/nhd/joins.py b/nhdnet/nhd/joins.py
@@ -138,15 +138,34 @@ def remove_joins(df, ids, downstream_col="downstream", upstream_col="upstream"):
     [type]
         [description]
     """
-    # TODO: fix new dangling terminals?  Set to 0 first?
-    # join_df = join_df.loc[~join_df.upstream.isin(coastline_idx)].copy()
 
-    # set the downstream to 0 for any that join coastlines
-    # this will enable us to mark these as downstream terminals in
-    # the network analysis later
-    # join_df.loc[join_df.downstream.isin(coastline_idx), "downstream"] = 0
-
-    # drop any duplicates (above operation sets some joins to upstream and downstream of 0)
-    # join_df = join_df.drop_duplicates()
-
-    return df.loc[~(df[upstream_col].isin(ids) | (df[downstream_col].isin(ids)))].copy()
+    # Update any joins that would have connected to these ids
+    # on their downstream end
+    upstreams = df.loc[
+        (df[downstream_col].isin(ids)) & (df[upstream_col] != 0), upstream_col
+    ]
+    has_other_joins = df.loc[
+        df[upstream_col].isin(upstreams) & ~df[downstream_col].isin(ids), upstream_col
+    ]
+
+    # new terminals are ones that end ONLY in these ids
+    new_terminals = upstreams.loc[~upstreams.isin(has_other_joins)]
+    ix = df.loc[df[upstream_col].isin(new_terminals)].index
+    df.loc[ix, downstream_col] = 0
+
+    # Update any joins that would have connected to these ids
+    # on their upstream end
+    downstreams = df.loc[
+        df[upstream_col].isin(ids) & (df[downstream_col] != 0), downstream_col
+    ]
+    has_other_joins = df.loc[
+        df[downstream_col].isin(downstreams) & ~df[upstream_col].isin(ids),
+        downstream_col,
+    ]
+    new_terminals = downstreams.loc[~downstreams.isin(has_other_joins)]
+    ix = df.loc[df[downstream_col].isin(new_terminals)].index
+    df.loc[ix, upstream_col] = 0
+
+    return df.loc[
+        ~(df[upstream_col].isin(ids) | (df[downstream_col].isin(ids)))
+    ].drop_duplicates()