Skip to content

Commit

Permalink
geo_utils rm aggregate func
Browse files Browse the repository at this point in the history
  • Loading branch information
amandaha8 committed Sep 25, 2023
1 parent 9dfb7c1 commit 4e69be0
Showing 1 changed file with 1 addition and 72 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Utility functions for geospatial data.
Some functions for dealing with census tract or other geographic unit dfs.
"""
from typing import Literal, Sequence, Union, cast
from typing import Literal, Union, cast

import dask.dataframe as dd
import geopandas as gpd # type: ignore
Expand All @@ -18,77 +18,6 @@
# Square feet per square mile, rounded to four significant figures
# (the exact value is 5280**2 = 27,878,400).
SQ_FT_PER_SQ_MI = 2.788 * 10**7


def aggregate_by_geography(
    df: Union[pd.DataFrame, gpd.GeoDataFrame],
    group_cols: list,
    sum_cols: Sequence,
    mean_cols: Sequence,
    count_cols: Sequence,
    nunique_cols: Sequence,
    rename_cols: bool = False,
) -> pd.DataFrame:
    """
    Aggregate columns of df by group_cols, one aggregation per column list,
    and return the results merged into a single dataframe.

    df: pandas.DataFrame or geopandas.GeoDataFrame.
        The df on which the aggregating is done.
        If it's a geodataframe, it must exclude the tract's geometry column.
    group_cols: list.
        List of columns to do the groupby, but exclude geometry.
    sum_cols: list.
        List of columns to calculate a sum with the groupby.
    mean_cols: list.
        List of columns to calculate an average with the groupby
        (beware: may want weighted averages and not simple average!!).
    count_cols: list.
        List of columns to calculate a count with the groupby.
    nunique_cols: list.
        List of columns to calculate the number of unique values with the groupby.
    rename_cols: boolean.
        Defaults to False. If True, will rename columns in sum_cols to have suffix `_sum`,
        rename columns in mean_cols to have suffix `_mean`, etc.
        Leave it False only when no column appears in more than one of the
        aggregation lists; otherwise the merged output columns collide.

    Returns a pandas.DataFrame with one row per unique combination of
    group_cols, followed by the aggregated columns (sum, mean, count, nunique,
    in that order). Empty aggregation lists are skipped.
    """
    # drop=True throws the old index away instead of turning it into a column.
    # The earlier approach (reset_index() then drop(columns="index")) raised a
    # KeyError for a named index or MultiIndex, and a ValueError when one of
    # the grouping columns was itself called "index".
    final_df = df[group_cols].drop_duplicates().reset_index(drop=True)

    # Order matters: it determines the column order of the merged output.
    aggregations = (
        (sum_cols, "sum"),
        (mean_cols, "mean"),
        (count_cols, "count"),
        (nunique_cols, "nunique"),
    )

    for agg_cols, aggregate_function in aggregations:
        # Use len() rather than truthiness so array-like inputs
        # (e.g. a pandas Index) do not raise on the emptiness check.
        if len(agg_cols) == 0:
            continue

        agg_df = df.pivot_table(
            index=group_cols, values=agg_cols, aggfunc=aggregate_function
        ).reset_index()

        # .add_suffix would also rename the grouping columns and break the
        # merge below, so rename only the aggregated columns.
        if rename_cols:
            agg_df = agg_df.rename(
                columns={c: f"{c}_{aggregate_function}" for c in agg_cols}
            )

        # validate="1:1" guards against accidental row duplication: both sides
        # must hold at most one row per combination of group_cols.
        final_df = pd.merge(
            final_df, agg_df, on=group_cols, how="left", validate="1:1"
        )

    return final_df


# Laurie's example: https://github.com/cal-itp/data-analyses/blob/752eb5639771cb2cd5f072f70a06effd232f5f22/gtfs_shapes_geo_examples/example_shapes_geo_handling.ipynb
# have to convert to linestring
def make_linestring(x: str) -> shapely.geometry.LineString:
Expand Down

0 comments on commit 4e69be0

Please sign in to comment.