In the terminal, run `cd ..` to move into the `_shared_utils` sub-folder, run `python setup.py install`, and then `cd` back into your task sub-folder (e.g., `bus_service_increase` or `example_report`).

In [1]:
import geopandas as gpd
import pandas as pd

import shared_utils

## Aggregation

In [2]:
# Sample taxi-trip table from the seaborn-data repository, fetched as a CSV.
TABULAR_DATA = (
    "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/taxis.csv"
)
df = pd.read_csv(filepath_or_buffer=TABULAR_DATA)

# Preview the first few rows.
df.head()

Unnamed: 0,pickup,dropoff,passengers,distance,fare,tip,tolls,total,color,payment,pickup_zone,dropoff_zone,pickup_borough,dropoff_borough
0,2019-03-23 20:21:09,2019-03-23 20:27:24,1,1.6,7.0,2.15,0.0,12.95,yellow,credit card,Lenox Hill West,UN/Turtle Bay South,Manhattan,Manhattan
1,2019-03-04 16:11:55,2019-03-04 16:19:00,1,0.79,5.0,0.0,0.0,9.3,yellow,cash,Upper West Side South,Upper West Side South,Manhattan,Manhattan
2,2019-03-27 17:53:01,2019-03-27 18:00:25,1,1.37,7.5,2.36,0.0,14.16,yellow,credit card,Alphabet City,West Village,Manhattan,Manhattan
3,2019-03-10 01:23:59,2019-03-10 01:49:51,1,7.7,27.0,6.15,0.0,36.95,yellow,credit card,Hudson Sq,Yorkville West,Manhattan,Manhattan
4,2019-03-30 13:27:42,2019-03-30 13:37:14,3,2.16,9.0,1.1,0.0,13.4,yellow,credit card,Midtown East,Yorkville West,Manhattan,Manhattan


In [3]:
# Columns to group on.
group_cols = ["pickup_borough", "payment"]

# One list per aggregation keyword accepted by `aggregate_by_geography`;
# each list names the columns handed to that keyword.
sum_cols = ["tip", "tolls"]
mean_cols = ["fare"]
count_cols = ["pickup"]
nunique_cols = ["pickup_zone"]

# Only rows with a known pickup borough are passed to the aggregation.
by_borough = shared_utils.geography_utils.aggregate_by_geography(
    df.loc[df["pickup_borough"].notna()],
    group_cols=group_cols,
    sum_cols=sum_cols,
    mean_cols=mean_cols,
    count_cols=count_cols,
    nunique_cols=nunique_cols,
)

by_borough

Unnamed: 0,pickup_borough,payment,tip,tolls,fare,pickup,pickup_zone
0,Manhattan,credit card,10217.55,643.42,11.480182,3839.0,60.0
1,Manhattan,cash,0.0,103.68,10.273085,1397.0,63.0
2,Manhattan,,,,,,
3,Queens,cash,0.0,235.97,19.069549,266.0,24.0
4,Queens,credit card,1997.32,881.63,29.237755,383.0,46.0
5,Bronx,credit card,14.71,77.04,24.904189,74.0,33.0
6,Brooklyn,credit card,370.11,109.44,18.875402,261.0,48.0
7,Brooklyn,cash,0.0,0.0,11.10084,119.0,28.0
8,Brooklyn,,,,,,
9,Queens,,,,,,


## Merging geometry back on after aggregation

In [4]:
# Borough boundaries exported as GeoJSON from the NYC open-data portal.
GEOSPATIAL_DATA = "https://data.cityofnewyork.us/api/geospatial/tqmj-j8zm?method=export&format=GeoJSON"

# Read the polygons and rename the borough-name column so it matches the
# `pickup_borough` key used in the aggregated table.
gdf = (
    gpd.read_file(GEOSPATIAL_DATA)
    .rename(columns={"boro_name": "pickup_borough"})
)
gdf

Unnamed: 0,boro_code,pickup_borough,shape_area,shape_leng,geometry
0,2,Bronx,1187193513.84,463868.937681,"MULTIPOLYGON (((-73.89681 40.79581, -73.89694 ..."
1,1,Manhattan,636575908.328,359804.890798,"MULTIPOLYGON (((-74.01093 40.68449, -74.01193 ..."
2,5,Staten Island,1623635734.34,325929.794027,"MULTIPOLYGON (((-74.05051 40.56642, -74.05047 ..."
3,3,Brooklyn,1934174208.45,728195.197649,"MULTIPOLYGON (((-73.86327 40.58388, -73.86381 ..."
4,4,Queens,3040205398.17,900270.512975,"MULTIPOLYGON (((-73.82645 40.59053, -73.82642 ..."


In [5]:
# Join the borough geometry onto the aggregated table using the shared
# `pickup_borough` column.
by_borough2 = shared_utils.geography_utils.attach_tract_geometry(
    by_borough,
    gdf,
    merge_col="pickup_borough",
    join="inner",
)

# Report row counts before and after the merge so any change is visible.
print(
    f"# obs in original df: {len(by_borough)}",
    f"# obs in merged df: {len(by_borough2)}",
    sep="\n",
)

# obs in original df: 11
# obs in merged df: 11
