# Data Aggregation & Summaries

Use this Jupyter Notebook to take our initial dataset(s) and generate aggregated/reformatted/summarized CSV and JSON files usable by our frontend.

In [1]:
# imports and initial variables
import pandas as pd
import numpy as np

# Input file for this notebook. The path is relative, so it is resolved against
# the working directory the notebook runs in.
dataset_name = "census_tracts_processed.csv"
# Loaded once with pandas' default dtype inference; the aggregation cell below
# works on a copy (`dataset_df.copy()`), leaving this frame unmodified there.
dataset_df = pd.read_csv(dataset_name)

In [2]:
# create aggregate info for each county
# GEOID consists of 11 digits: the first 2 are the state, the next 3 the county
# (the rest identify the tract). The 5-digit state+county prefix, read as an
# integer, matches the county ID in the map.
df = dataset_df.copy()
# read_csv infers an all-digit geoid column as integers, which drops the leading
# zero of state codes 01-09 and leaves a 10-character string. Left-pad back to
# 11 characters before slicing so the prefix is always state+county.
# NOTE(review): assumes geoid holds plain digits (no prefix, no ".0" float
# suffix) -- confirm against the input file.
df["id"] = df["geoid"].astype(str).str.zfill(11).str[:5].astype(int)

# calculate aggregates
def mean_pos(x):
    """Return the mean of the strictly positive entries of ``x``, rounded to 2 decimals.

    Entries that are zero or negative are excluded from the average
    (presumably they are missing-data placeholders -- confirm against the
    dataset). ``x`` must support boolean-mask indexing, e.g. a pandas Series
    or a NumPy array.
    """
    strictly_positive = x > 0
    average = np.mean(x[strictly_positive])
    return np.round(average, 2)

def listify(x):
    """Return the distinct values of ``x`` as a list, in first-seen order.

    Duplicates are dropped by hashing, so every element of ``x`` must be
    hashable. A dict is used instead of a set because dict keys keep
    insertion order: the result is the same on every run, which matters to
    callers that take element 0 of the returned list.
    """
    return list(dict.fromkeys(x))
    
def _first_distinct(values):
    """Return element 0 of ``listify(values)``."""
    return listify(values)[0]


# Reducers for the columns that need something other than a plain sum.
agg = {
    "median_income": mean_pos,
    "median_home_value": mean_pos,
    "name": listify,
    "city": _first_distinct,
    "metro_area": _first_distinct,
}
agg_non_default = agg.keys()
# Every remaining column is summed, except the grouping key and the raw geoid.
not_summed = {*agg_non_default, "id", "geoid"}
summed_columns = [label for label in df.columns if label not in not_summed]
agg.update(dict.fromkeys(summed_columns, "sum"))
# One output row per county id; reset_index turns "id" back into a column.
agg_df = df.groupby("id").agg(agg).reset_index()

# for tracts in agg_df["name"]:
#     county = None
#     state = None
#     for tract in tracts:
#         s = tract.split(", ")
#         if county is None:
#             county = s[1]
#             state = s[2]
#         else:
#             assert county == s[1] and state == s[2], "diff county detected! " + county + state

# extract data from name (county/state is clean, ie only one exists per name)
# Number of tracts in each county: the size of that county's group in the
# tract-level frame, looked up by county id. (The length of agg_df itself is
# the number of counties, which would be the same value on every row.)
# NOTE(review): assumes df holds one row per tract -- confirm.
agg_df["tract_count"] = agg_df["id"].map(df.groupby("id").size())
# "name" holds the list of tract names for the county; element 0 is enough
# because only its county and state parts are kept.
agg_df["name"] = agg_df["name"].str[0]
# names are split on ", " into exactly three parts: tract, county, state
agg_df[["name", "county", "state"]] = agg_df["name"].str.split(", ", expand=True)
# the tract part is meaningless on a county row
agg_df = agg_df.drop(columns=["name"])

# recompute the under-25 share from the county-level totals
agg_df["proportion_25_under"] = agg_df["total_population_25_under"].div(
    agg_df["total_population"]
)

# reorder columns: the moves run in this exact order because each insert
# shifts the positions of the columns behind it
for position, label in ((5, "tract_count"), (5, "state"), (3, "county")):
    agg_df.insert(position, label, agg_df.pop(label))

# count columns are cast to plain ints
int_cols = [
    "total_population",
    "total_population_25_over",
    "educational_attainment",
    "white_alone",
    "black_alone",
    "native_alone",
    "asian_alone",
    "native_hawaiian_pacific_islander",
    "some_other_race_alone",
    "two_or_more",
    "hispanic_or_latino",
    "total_population_25_under",
]
agg_df[int_cols] = agg_df[int_cols].astype(int)

# export the county table: CSV without the index column, JSON as a list of
# one object per county
agg_df.to_csv("county_aggregated.csv", index=False)
agg_df.to_json("county_aggregated.json", orient="records")