In [318]:
import pickle
import pandas as pd
# Allow up to 1000 characters per line when pandas prints a frame.
pd.set_option("display.width", 1000)

# Read the raw GDP pickle into `df`.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only run this against a pickle that comes from a trusted source.
with open("../data/raw/gdp.pkl", "rb") as handle:
    df = pickle.load(handle)

In [319]:
# Rename for uniformity: the industry classification column becomes "naics"
# and the 2022 column becomes "gdp". Reassign instead of mutating in place so
# the cell has no hidden side effect on the frame it was handed.
df = df.rename(columns={"IndustryClassification": "naics", "2022": "gdp"})

# Columns to discard: the ones considered redundant plus the 2017-2021 year
# columns (only the 2022 values are kept, as "gdp").
redundant_cols = ["Region", "TableName", "LineCode", "Unit"]
years = ["2017", "2018", "2019", "2020", "2021"]
cols_to_drop = redundant_cols + years

# Regex alternation of FIPS codes to exclude ("globals" and unknown counties).
# It is matched against the string form of the FIPS code:
#   ^0$   -> the code 0
#   000$  -> codes ending in three zeros
#   999$  -> codes ending in 999
# The zero patterns are anchored on purpose: a bare "0$" would also discard
# every ordinary code that merely ends in a single 0 (e.g. 24510, 32510).
# NOTE(review): assumes the aggregate rows are exactly the x000 codes --
# confirm against the raw table.
fips_filter = "|".join(["^0$", "000$", "999$"])

# Regex alternation of the NAICS code prefixes that are interesting for our
# purpose.
naics_filter = "|".join(["^11", "^21", "^22", "^23", "^31", "^32", "^33"])

In [320]:
# Remove every column listed in cols_to_drop.
df = df.drop(columns=cols_to_drop)

# Keep rows whose FIPS is below 90000 and whose code does not match
# fips_filter. FIPS is compared numerically, so it is cast to str before the
# regex is applied.
fips = df["FIPS"]
keep_fips = (fips < 90000) & ~fips.astype(str).str.contains(fips_filter, regex=True)
df = df.loc[keep_fips]

# Keep only rows whose NAICS code matches one of the prefixes in naics_filter.
# na=False makes a missing code count as "no match"; without it str.contains
# returns NaN for such rows and .loc rejects the non-boolean mask.
keep_naics = df["naics"].str.contains(naics_filter, regex=True, na=False)
df = df.loc[keep_naics]

In [321]:
# Sum "gdp" per NAICS code and per FIPS code. Each result is turned back into
# a two-column frame (group key + total), with the total column named
# "gdp_naics" / "gdp_fips" respectively.
df_group_naics = df.groupby("naics")["gdp"].sum().reset_index(name="gdp_naics")
df_group_fips = df.groupby("FIPS")["gdp"].sum().reset_index(name="gdp_fips")

In [325]:
# Attach both totals to every row: first the per-NAICS sum (joined on
# "naics"), then the per-FIPS sum (joined on "FIPS"). Both are inner joins.
df_conc = (
    df
    .merge(df_group_naics, on="naics", how="inner")
    .merge(df_group_fips, on="FIPS", how="inner")
)

In [326]:
# Persist the merged frame as a pickle, using the highest protocol this
# Python version supports.
with open("../data/processed/gdp.pkl", "wb") as out_file:
    pickle.dump(df_conc, out_file, pickle.HIGHEST_PROTOCOL)