In [None]:
import pickle
import pandas as pd

# Show every column when a DataFrame is displayed (None disables the column limit).
pd.options.display.max_columns = None

In [None]:
# Input locations, relative to the notebook's working directory.
naics_pattern_path = "../data/raw/naics_pattern.pkl"
occupation_gdp_path = "../data/processed/occupation_gdp.pkl"

# NOTE: unpickling can execute arbitrary code stored in the file, so these
# pickles must come from a trusted source (e.g. this project's own pipeline).

# Raw NAICS pattern table -> df
with open(naics_pattern_path, "rb") as file:
  df = pickle.load(file)

# Processed occupation/GDP table -> occ_gdp
with open(occupation_gdp_path, "rb") as file:
  occ_gdp = pickle.load(file)

In [None]:
# Columns removed from the raw NAICS pattern frame before any filtering.
cols_to_drop = ["State_GEOID", "County_GEOID", "DESCRIPTION", "qp1_nf", "qp1"]

# FIPS filter for "globals" and unknown counties.
# The pattern is searched in str(FIPS); a row is dropped when any alternative matches:
#   ^0    code starts with 0 (with a numeric FIPS column this is presumably only a
#         national total coded 0 -- confirm against the data)
#   000$  county part 000, i.e. a state-wide total
#   999$  county part 999, i.e. unknown / not-allocated county
# The state-total alternative must be "000$", not "0$": a bare "0$" also matches
# real counties whose code ends in 0 (e.g. 24510 Baltimore city, 29510 St. Louis
# city, 32510 Carson City, the Virginia independent cities) and silently drops them.
fips_filter = "|".join(["^0", "000$", "999$"])

# NAICS filter for the codes interesting for our purpose: any code starting with
# 11, 21, 22, 23 or 31-33 (agriculture, mining, utilities, construction and
# manufacturing in the NAICS sector scheme).
naics_filter = "|".join(["^11", "^21", "^22", "^23", "^31", "^32", "^33"])

In [None]:
# Drop unnecessary cols.
# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone and a strict drop would raise KeyError. The trade-off is that a
# misspelled name in cols_to_drop is skipped silently.
df = df.drop(columns=cols_to_drop, errors="ignore")

# Drop FIPS: first everything coded 90000 or above, then the aggregate/unknown
# codes matched by fips_filter on the string form of the code.
df = df.loc[df["FIPS"] < 90000]
df = df.loc[~df["FIPS"].astype(str).str.contains(fips_filter)]

# Drop NAICS: keep only codes matching naics_filter.
# na=False treats a missing naics value as "no match"; without it str.contains
# returns NaN for such rows and the boolean mask is rejected with a ValueError.
df = df.loc[df["naics"].str.contains(naics_filter, na=False)]

In [None]:
# Create new compressed naics col
def map_naics_code(naics_code):
  """Map a NAICS code string to a coarser industry label.

  Only the first three characters of ``naics_code`` are inspected.

  Args:
    naics_code: NAICS code as a string, e.g. ``"113110"`` or ``"321"``.
      Codes whose first three characters contain trailing ``-`` or ``/``
      placeholder padding (e.g. ``"11----"``) are accepted as well.

  Returns:
    * the first two digits of the prefix when the prefix is below 310
      (e.g. ``"113110"`` -> ``"11"``),
    * ``"321,327-339"`` for prefix 321 or 327..339,
    * ``"311-316,322-326"`` for prefix 311..316 or 322..326,
    * ``"31-33"`` for any other prefix of 310 or above.
    The grouped labels presumably mirror the industry codes used in the GDP
    table -- confirm against ``occ_gdp``.

  Raises:
    ValueError: if the prefix is not numeric even after removing padding.
  """
  prefix_text = naics_code[:3]
  try:
    code_prefix = int(prefix_text)
  except ValueError:
    # Sector-level rows can be padded with placeholders ("11----", "31----"),
    # so the 3-character prefix is "11-" and int() rejects it. Strip the
    # padding and retry; anything still non-numeric raises ValueError as before.
    code_prefix = int(prefix_text.rstrip("-/"))

  if code_prefix < 310:
    return str(code_prefix)[:2]
  if code_prefix == 321 or 327 <= code_prefix <= 339:
    return "321,327-339"
  if 311 <= code_prefix <= 316 or 322 <= code_prefix <= 326:
    return "311-316,322-326"
  return "31-33"
  
# Compressed industry label for every row, derived from the full NAICS code.
# NOTE(review): the following cell drops naics_2 again before the merge, so
# this column is currently not used downstream -- confirm that is intended.
naics_compressed = df["naics"].apply(map_naics_code)
df["naics_2"] = naics_compressed

In [None]:
# Remove the compressed NAICS label before merging.
# errors="ignore" keeps this cell re-runnable: once naics_2 has been dropped a
# strict drop would raise KeyError on the second run.
# NOTE(review): naics_2 is computed in the previous cell and discarded here
# without being used -- confirm the merge was not meant to key on it.
df = df.drop(columns=["naics_2"], errors="ignore")

# Left join: every occ_gdp row is kept; rows without a (FIPS, naics) match in df
# get NaN in the columns coming from df.
# NOTE(review): if df holds more than one row per (FIPS, naics) the matching
# occ_gdp rows are repeated -- verify key uniqueness.
df_conc = pd.merge(occ_gdp, df, on=["FIPS", "naics"], how="left")

In [None]:
# Remove the name/title/description columns and the *_nf columns (presumably
# noise flags -- confirm), then order the rows by county code and NAICS code.
unused_cols = ["GeoName", "NAICS_TITLE", "Description", "OCC_TITLE", "emp_nf", "ap_nf"]
df_conc = (
  df_conc
  .drop(columns=unused_cols)
  .sort_values(["FIPS", "naics"])
)