In [1]:
import pickle
import pandas as pd
import numpy as np
# Widen pandas' console display so wide frames are not wrapped early.
pd.options.display.width = 1000

In [2]:
# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# files produced by this project should ever be loaded here.

# Object stored in the raw naics_occupation pickle -> df
with open("../data/raw/naics_occupation.pkl", "rb") as occupation_file:
  df = pickle.load(occupation_file)

# Object stored in the processed gdp pickle -> gdp
with open("../data/processed/gdp.pkl", "rb") as gdp_file:
  gdp = pickle.load(gdp_file)

In [3]:
# Columns removed from the occupation table before any row filtering.
cols_to_drop = ["State_GEOID"]

# FIPS filter for "globals" and unknown counties.
# Regex alternation searched against the stringified FIPS code; a match means
# the row is dropped:
#   "^0"   - codes whose text starts with 0
#   "000$" - codes ending in 000 (presumably the state-level totals, SS000)
#   "999$" - codes ending in 999 (presumably the "unknown county" bucket, SS999)
# The trailing-zero rule must be "000$" and not "0$": "0$" matches every code
# that merely ends in a zero, which also throws away real counties such as
# 24510 or 29510.
fips_filter = "|".join(["^0", "000$", "999$"])

# NAICS filter for interesting codes for our purpose: keep only codes that
# start with one of these two-digit prefixes.
naics_filter = "|".join(["^11", "^21", "^22", "^23", "^31", "^32", "^33"])

In [4]:
# Clean the occupation table in one pass:
#   1. remove the columns listed in cols_to_drop
#   2. keep only rows whose FIPS is strictly below 90000
#   3. discard rows whose stringified FIPS matches the fips_filter regex
#   4. keep only rows whose naics code matches the naics_filter regex
df = (
  df.drop(columns=cols_to_drop)
  .loc[lambda frame: frame["FIPS"] < 90000]
  .loc[lambda frame: ~frame["FIPS"].astype(str).str.contains(fips_filter)]
  .loc[lambda frame: frame["naics"].str.contains(naics_filter)]
)

In [5]:
# Create new compressed naics col
def map_naics_code(naics_code):
  """Collapse a NAICS code string into the coarser bucket stored in naics_2.

  Codes in the manufacturing sectors (first two characters "31", "32" or
  "33") are grouped by their three-digit prefix:

  * 321 and 327-339          -> "321,327-339"
  * 311-316 and 322-326      -> "311-316,322-326"
  * anything else in 31-33   -> "31-33"

  Every other code is reduced to the first two digits of its numeric prefix.

  Args:
    naics_code: NAICS code as a string, e.g. "236220", "311" or "31-33".

  Returns:
    The bucket label as a string.

  Raises:
    ValueError: if a non-manufacturing code does not start with digits.
  """
  manufacturing_sectors = ("31", "32", "33")
  manufacturing_bucket = "31-33"

  if naics_code[:2] not in manufacturing_sectors:
    # Non-manufacturing: the two-digit sector is the bucket.
    return str(int(naics_code[:3]))[:2]

  subsector = naics_code[:3]
  if len(subsector) < 3 or not subsector.isdecimal():
    # Sector-level codes such as "31" or "31-33" carry no three-digit
    # subsector. Comparing them numerically against the 3xx thresholds would
    # misclassify "31" as a non-manufacturing code and crash on "31-".
    return manufacturing_bucket

  code_prefix = int(subsector)
  if code_prefix == 321 or 327 <= code_prefix <= 339:
    return "321,327-339"
  if 311 <= code_prefix <= 316 or 322 <= code_prefix <= 326:
    return "311-316,322-326"
  return manufacturing_bucket
  
# Derive the collapsed industry bucket for every row from its naics code.
df["naics_2"] = df["naics"].map(map_naics_code)

In [6]:
# Left-join gdp onto the occupation rows: every df row is kept, and gdp rows
# are matched on FIPS plus the collapsed bucket (df.naics_2 == gdp.naics).
df_conc = df.merge(
  gdp,
  how="left",
  left_on=["FIPS", "naics_2"],
  right_on=["FIPS", "naics"],
)

# Both inputs have a "naics" column, so the merge suffixes them with _x / _y;
# the right-hand copy is discarded here.
df_conc = df_conc.drop(columns=["naics_y"])

In [7]:
# Restore the left-hand naics column to its plain name.
df_conc = df_conc.rename(columns={"naics_x": "naics"})

# Keep only the output columns, in this order.
df_conc = df_conc.loc[
  :,
  [
    "FIPS",
    "GeoName",
    "naics",
    "NAICS_TITLE",
    "naics_2",
    "Description",
    "emp_total_county_naics",
    "OCC_CODE",
    "OCC_TITLE",
    "emp_occupation",
    "gdp",
    "gdp_naics",
    "gdp_fips",
  ],
]

In [9]:
# Persist the merged table as a pickle, using the newest protocol available.
with open("../data/processed/occupation_gdp.pkl", "wb") as output_file:
  pickle.dump(df_conc, output_file, pickle.HIGHEST_PROTOCOL)