In [9]:
import pickle
import pandas as pd
pd.options.display.width = 1000  # allow up to 1000 characters per line when frames are displayed

# Load the pickled object from the raw data directory into `df`.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
with open("../data/raw/naics_occupation.pkl", mode="rb") as handle:
  df = pickle.load(handle)

In [10]:
# Discard the columns listed in cols_to_drop; every other column is kept.
cols_to_drop = ["State_GEOID", "NAICS_TITLE", "OCC_TITLE"]
df = df.drop(columns=cols_to_drop)

In [11]:
# Create a new compressed NAICS column: the first two characters of `naics`, with 31/32/33 merged into "31-33"
def map_naics_code(naics_code):
  """Reduce a NAICS code string to its two-character prefix label.

  Args:
    naics_code: NAICS code as a string (it is sliced, so it must support
      string slicing).

  Returns:
    "31-33" when the first two characters are "31", "32" or "33";
    otherwise the first two characters unchanged (shorter inputs are
    returned as-is).
  """
  sector = naics_code[:2]
  # The three prefixes 31, 32 and 33 share the single combined label "31-33".
  return "31-33" if sector in ("31", "32", "33") else sector

# Derive the naics_2 label for every row from its full naics code.
df["naics_2"] = df["naics"].map(map_naics_code)

In [12]:
# Select the columns in a fixed order; any column not listed here is dropped.
col_order = [
  "FIPS",
  "naics_2",
  "naics",
  "emp_total_county_naics",
  "OCC_CODE",
  "emp_occupation",
]
df = df.loc[:, col_order]

In [13]:
# Keep only the rows whose naics_2 label is one of the codes listed below.
naics_codes = ["11", "21", "22", "23", "31-33"]
df = df[df["naics_2"].isin(naics_codes)]

In [14]:
# Remove the global (0) and unknown-county (ending in 999) FIPS rows.
# Series.str.contains performs a regex *search*, so an unanchored "0" pattern
# matches any FIPS with a zero digit anywhere (e.g. "1001") and would drop it.
# "^0+$" matches only a code made up entirely of zeros ("0", or a zero-padded
# form such as "00000"); "999$" matches codes that end in 999.
fips_filter = "|".join(["^0+$", "999$"])
df = df[~df["FIPS"].astype(str).str.contains(fips_filter, regex=True)]

In [15]:
# Order rows from the largest emp_total_county_naics value to the smallest.
df = df.sort_values("emp_total_county_naics", ascending=False)

In [16]:
# Save DataFrame to file
# Replace the index left over from filtering/sorting with a fresh 0..n-1 range.
# reset_index(drop=True) discards the old index directly instead of turning it
# into an "index" column and dropping that column again; unlike that round-trip
# it does not raise KeyError when the old index has a name.
df = df.reset_index(drop=True)

with open("../data/processed/naics_occupation.pkl", "wb") as handle:
  pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)