# Pre-Processing
## Imports

In [54]:
import pickle
import pandas as pd
import numpy as np

# Widen pandas' text display so wide DataFrames are not wrapped when printed
pd.set_option("display.width", 1000)

In [55]:
def _read_pickle(path):
  """Open ``path`` in binary mode and return the unpickled object."""
  with open(path, "rb") as source:
    return pickle.load(source)

# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
gdp = _read_pickle("../data/raw/gdp.pkl")
occ = _read_pickle("../data/raw/naics_occupation.pkl")
patterns = _read_pickle("../data/raw/naics_pattern.pkl")

## Define Columns to Drop and Row Filters

In [56]:
# Columns removed from each raw table before further processing
gdp_cols_to_drop = ["Region", "TableName", "LineCode", "Unit", "2017", "2018", "2019", "2020", "2021"]
occ_cols_to_drop = ["State_GEOID", "NAICS_TITLE"]
pattern_cols_to_drop = ["State_GEOID", "County_GEOID", "DESCRIPTION", "qp1_nf", "qp1"]

# Regex matched against the FIPS code rendered as a string; matching rows are dropped.
#   ^0    codes whose string form starts with "0"
#   000$  codes ending in "000" (presumably state-level totals -- confirm).
#         This used to be "0$", which also matched every county-level code
#         that merely ends in 0 (e.g. 51510, 24510, 29510, 2020) and silently
#         removed those rows.
#   999$  codes ending in "999"
fips_filter = "|".join(["^0", "000$", "999$"])

# Regex of NAICS prefixes to keep; rows whose code matches none of these are dropped.
naics_filter = "|".join(["^11", "^21", "^22", "^23", "^31", "^32", "^33"])

## Strip Data

In [57]:
def _filter_fips(frame, pattern):
  """Return the rows of ``frame`` whose FIPS is below 90000 and whose FIPS,
  rendered as a string, does not match the regex ``pattern``."""
  in_range = frame.loc[frame["FIPS"] < 90000]
  is_excluded = in_range["FIPS"].astype(str).str.contains(pattern)
  return in_range.loc[~is_excluded]

# Remove the unused columns, then discard unwanted FIPS rows, one table at a time
gdp = _filter_fips(gdp.drop(columns=gdp_cols_to_drop), fips_filter)
occ = _filter_fips(occ.drop(columns=occ_cols_to_drop), fips_filter)
patterns = _filter_fips(patterns.drop(columns=pattern_cols_to_drop), fips_filter)

# Give the GDP table the same column names the other tables use
gdp = gdp.rename(columns={"IndustryClassification": "naics", "2022": "gdp"})

# Keep only the NAICS codes selected by naics_filter ...
gdp_naics_match = gdp["naics"].str.contains(naics_filter)
gdp = gdp.loc[gdp_naics_match]

# ... and, for GDP, also discard the combined-industry labels that pass that filter
combined_labels = ["11,21", "22,48-49", "31-33,51"]
gdp = gdp.loc[~gdp["naics"].isin(combined_labels)]

occ_naics_match = occ["naics"].str.contains(naics_filter)
occ = occ.loc[occ_naics_match]

patterns_naics_match = patterns["naics"].str.contains(naics_filter)
patterns = patterns.loc[patterns_naics_match]

## Create new cols
### Compress

In [58]:
# Compress a NAICS code into a coarser label
def map_naics_code(naics_code):
  """Map a NAICS code string to a coarser label.

  The first three characters of ``naics_code`` are parsed as an integer
  prefix, which selects the label:

  * prefix below 310 -> the first two characters of that integer
    (e.g. "113" -> "11"; a bare two-digit "31" -> "31")
  * 321 or 327-339 -> "321,327-339"
  * 311-316 or 322-326 -> "311-316,322-326"
  * any other prefix of 310 or above -> "31-33"

  Raises ``ValueError`` if the first three characters are not an integer.
  """
  prefix = int(naics_code[:3])

  # Below the 31x range: fall back to the two-digit sector
  if prefix < 310:
    return str(prefix)[:2]

  if prefix == 321 or 327 <= prefix <= 339:
    return "321,327-339"

  if 311 <= prefix <= 316 or 322 <= prefix <= 326:
    return "311-316,322-326"

  # Everything else at or above 310
  return "31-33"

# Add the compressed NAICS label as a new "naics_2" column on both tables
for table in (occ, patterns):
  table["naics_2"] = table["naics"].apply(map_naics_code)

### Aggregate GDP by NAICS and by FIPS

In [59]:
def _gdp_total_by(frame, key, total_col):
  """Sum the ``gdp`` column within each ``key`` group and return a two-column
  frame (``key``, ``total_col``)."""
  return (
    frame.groupby([key])["gdp"]
    .sum()
    .reset_index()
    .rename(columns={"gdp": total_col})
  )

# Total GDP per NAICS label and per FIPS code.
# NOTE(review): the per-FIPS total sums every retained NAICS row. If the table
# holds both "31-33" and its sub-groupings ("321,327-339", "311-316,322-326"),
# manufacturing would be counted twice in gdp_fips -- confirm against the data.
gdp_naics = _gdp_total_by(gdp, "naics", "gdp_naics")
gdp_fips = _gdp_total_by(gdp, "FIPS", "gdp_fips")

# Attach both totals to every GDP row
gdp = (
  gdp
  .merge(gdp_naics, on="naics", how="inner")
  .merge(gdp_fips, on="FIPS", how="inner")
)

## Save DataFrames

In [60]:
def _write_pickle(obj, path):
  """Pickle ``obj`` to ``path`` using the highest available protocol."""
  with open(path, "wb") as sink:
    pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

# Persist the three processed tables
_write_pickle(gdp, "../data/processed/gdp.pkl")
_write_pickle(occ, "../data/processed/occupation.pkl")
_write_pickle(patterns, "../data/processed/patterns.pkl")