# Data Preprocessing

## Preperation

Import packages and set globals

In [361]:
import pandas as pd
import numpy as np

pd.set_option("display.width", 1000)
pd.set_option("display.max_columns", None)
pd.options.display.float_format = "{:.2f}".format

Importing the pickle files

In [362]:
gdp_df = pd.read_pickle("../data/raw/gdp.pkl")
occ_df = pd.read_pickle("../data/raw/naics_occupation.pkl")
ptn_df = pd.read_pickle("../data/raw/naics_pattern.pkl")

top_df = pd.read_pickle("../data/processed/top_picks.pkl")

## Filtering

Filters are based on previous research within the DataFrames (Step 2)

Establish filter:

- NAIC
- Rough NAICS (for gdp)
- FIPS
- OCC

In [363]:
naics_filter = top_df["naics"]
naics_rough_filter = ["321,327-339"]

fips_filter_inverse = ["0$", "999$"]
fips_filter_inverse = "|".join(fips_filter_inverse)

occ_filter = top_df["occ"]

Apply filters

In [364]:
# NAICS
gdp_df = gdp_df[gdp_df["IndustryClassification"].isin(naics_rough_filter)]
occ_df = occ_df[occ_df["naics"].isin(naics_filter)]
ptn_df = ptn_df[ptn_df["naics"].isin(naics_filter)]

# FIPS
gdp_df = gdp_df[~gdp_df["FIPS"].astype(str).str.contains(fips_filter_inverse)]
occ_df = occ_df[~occ_df["FIPS"].astype(str).str.contains(fips_filter_inverse)]
ptn_df = ptn_df[~ptn_df["FIPS"].astype(str).str.contains(fips_filter_inverse)]

# OCC
occ_df = occ_df[occ_df["OCC_CODE"].isin(occ_filter)]

## Drop Features

Establish arrays of features that should be dropped

In [365]:
cols_to_drop_gdp = ["GeoName", "Region", "TableName", "LineCode", "Description", "Unit"]
cols_to_drop_occ = ["State_GEOID", "NAICS_TITLE", "OCC_TITLE"]
cols_to_drop_pat = ["State_GEOID", "County_GEOID", "naics_2", "DESCRIPTION", "emp_nf", "emp", "qp1_nf", "qp1", "ap_nf", "n<5", "n5_9", "n10_19", "n20_49", "n50_99", "n100_249", "n250_499", "n500_999", "n1000", "n1000_1", "n1000_2", "n1000_3", "n1000_4"]

In [366]:
gdp_df = gdp_df.drop(cols_to_drop_gdp, axis=1)
occ_df = occ_df.drop(cols_to_drop_occ, axis=1)
ptn_df = ptn_df.drop(cols_to_drop_pat, axis=1)

## Feature Engineering

In [367]:
master_df = pd.DataFrame()

In [368]:
gdp_df = gdp_df.sort_values(by="FIPS", ascending=True).reset_index(drop=True)
occ_df = occ_df.sort_values(by="FIPS", ascending=True).reset_index(drop=True)
ptn_df = ptn_df.sort_values(by="FIPS", ascending=True).reset_index(drop=True)

### FIPS

In [369]:
master_df["FIPS"] = gdp_df["FIPS"]

### OCC Employment

#### For NAICS1

In [370]:
for index, element in enumerate(top_df["occ"]): 
  occ_emp = occ_df[occ_df["OCC_CODE"] == element]
  occ_emp = occ_emp[occ_emp["naics"] == top_df["naics"][0]]

  master_df = pd.merge(master_df, occ_emp[["FIPS", "emp_occupation"]], on="FIPS", how="left")
  master_df = master_df.rename({"emp_occupation": f"naics1_occ{index+1}"}, axis=1)

#### For NAICS2-5

In [371]:
occ_emp_rest = occ_df[occ_df["naics"] != top_df["naics"][0]]
occ_emp_rest = occ_emp_rest.groupby("FIPS")["emp_occupation"].sum().reset_index()

master_df = pd.merge(master_df, occ_emp_rest, on="FIPS", how="left")
master_df = master_df.rename({"emp_occupation": "naics2-5_occ1-5"}, axis=1)

### Establishments

#### For NAICS1

In [372]:
est_naicsx = ptn_df[ptn_df["naics"] == top_df["naics"][0]]

master_df = pd.merge(master_df, est_naicsx[["FIPS", "est"]], on="FIPS", how="left")
master_df = master_df.rename({"est": "naics1_est"}, axis=1)

#### For NAICS2-3

In [373]:
est_naicsx = ptn_df[ptn_df["naics"].isin(top_df["naics"][1:3])]
est_naicsx = est_naicsx.groupby("FIPS")["est"].sum().reset_index()

master_df = pd.merge(master_df, est_naicsx[["FIPS", "est"]], on="FIPS", how="left")
master_df = master_df.rename({"est": "naics2-3_est"}, axis=1)

#### For NAICS4-5

In [374]:
est_naicsx = ptn_df[ptn_df["naics"].isin(top_df["naics"][4:])]
est_naicsx = est_naicsx.groupby("FIPS")["est"].sum().reset_index()

master_df = pd.merge(master_df, est_naicsx[["FIPS", "est"]], on="FIPS", how="left")
master_df = master_df.rename({"est": "naics4-5_est"}, axis=1)

### Payment

#### For NAIC1

In [375]:
ap_naicsx = ptn_df[ptn_df["naics"] == top_df["naics"][0]]

master_df = pd.merge(master_df, ap_naicsx[["FIPS", "ap"]], on="FIPS", how="left")
master_df = master_df.rename({"ap": "naics1_ap"}, axis=1)

#### For NAIC2

In [376]:
ap_naicsx = ptn_df[ptn_df["naics"] == top_df["naics"][1]]

master_df = pd.merge(master_df, ap_naicsx[["FIPS", "ap"]], on="FIPS", how="left")
master_df = master_df.rename({"ap": "naics2_ap"}, axis=1)

#### For NAIC3-5

In [377]:
ap_naicsx = ptn_df[ptn_df["naics"].isin(top_df["naics"][2:])]
ap_naicsx = ap_naicsx.groupby("FIPS")["ap"].sum().reset_index()

master_df = pd.merge(master_df, ap_naicsx[["FIPS", "ap"]], on="FIPS", how="left")
master_df = master_df.rename({"ap": "naics3-5_ap"}, axis=1)

### Mean GDP

In [378]:
gdp_df["mean_gdp"] = gdp_df[["2017", "2018", "2019", "2020", "2021", "2022"]].mean(axis=1)

master_df = pd.merge(master_df, gdp_df[["FIPS", "mean_gdp"]], on="FIPS", how="left")

## Cleanup

In [379]:
master_df = master_df.fillna(0)

In [380]:
cols_to_round = master_df.columns[1:7]
master_df[cols_to_round] = np.ceil(master_df[cols_to_round])

In [381]:
cols_to_change_dtype = master_df.columns[1:13]
master_df[cols_to_change_dtype] = master_df[cols_to_change_dtype].astype(int)

In [383]:
master_df = master_df.set_index("FIPS")

In [385]:
master_df.head(10)

Unnamed: 0_level_0,naics1_occ1,naics1_occ2,naics1_occ3,naics1_occ4,naics1_occ5,naics2-5_occ1-5,naics1_est,naics2-3_est,naics4-5_est,naics1_ap,naics2_ap,naics3-5_ap,mean_gdp
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1001,0,0,0,0,0,0,0,0,0,0,0,0,75548.67
1003,0,0,0,0,0,171,0,6,4,0,0,76744,409094.0
1005,0,0,0,0,0,0,0,0,0,0,0,0,96016.67
1007,0,0,0,0,0,0,0,0,0,0,0,0,49192.17
1009,0,0,0,0,0,0,0,0,0,0,0,0,48779.83
1011,0,0,0,0,0,0,0,0,0,0,0,0,905.0
1013,0,0,0,0,0,0,0,0,0,0,0,0,165634.33
1015,0,0,0,0,0,0,0,0,0,0,0,0,820894.83
1017,0,0,0,0,0,239,0,0,0,0,0,38175,215207.17
1019,0,0,0,0,0,0,0,0,0,0,0,0,48297.17


## Export

In [None]:
pd.to_pickle(master_df, "../data/processed/master_df.pkl")