# Data Preprocessing


## Preparation

Import packages and set globals


In [87]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Notebook-wide pandas display settings: wide console output, no column
# truncation, and floats rendered with two decimals.
for option_name, option_value in (
    ("display.width", 1000),
    ("display.max_columns", None),
    ("display.float_format", "{:.2f}".format),
):
    pd.set_option(option_name, option_value)

In [88]:
# Raw inputs: GDP, occupation-by-NAICS and pattern-by-NAICS tables.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
gdp_df, occ_df, ptn_df = map(
    pd.read_pickle,
    (
        "../data/raw/gdp.pkl",
        "../data/raw/naics_occupation.pkl",
        "../data/raw/naics_pattern.pkl",
    ),
)

# Previously selected NAICS / occupation pairs that drive the filters below.
top_df = pd.read_pickle("../data/processed/top_picks.pkl")

In [89]:
# Display the selected NAICS / occupation pairs; these columns feed the
# NAICS and OCC filters defined in the Filtering section.
top_df

Unnamed: 0,naics,occ
0,3363,51-4031
1,3364,51-4121
2,3327,51-4031
3,3330A1,51-4081
4,3320A2,17-2112


## Filtering

Filters are based on previous research within the DataFrames (Step 2)

Establish filter:

- NAICS
- Rough NAICS (for gdp)
- FIPS
- OCC


In [90]:
# Industry and occupation filters come straight from the selected pairs.
naics_filter, occ_filter = top_df["naics"], top_df["occ"]

# GDP uses a coarser industry classification label than the NAICS codes above.
naics_rough_filter = ["321,327-339"]

# Regex alternation of FIPS patterns to EXCLUDE (applied to the stringified FIPS).
# NOTE(review): "0$" matches every FIPS ending in 0, not only aggregate codes
# ending in "000" -- confirm that dropping codes such as xx510 is intended.
fips_filter_inverse = "|".join(["0$"])

Apply filters


In [91]:
def _without_excluded_fips(frame):
    """Return the rows of `frame` whose stringified FIPS does not match the exclusion regex."""
    is_excluded = frame["FIPS"].astype(str).str.contains(fips_filter_inverse)
    return frame[~is_excluded]


# Industry restriction first (GDP via the rough classification, the other two
# via the selected NAICS codes), then the FIPS exclusion on the remaining rows.
gdp_df = _without_excluded_fips(
    gdp_df[gdp_df["IndustryClassification"].isin(naics_rough_filter)]
)
occ_df = _without_excluded_fips(occ_df[occ_df["naics"].isin(naics_filter)])
ptn_df = _without_excluded_fips(ptn_df[ptn_df["naics"].isin(naics_filter)])

# Occupation data is additionally limited to the selected occupation codes.
occ_df = occ_df[occ_df["OCC_CODE"].isin(occ_filter)]

### Drop Features


In [92]:
# Columns removed from each raw frame before feature engineering.
cols_to_drop_gdp = [
    "GeoName",
    "Region",
    "TableName",
    "LineCode",
    "Description",
    "Unit",
]
cols_to_drop_occ = [
    "State_GEOID",
    "NAICS_TITLE",
    "OCC_TITLE",
]

# The pattern drop list is assembled from three groups, keeping the original order.
_pat_label_cols = ["State_GEOID", "County_GEOID", "naics_2", "DESCRIPTION"]
_pat_measure_cols = ["emp_nf", "emp", "qp1_nf", "qp1", "ap_nf"]
# Columns prefixed with "n" (presumably establishment size classes -- TODO confirm).
_pat_n_cols = [
    "n<5",
    "n5_9",
    "n10_19",
    "n20_49",
    "n50_99",
    "n100_249",
    "n250_499",
    "n500_999",
    "n1000",
    "n1000_1",
    "n1000_2",
    "n1000_3",
    "n1000_4",
]
cols_to_drop_pat = [*_pat_label_cols, *_pat_measure_cols, *_pat_n_cols]

In [93]:
# Remove the unused columns from each frame (raises KeyError if any listed column is absent).
gdp_df = gdp_df.drop(columns=cols_to_drop_gdp)
occ_df = occ_df.drop(columns=cols_to_drop_occ)
ptn_df = ptn_df.drop(columns=cols_to_drop_pat)

### Aggregate Duplicate Data


In [94]:
# Collapse rows sharing the same (FIPS, naics) pair by summing "ap" and "est",
# then turn the group keys back into ordinary columns.
pattern_groups = ptn_df.groupby(["FIPS", "naics"])
pattern_totals = pattern_groups[["ap", "est"]].sum()
ptn_df = pattern_totals.reset_index()

## Feature Engineering


In [95]:
# Start from an empty frame; the feature columns are attached to it one by one
# in the cells below, keyed on FIPS.
master_df = pd.DataFrame()

In [96]:
# Order every source frame by FIPS (ascending) and renumber its rows from 0.
gdp_df, occ_df, ptn_df = (
    frame.sort_values("FIPS").reset_index(drop=True)
    for frame in (gdp_df, occ_df, ptn_df)
)

### FIPS


In [97]:
# The distinct FIPS codes present in the GDP frame define the rows of master_df.
distinct_fips = pd.unique(gdp_df["FIPS"])
master_df["FIPS"] = distinct_fips

### OCC


In [98]:
# One feature column per entry of top_df["occ"], restricted to the first NAICS pick.
# Columns are named naics1_occ1, naics1_occ2, ... in the order of top_df["occ"].
primary_naics_rows = occ_df[occ_df["naics"] == top_df["naics"][0]]

for position, occ_code in enumerate(top_df["occ"], start=1):
    occ_emp = primary_naics_rows[primary_naics_rows["OCC_CODE"] == occ_code]
    master_df = master_df.merge(
        occ_emp[["FIPS", "emp_occupation"]].rename(
            columns={"emp_occupation": f"naics1_occ{position}"}
        ),
        on="FIPS",
        how="left",
    )

In [99]:
# Rows for every NAICS code except the first pick; reused by the next cell.
occ_rest = occ_df[occ_df["naics"] != top_df["naics"][0]]

# Occupations 1-3 across the remaining NAICS codes: employment summed per FIPS.
first_three_occs = top_df["occ"].iloc[:3]
occ_emp_rest = (
    occ_rest[occ_rest["OCC_CODE"].isin(first_three_occs)]
    .groupby("FIPS")["emp_occupation"]
    .sum()
    .reset_index()
)

master_df = master_df.merge(
    occ_emp_rest.rename(columns={"emp_occupation": "naics2-5_occ1-3"}),
    on="FIPS",
    how="left",
)

In [100]:
# 4-5OCCs NAICS2-5
occ_emp_rest = occ_rest[occ_rest["OCC_CODE"].isin(top_df["occ"][3:])]
occ_emp_rest = occ_emp_rest.groupby("FIPS")["emp_occupation"].sum().reset_index()

master_df = pd.merge(master_df, occ_emp_rest, on="FIPS", how="left")
master_df = master_df.rename({"emp_occupation": "naics2-5_occ4-5"}, axis=1)

### Est


In [101]:
# ESTs NAICS1
est_naicsx = ptn_df[ptn_df["naics"] == top_df["naics"][0]]

master_df = pd.merge(master_df, est_naicsx[["FIPS", "est"]], on="FIPS", how="left")
master_df = master_df.rename({"est": "naics1_est"}, axis=1)

# ESTs NAICS2-5
est_naicsx = ptn_df[ptn_df["naics"].isin(top_df["naics"][1:])]
est_naicsx = est_naicsx.groupby("FIPS")["est"].mean().reset_index()

master_df = pd.merge(master_df, est_naicsx[["FIPS", "est"]], on="FIPS", how="left")
master_df = master_df.rename({"est": "naics2-5_est"}, axis=1)

### Payment


In [102]:
# AP NAICS1
ap_naicsx = ptn_df[ptn_df["naics"] == top_df["naics"][0]]

master_df = pd.merge(master_df, ap_naicsx[["FIPS", "ap"]], on="FIPS", how="left")
master_df = master_df.rename({"ap": "naics1_ap"}, axis=1)

# AP NAICS2-5
ap_naicsx = ptn_df[ptn_df["naics"].isin(top_df["naics"][1:])]
ap_naicsx = ap_naicsx.groupby("FIPS")["ap"].mean().reset_index()

master_df = pd.merge(master_df, ap_naicsx[["FIPS", "ap"]], on="FIPS", how="left")
master_df = master_df.rename({"ap": "naics2-5_ap"}, axis=1)

### GDP


In [103]:
# MEAN GDP
gdp_df["newest_gdp"] = (
    gdp_df[["2017", "2018", "2019", "2020", "2021", "2022"]].ffill(axis=1).iloc[:, -1]
)

master_df = pd.merge(master_df, gdp_df[["FIPS", "newest_gdp"]], on="FIPS", how="left")

### Cleanup


In [104]:
# Remove NAs
master_df = master_df.fillna(0)

# Round Employment Numbers
cols_to_round = master_df.columns[1:7]
master_df[cols_to_round] = np.ceil(master_df[cols_to_round])

# Set FIPS as indicator
master_df = master_df.set_index("FIPS")

# Change Types for Uniformity
cols_to_change_dtype = master_df.columns
master_df[cols_to_change_dtype] = master_df[cols_to_change_dtype].astype(int)

## Norm/Scale


In [105]:
# Standardise every feature column with StandardScaler; master_df itself is
# left untouched and the scaled values go into a separate frame that keeps
# the same index and column labels.
scaler = StandardScaler()

master_df_scaled = pd.DataFrame(
    scaler.fit_transform(master_df),
    index=master_df.index,
    columns=master_df.columns,
)

## Export


In [106]:
# Preview the first rows with the notebook's rich display instead of printing
# the whole frame as plain text.
master_df.head()

       naics1_occ1  naics1_occ2  naics1_occ3  naics1_occ4  naics1_occ5  naics2-5_occ1-3  naics2-5_occ4-5  naics1_est  naics2-5_est  naics1_ap  naics2-5_ap  newest_gdp
FIPS                                                                                                                                                                  
1001             0            0            0            0            0                0                0           0             0          0            0       72611
1003             0            0            0            0            0              115              133           0             9          0        22116      487152
1005             0            0            0            0            0                0                0           0             0          0            0       85451
1007             0            0            0            0            0                2                0           0             3          0          835       6376

In [107]:
# Persist both the integer feature table and its standardised counterpart.
master_df.to_pickle("../data/processed/master_df.pkl")
master_df_scaled.to_pickle("../data/processed/master_df_scaled.pkl")