In [1]:
# Import Dependencies
import pandas as pd
from pathlib import Path

In [2]:
def bls_employment(bls_employment_xlsx):
    """Read one BLS employment workbook and return a trimmed, renamed table.

    The sheet is loaded with ``header=4`` (the first four rows are skipped and
    the next one supplies the column names) and every cell is kept as text.
    Columns that are not needed are removed, rows containing any missing value
    are discarded, and only rows whose ``"Code.1"`` value compares ``<= "56"``
    as text are retained (per the original author: the main US states).
    ``"Code.1"`` and ``"Code.2"`` are then concatenated and converted to a
    number to form ``county_fips``.

    Parameters
    ----------
    bls_employment_xlsx : str or path-like
        Location of the Excel file; handed unchanged to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        Columns ``county_fips``, ``year``, ``bls_labor_force`` and
        ``bls_employed``, in that order. Only ``county_fips`` is numeric; the
        other columns keep the text values read from the sheet. Row labels
        are the ones that survive the filtering (the index is not reset).
    """
    unused_columns = ["Code", "County Name/State Abbreviation", "Unnamed: 5", "Unemployed", "(%)"]
    renamed_columns = {"Year": "year",
                       "Force": "bls_labor_force",
                       "Employed": "bls_employed"}
    output_columns = ["county_fips", "year", "bls_labor_force", "bls_employed"]

    raw_sheet = pd.read_excel(bls_employment_xlsx, header=4, dtype=str)

    tidy = (
        raw_sheet
        # Unused columns go first so their gaps do not cause rows to be dropped
        .drop(columns=unused_columns)
        .dropna()
        # Text comparison: assumes codes are zero-padded, two-character
        # strings so that lexical order matches numeric order -- TODO confirm
        .loc[lambda frame: frame["Code.1"] <= "56"]
        .assign(county_fips=lambda frame: pd.to_numeric(frame["Code.1"] + frame["Code.2"]))
        .rename(columns=renamed_columns)
    )

    # Selecting the output columns also discards "Code.1" and "Code.2"
    return tidy[output_columns]

In [3]:
# Define base path
basePath = "./source/BLS/employment/"

# Define a list of files to process
filesToProcess = ["laucnty20.xlsx", "laucnty21.xlsx", "laucnty22.xlsx"]

# Process every file and collect the per-file results in a list.
# Concatenating once at the end avoids re-copying the accumulated rows on each
# iteration and avoids seeding the result with an empty DataFrame.
processed_frames = []
for file in filesToProcess:
    # Define a full path to the file
    bls_employment_xlsx = Path(f"{basePath}{file}")

    # Process the file
    df = bls_employment(bls_employment_xlsx)

    # Keep the processed data for the final concatenation
    processed_frames.append(df)

# Combine all processed files into one DataFrame with a fresh 0..n-1 index
bls_employment_df = pd.concat(processed_frames, ignore_index=True)

bls_employment_df.head()

Unnamed: 0,county_fips,year,bls_labor_force,bls_employed
0,1001,2020,26350,24955
1,1003,2020,98695,92639
2,1005,2020,8659,7995
3,1007,2020,8692,8063
4,1009,2020,25065,23944


In [4]:
# Write the combined table to CSV: header row included, index column omitted
bls_employment_df.to_csv(
    path_or_buf=Path("./source/bls_employment.csv"),
    header=True,
    index=False,
)