In [3]:
# Import Dependencies
import pandas as pd
from pathlib import Path

In [4]:
def bls_country_industry_metric(bls_employment_wages_by_industry_csv):
    """Summarize one BLS employment-and-wages CSV by county, industry and year.

    Rows are kept only when ``agglvl_code`` equals 73 and ``area_fips`` is a
    numeric code no greater than 56999 (the county-level rows of the main US
    states, per the filter this function has always applied). The three metric
    columns are then summed over every ``own_code`` inside each
    (county, industry, year) group.

    Parameters
    ----------
    bls_employment_wages_by_industry_csv : str or path-like
        Location of the source CSV. It must contain the columns ``area_fips``,
        ``own_code``, ``industry_code``, ``agglvl_code``, ``year``,
        ``annual_avg_estabs``, ``annual_avg_emplvl`` and ``total_annual_wages``.

    Returns
    -------
    pandas.DataFrame
        One row per (county, industry, year) with the columns ``county_fips``
        (integer), ``industry_code``, ``year``, ``bls_annual_establishments``,
        ``bls_annual_employment`` and ``bls_total_annual_wages``, in that order.

    Raises
    ------
    KeyError
        If a required column is missing from the file.
    """
    required_columns = [
        "area_fips", "own_code", "industry_code", "agglvl_code", "year",
        "annual_avg_estabs", "annual_avg_emplvl", "total_annual_wages",
    ]

    # Read area_fips as text so its type does not depend on what pandas infers
    # from a particular file (all-numeric files would otherwise become integers,
    # files containing codes such as "US000" would become strings).
    source_df = pd.read_csv(
        bls_employment_wages_by_industry_csv, dtype={"area_fips": str}
    )

    # Keep only required columns
    source_df = source_df[required_columns]

    # Compare FIPS codes numerically: non-numeric area codes become NaN and are
    # dropped by the comparison, and codes that lost their leading zero
    # ("6037" instead of "06037") are still classified correctly.
    fips_numeric = pd.to_numeric(source_df["area_fips"], errors="coerce")

    # Keep only county-level aggregations from the main US states
    is_county_row = (source_df["agglvl_code"] == 73) & (fips_numeric <= 56999)
    county_df = source_df.loc[is_county_row].assign(
        area_fips=fips_numeric.loc[is_county_row].astype("int")
    )

    # Sum the metrics across all own_code values of each county/industry/year,
    # then switch to the output column names.
    summary_df = (
        county_df
        .groupby(["area_fips", "industry_code", "year"])
        .agg(
            {"annual_avg_estabs": "sum",
             "annual_avg_emplvl": "sum",
             "total_annual_wages": "sum"}
        )
        .reset_index()
        .rename(columns={"area_fips": "county_fips",
                         "annual_avg_estabs": "bls_annual_establishments",
                         "annual_avg_emplvl": "bls_annual_employment",
                         "total_annual_wages": "bls_total_annual_wages"})
    )

    # Return the columns in the documented order
    return summary_df[["county_fips", "industry_code", "year",
                       "bls_annual_establishments", "bls_annual_employment",
                       "bls_total_annual_wages"]]

In [5]:
# Define base path
basePath = "./source/BLS/employment_wages_by_industry/"

# Define a list of files to process: one "<industry_code>_<year>.csv" file per
# industry code and year, ordered by industry code and then by year.
# NOTE(review): industry code 1025 is absent from this list — confirm whether
# that omission is intentional.
industryCodes = ["1011", "1012", "1013",
                 "1021", "1022", "1023", "1024",
                 "1026", "1027", "1028", "1029"]
yearsToProcess = ["2020", "2021", "2022"]
filesToProcess = [f"{code}_{year}.csv" for code in industryCodes for year in yearsToProcess]

# Loop through all files, process them and collect the per-file results
processed_frames = []
for file in filesToProcess:
    # Define a full path to the file
    bls_employment_wages_by_industry_csv = Path(basePath) / file

    # Process the file
    df = bls_country_industry_metric(bls_employment_wages_by_industry_csv)

    # Keep the processed data for the final concatenation
    processed_frames.append(df)

# Combine everything in a single concat: growing a DataFrame inside the loop
# re-copies all accumulated rows on every iteration, and starting from an empty
# frame relies on concat behaviour that recent pandas versions warn about.
country_industry_metric_df = pd.concat(processed_frames, ignore_index=True)

country_industry_metric_df.head()

Unnamed: 0,county_fips,industry_code,year,bls_annual_establishments,bls_annual_employment,bls_total_annual_wages
0,1001,1011,2020,24,182,11363434
1,1003,1011,2020,77,773,33427148
2,1005,1011,2020,28,267,14071797
3,1007,1011,2020,14,71,2933131
4,1009,1011,2020,13,79,2515672


In [6]:
# Persist the combined metrics as CSV: header row written, index column omitted
output_csv = Path("./source/county_industry_metric.csv")
country_industry_metric_df.to_csv(output_csv, header=True, index=False)