In [1]:
##### Cleans Brazil labor data
# extracts from excel format, combines, and cleans

import os
import pandas as pd
import numpy as np
from pathlib import Path
import re

In [2]:
##### Load data

# Project root: the PARENT of the current working directory (not the working
# directory itself). All data paths below are built relative to it, so the
# notebook is presumably meant to be run from a folder one level below the root.
cd = os.path.dirname(os.getcwd())

# Import data: municipality correspondence table, used further down to attach
# municipality codes to the labor data
BRA_codes = pd.read_csv(f"{cd}/Data/Correspondence_tables/BRA_municipalities.csv")

# Set save path for the cleaned output CSV
save_path = f"{cd}/Data/Clean/Labor/BRA_municipality_labor_IBGE.csv"

In [3]:
##### Define function to clean IBGE spreadsheet

def clean_ibge_spreadsheet(path):
    """Extract municipality-level agricultural employment from one IBGE workbook.

    The sheet is read without a header row and only its first three columns are
    used; they are treated as region, sector and the 2022 value. Title/header
    rows are discarded, each region label is forward-filled onto the unlabelled
    rows below it, and the first remaining region label is taken as the state
    name (its rows are the state aggregate and are dropped).

    Parameters
    ----------
    path : str or path-like
        Location of the Excel file, passed to ``pandas.read_excel``.

    Returns
    -------
    pandas.DataFrame
        Columns ``state``, ``municipality`` and ``2022``, one row per
        agriculture sector row, with a fresh 0..n-1 index. ``2022`` is numeric;
        ``"-"``, ``"0"`` and non-numeric entries become NaN.

    Raises
    ------
    ValueError
        If no region label is left after the header rows are removed.
    """
    agriculture = "Agricultura, pecuária, produção florestal, pesca e aquicultura"

    # Regex fragments identifying title/header rows in the region column.
    # "Ano" must be a whole word: as a bare case-insensitive substring it also
    # matches municipality names such as "Suzano" or "Canoas", which would
    # delete their label row and forward-fill their data onto the previous
    # municipality.
    junk_patterns = [
        r"Tabela",
        r"Variável",
        r"Unidade da Federação",
        r"\bAno\b",
        r"Total$",
    ]

    df = pd.read_excel(path, header=None, dtype=str)

    # Keep only relevant columns
    df = df.iloc[:, :3]
    df.columns = ["region", "sector", "2022"]
    df = df.dropna(how="all")

    mask_junk = df["region"].str.contains(
        "|".join(junk_patterns),
        case=False,
        na=False,
    )
    # Copy so the column assignments below act on an independent frame rather
    # than on a slice of the raw sheet.
    df = df[~mask_junk].copy()

    # Forward-fill region names onto the unlabelled sector rows beneath them
    df["region"] = df["region"].ffill()

    # The first labelled row left is taken to be the state itself
    labelled = df["region"].dropna()
    if labelled.empty:
        raise ValueError("No region labels found after removing header rows.")
    state_name = labelled.iloc[0]
    # Strip like the other text columns; the raw label is still used for the
    # aggregate-row comparison below.
    df["state"] = state_name.strip()

    # Extract municipality: the region label without a trailing " (UF)" code
    df["municipality"] = (
        df["region"]
        .str.extract(r"^(.*?)(?:\s*\([A-Z]{2}\))?$")[0]
        .str.strip()
    )

    # Drop state aggregate rows
    df = df[df["region"] != state_name].copy()

    # Clean values. NOTE(review): "0" is treated as missing as well as "-";
    # confirm that a literal zero should not be kept as 0.
    df["2022"] = df["2022"].replace(["-", "0"], np.nan)
    df["2022"] = pd.to_numeric(df["2022"], errors="coerce")

    df["sector"] = df["sector"].str.strip()

    # Only agriculture
    df = df[df["sector"] == agriculture]

    return df[["state", "municipality", "2022"]].reset_index(drop=True)

In [4]:
##### Combine all labor data

def clean_all_ag_labor(cd):
    """Clean every IBGE workbook in the ag_labor folder and stack the results.

    Parameters
    ----------
    cd : str or path-like
        Project root; workbooks are read from
        ``<cd>/Data/Raw/Sub_National/Brazil/ag_labor/*.xlsx``.

    Returns
    -------
    pandas.DataFrame
        The per-file outputs of ``clean_ibge_spreadsheet`` concatenated in
        file-name order, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no workbook could be processed (including when none were found).

    Notes
    -----
    A workbook that raises while being cleaned is reported with ``print`` and
    skipped, so the result may be missing files; check the printed messages.
    """
    folder = Path(cd) / "Data" / "Raw" / "Sub_National" / "Brazil" / "ag_labor"

    dfs = []

    # glob yields entries in no guaranteed order; sorting makes the row order
    # of the combined table reproducible across runs and platforms.
    for file in sorted(folder.glob("*.xlsx")):
        # Excel writes "~$<name>.xlsx" lock files beside open workbooks; they
        # match the pattern but are not readable spreadsheets.
        if file.name.startswith("~$"):
            continue
        try:
            dfs.append(clean_ibge_spreadsheet(file))
        except Exception as e:
            print(f"⚠️ Failed to process {file.name}: {e}")

    if not dfs:
        raise ValueError("No files were successfully processed.")

    return pd.concat(dfs, ignore_index=True)

# Build the combined labor table from the workbooks found under the project root `cd`
df = clean_all_ag_labor(cd)

In [5]:
##### Clean

# create MATCH column (state name + municipality name) used as the join key
df['MATCH'] = df['state'] + df['municipality']
BRA_codes['MATCH'] = BRA_codes['STATE_name'] + BRA_codes['IBGE_name']


# merge to get admin codes; the outer join keeps rows from both tables, and the
# indicator column records which side each row came from
labor = df.merge(BRA_codes, on='MATCH', how='outer', indicator='_match_source')

# report labor rows whose name found no entry in the correspondence table:
# they stay in the output with a missing MUNIP_CODE and would otherwise go unnoticed
unmatched = labor.loc[labor['_match_source'] == 'left_only', 'MATCH']
if not unmatched.empty:
    print(
        f"⚠️ {len(unmatched)} labor rows have no matching municipality code, "
        f"e.g. {unmatched.head(5).tolist()}"
    )

# add units
labor['Units'] = 'Ag labor - jobs'

# reorder columns (this also drops the helper columns used for the merge)
columns_to_keep = ['MUNIP_CODE', 'Units', '2022']
labor = labor[columns_to_keep]

In [6]:
##### Save cleaned data
# Create the destination folder first so the write does not fail when
# the output directory has not been created yet
Path(save_path).parent.mkdir(parents=True, exist_ok=True)
labor.to_csv(save_path, index=False)