In [1]:
##### Cleans Canadian labor data
# removes unnecessary variables and geographies and reformats data

import os
import pandas as pd

In [2]:
##### Load data

# Base folder for all paths: the PARENT of the current working directory
# (os.path.dirname strips the last component of os.getcwd()).
cd = os.path.dirname(os.getcwd())

# Input and output locations, all built from the base folder
labor_csv = f"{cd}/Data/Raw/Sub_National/CAD_ag_census/farm_labor_01152026.csv"
ccs_csv = f"{cd}/Data/Correspondence_tables/CAD_CCS.csv"
save_path = f"{cd}/Data/Clean/Labor/CAD_labor.csv"

# Read the raw farm labor table and the CCS correspondence table
labor = pd.read_csv(labor_csv)
CAD_CCS = pd.read_csv(ccs_csv)

In [3]:
# Show the column labels of the raw labor table before cleaning
labor.columns

Index(['REF_DATE', 'GEO', 'DGUID', 'Paid agricultural workers',
       'Unit of measure', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID',
       'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED',
       'DECIMALS'],
      dtype='object')

In [4]:
##### Clean labor
# Reduces the raw labor table to one value per CCSUID, aligned to the full
# CCS correspondence table, with columns ['CCSUID', 'Units', '2021'].

# drop unnecessary columns and keep only total farm workers;
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the raw table (avoids SettingWithCopyWarning)
columns_to_keep = ['GEO', 'DGUID', 'Paid agricultural workers', 'VALUE']
is_total = labor['Paid agricultural workers'] == 'Agricultural workers, total'
labor = labor.loc[is_total, columns_to_keep].copy()

# split GEO ("<name> [<code>]") to get names and codes;
# expand=False returns a Series rather than a one-column DataFrame
labor['Geo_name'] = labor['GEO'].str.extract(r"^(.*?)(?:\s*\[)", expand=False)
labor['Geo_code'] = labor['GEO'].str.extract(r"\[(.*?)\]", expand=False)

# keep only data on census subdivisions;
# na=False treats rows whose GEO has no bracketed code as non-matches
# instead of producing NaN in the mask (which makes boolean indexing raise)
is_ccs = labor['Geo_code'].str.startswith("CCS", na=False)
labor = labor.loc[is_ccs].copy()

# extract CCSUID (last 7 characters of DGUID, as an integer)
labor['CCSUID'] = labor['DGUID'].str[-7:].astype('int64')

# merge with full CCS; how='right' keeps every CCSUID in the correspondence
# table, leaving VALUE missing where the labor table has no matching row
labor = labor.merge(CAD_CCS, on='CCSUID', how='right')

# fill missing with 0's
# NOTE(review): this also zeroes any VALUE that was already missing in the
# raw file (not only CCSUIDs absent from it) - confirm that is intended
labor['2021'] = labor['VALUE'].fillna(0)

# add units
labor['Units'] = 'Ag labor - jobs'

# re-order columns
columns_to_keep = ['CCSUID', 'Units', '2021']
labor = labor[columns_to_keep]


In [5]:
# Save cleaned data
# Create the output folder first: to_csv does not create missing directories
# and would otherwise fail if the folder does not exist yet.
save_dir = os.path.dirname(save_path)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)
labor.to_csv(save_path, index=False)
