In [1]:
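##### Clean South Africa agricultural labor and capital stock data
# Loads the provincial census tables, combines them into one national table,
# and reformats the result into municipality-level labor and capital stock files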

import os
import warnings

import lxml
import pandas as pd

In [2]:
##### Load data

# Base directory for every data path below: the PARENT of the current working directory
cd = os.path.split(os.getcwd())[0]

# Import data
def load_stats_sa_html(path):
    """Return the first table pandas finds in the HTML document at ``path``.

    The file is parsed with ``pd.read_html`` (not an Excel reader), and any
    further tables in the document are ignored.
    """
    return pd.read_html(path)[0]

# One source file per province, listed in the same order as the names unpacked below
province_files = [
    f"{cd}/Data/Raw/Sub_National/South_Africa/Census of commercial agriculture Eastern Cape Financial and production statistics.xls",
    f"{cd}/Data/Raw/Sub_National/South_Africa/Census of commercial agriculture Free State Financial and production statistics.xls",
    f"{cd}/Data/Raw/Sub_National/South_Africa/Census of commercial agriculture Gauteng Financial and production statistics.xls",
    f"{cd}/Data/Raw/Sub_National/South_Africa/Census of commercial agriculture Kwa-Zulu Natal Financial and production statistics.xls",
    f"{cd}/Data/Raw/Sub_National/South_Africa/Census of commercial agriculture Limpopo Financial and production statistics.xls",
    f"{cd}/Data/Raw/Sub_National/South_Africa/Census of commercial agriculture Mpumalanga Financial and production statistics.xls",
    f"{cd}/Data/Raw/Sub_National/South_Africa/Census of commercial agriculture North West Financial and production statistics.xls",
    f"{cd}/Data/Raw/Sub_National/South_Africa/Census of commercial agriculture Northern Cape Financial and production statistics.xls",
    f"{cd}/Data/Raw/Sub_National/South_Africa/Census of commercial agriculture Western Cape Financial and production statistics.xls",
]

# Files are read one at a time, top to bottom; each province keeps its own name
(
    Eastern_Cape,
    Free_State,
    Gauteng,
    KwaZulu_Natal,
    Limpopo,
    Mpumalanga,
    North_West,
    Northern_Cape,
    Western_Cape,
) = map(load_stats_sa_html, province_files)

# Correspondence table, merged on ADMN3_NAME later to attach the CC_3 codes
ZAF_codes = pd.read_csv(f"{cd}/Data/Correspondence_tables/ZAF_municipalities.csv")

# Output locations for the two cleaned files
save_path_capital = f"{cd}/Data/Clean/Capital_stock/ZAF_capital_stock.csv"
save_path_labor = f"{cd}/Data/Clean/Labor/ZAF_labor.csv"

In [3]:
##### Clean data

# stack the nine provincial tables into one frame with a fresh 0..n-1 index
all_provinces = pd.concat(
    [
        Eastern_Cape,
        Free_State,
        Gauteng,
        KwaZulu_Natal,
        Limpopo,
        Mpumalanga,
        North_West,
        Northern_Cape,
        Western_Cape,
    ],
    ignore_index=True,
)

# the only H06 variables carried forward (labor items first, then machinery items)
vars_to_keep = [
    'Family members involved in agricultural activities',
    'Paid employees',
    'Dryers for agricultural products',
    'Harvesters',
    'Mechanical appliances for chemical application',
    'Milking and dairy machines',
    'Other machinery and equipment for farming purposes',
    'Poultry-keeping machinery',
    'Presses and crushes and similar machinery',
    'Tractors'
]

# A row survives only if it passes every test below:
#   - its H06 variable is one of vars_to_keep
#   - H04 is present and H05 is not 99 (the province / enterprise rows)
#   - H07 is neither 'Male' nor 'Female' (the gender breakdown rows)
keep_row = (
    all_provinces['H06'].isin(vars_to_keep)
    & all_provinces['H04'].notna()
    & (all_provinces['H05'] != 99)
    & ~all_provinces['H07'].isin(['Male', 'Female'])
)
South_Africa = all_provinces[keep_row]

In [4]:
##### Split into capital and labor

# H06 variables that count people
labor_var = [
    'Family members involved in agricultural activities',
    'Paid employees',
]

# H06 variables that count machinery and equipment
capital_var = [
    'Dryers for agricultural products',
    'Harvesters',
    'Mechanical appliances for chemical application',
    'Milking and dairy machines',
    'Other machinery and equipment for farming purposes',
    'Poultry-keeping machinery',
    'Presses and crushes and similar machinery',
    'Tractors',
]

# route each row to the labor or capital table according to its H06 variable
is_labor_row = South_Africa['H06'].isin(labor_var)
is_capital_row = South_Africa['H06'].isin(capital_var)

labor = South_Africa.loc[is_labor_row]
capital_stock = South_Africa.loc[is_capital_row]


In [5]:
##### Final clean

# sum by municipality, label the units, and attach geography codes
def _municipality_totals(records, units, codes, columns):
    """Collapse rows to one 2017 total per H04 value and attach geography codes.

    Parameters
    ----------
    records : pandas.DataFrame
        Rows to aggregate; must contain the 'H04' and 'Y2017' columns.
    units : str
        Text written to the 'Units' column of every output row.
    codes : pandas.DataFrame
        Correspondence table; must contain 'ADMN3_NAME' plus every code column
        named in ``columns``.
    columns : list of str
        Columns, in order, to keep in the returned frame.

    Returns
    -------
    pandas.DataFrame
        One row per match between an H04 name and ``codes``, restricted to
        ``columns``.

    Warns
    -----
    UserWarning
        When one or more H04 names are absent from ``codes['ADMN3_NAME']``.
        Those rows are still dropped by the inner merge (the output is the same
        as before), but the loss is now reported instead of silent.
    """
    # NOTE(review): totals are keyed on the H04 name alone, so two municipalities
    # that share a name would be summed together - confirm H04 is unique nationally.
    totals = (
        records.groupby('H04')['Y2017']
        .sum()
        .reset_index()
        .rename(columns={'H04': 'ADMN3_NAME', 'Y2017': '2017'})
    )
    totals['Units'] = units

    # Surface names the inner merge below is about to discard
    has_code = totals['ADMN3_NAME'].isin(codes['ADMN3_NAME'])
    if not has_code.all():
        missing = totals.loc[~has_code, 'ADMN3_NAME'].tolist()
        warnings.warn(
            f"{units}: {len(missing)} municipality name(s) have no match in the "
            f"correspondence table and are dropped by the inner merge: {missing}",
            stacklevel=2,
        )

    coded = totals.merge(codes, on='ADMN3_NAME', how='inner')
    return coded[columns]


# columns (and their order) written to each output file
capital_to_keep = ['CC_3', 'Units', '2017']
labor_to_keep = ['CC_3', 'Units', '2017']

labor_sum = _municipality_totals(
    labor,
    'Ag labor - jobs',
    ZAF_codes,
    labor_to_keep,
)
capital_stock_sum = _municipality_totals(
    capital_stock,
    'Ag capital stock - count of tractors, machinery, and equipment',
    ZAF_codes,
    capital_to_keep,
)

In [6]:
##### Save cleaned data
# to_csv does not create missing folders, so make sure each output directory
# exists before writing (keeps a fresh checkout runnable end to end)
for frame, destination in (
    (labor_sum, save_path_labor),
    (capital_stock_sum, save_path_capital),
):
    os.makedirs(os.path.dirname(destination), exist_ok=True)
    frame.to_csv(destination, index=False)