In [1]:
from pandas import read_fwf
import numpy as np
import pandas as pd

In [2]:
# Notebook purpose: load the Zambia 2018 Standard DHS data files
# (https://dhsprogram.com/data/dataset/Zambia_Standard-DHS_2018.cfm)
# and export derived tables to CSV.

# Root directory that holds the survey data
data_folder = '../Data/DHSSurvey/'
# Sub-directory (relative to data_folder) containing the recode files
employee_survey_folder = 'ZMIR71FL/ZMIR71DT/'
# File prefix shared by the .DTA data file and the .DO label file
filename = 'ZMIR71FL'
# Full path of the Stata data file
filepath = f"{data_folder}{employee_survey_folder}{filename}.DTA"

In [3]:
# Load the survey with its raw numeric codes and substitute the text labels by
# hand. This is the workaround for files whose value labels are repeated, see
# https://stackoverflow.com/questions/31782283/loading-stata-file-categorial-values-must-be-unique
with pd.io.stata.StataReader(filepath) as sr:
    # Mapping: label-set name -> {numeric code: text label}
    value_labels = sr.value_labels()

df = pd.read_stata(
    filepath,
    convert_categoricals=False,
)

# Label-set names are compared to the column names in lower case.
for col, code_to_label in value_labels.items():
    column = col.lower()
    if column in df.columns:
        # Assign the result back to the frame. Calling
        # ``df[column].replace(..., inplace=True)`` operates on a temporary
        # Series: pandas warns about it as chained assignment and, with
        # copy-on-write enabled, ``df`` is left unchanged.
        df[column] = df[column].replace(code_to_label)

In [4]:
# df

In [5]:
# Variables kept for the employment analysis, with their Stata variable labels:
#   v005   Women's individual sample weight (6 decimals)
#   v024   Region
#   v025   Type of place of residence
#   v704   Husband/partner's occupation
#   v704a  Husband/partner worked in last 7 days/12 months
#   v705   Husband/partner's occupation (grouped)
#   v714   Respondent currently working
#   v714a  Respondent has a job, but currently absent
#   v716   Respondent's occupation
#   v717   Respondent's occupation (grouped)
#   v719   Respondent works for family, others, self
#   v721   NA - Respondent works at home or away
labels_services = [
    'v005',
    'v024',
    'v025',
    'v704',
    'v704a',
    'v705',
    'v714',
    'v714a',
    'v716',
    'v717',
    'v719',
    'v721',
]
# Keep every row, restricted to the columns listed above.
df_services = df.loc[:, labels_services]

In [6]:
# Parse the .DO file that sits next to the data file; it is the source of the
# column descriptions used to build the renaming dictionary.
# The first two lines are skipped, so the third line is taken as the header row.
do_path = data_folder + employee_survey_folder + filename + '.DO'
do = read_fwf(do_path, skiprows=2)

In [7]:
# Map each variable name to its description. The two columns of `do` are named
# 'caseid' and '"Case Identification"' because that line of the .DO file was
# consumed as the header row.
# The descriptions are parsed with their surrounding double quotes, which would
# otherwise end up inside the renamed column labels, so they are stripped here.
# Non-string entries (e.g. NaN) are passed through unchanged.
col_dict = {
    variable: label.strip('"') if isinstance(label, str) else label
    for variable, label in zip(do['caseid'], do['"Case Identification"'])
}

In [8]:
# Replace the variable codes in the column headers with their descriptions.
df_services = df_services.rename(col_dict, axis='columns')

In [9]:
# Display the selected columns under their descriptive labels.
df_services

Unnamed: 0,"""Women's individual sample weight (6 decimals)""","""Region""","""Type of place of residence""","""Husband/partner's occupation""","""Husband/partner worked in last 7 days/12 months""","""Husband/partner's occupation (grouped)""","""Respondent currently working""","""Respondent has a job, but currently absent""","""Respondent's occupation""","""Respondent's occupation (grouped)""","""Respondent works for family, others, self""","""NA - Respondent works at home or away"""
0,1892890,eastern,rural,"subsistence farmers, fishers, hunters and gat...",worked last 7 days,agricultural - self employed,yes,,market-oriented skilled agricultural workers,agricultural - self employed,for family member,
1,1892890,eastern,rural,"subsistence farmers, fishers, hunters and gat...",worked last 12 months,agricultural - self employed,yes,,sales workers,sales,self-employed,
2,1892890,eastern,rural,"subsistence farmers, fishers, hunters and gat...",worked last 12 months,agricultural - self employed,no,no,"subsistence farmers, fishers, hunters and gat...",agricultural - self employed,for family member,
3,1892890,eastern,rural,"subsistence farmers, fishers, hunters and gat...",worked last 12 months,agricultural - self employed,no,no,"subsistence farmers, fishers, hunters and gat...",agricultural - self employed,self-employed,
4,1892890,eastern,rural,"subsistence farmers, fishers, hunters and gat...",worked last 12 months,agricultural - self employed,no,no,"subsistence farmers, fishers, hunters and gat...",agricultural - self employed,for family member,
...,...,...,...,...,...,...,...,...,...,...,...,...
13678,691259,muchinga,rural,"subsistence farmers, fishers, hunters and gat...",worked last 7 days,agricultural - self employed,yes,,"subsistence farmers, fishers, hunters and gat...",agricultural - self employed,self-employed,
13679,691259,muchinga,rural,cleaners and helpers,worked last 7 days,household and domestic,yes,,"subsistence farmers, fishers, hunters and gat...",agricultural - self employed,self-employed,
13680,691259,muchinga,rural,not working and didn't work in last 12 months,didn't work last 12 months,did not work,no,no,not working and didn't work in last 12 months,not working,,
13681,691259,muchinga,rural,"subsistence farmers, fishers, hunters and gat...",worked last 7 days,agricultural - self employed,yes,,"subsistence farmers, fishers, hunters and gat...",agricultural - self employed,self-employed,


In [10]:
# Occupation share (%) of women within each region and type of place of residence.
# Rows: (v024 region, v025 residence type); columns: v717 grouped occupation.
# v005 is the women's sample weight stored with 6 decimals, hence the division
# by 1,000,000 before summing up the shares.
weighted_counts_women = (
    df.groupby(['v024', 'v025', 'v717'])['v005'].sum().unstack(fill_value=0) / 1000000
)
# Normalise every row so that its occupations add up to 100 %.
occupation_share_women = weighted_counts_women.div(weighted_counts_women.sum(axis=1), axis=0) * 100
# Format each value as a string with two decimals. Series.map is applied column
# by column instead of DataFrame.applymap, which is deprecated since pandas 2.1.
occupation_share_women = occupation_share_women.apply(
    lambda column: column.map('{:.2f}'.format)
)
occupation_share_women

Unnamed: 0_level_0,v717,agricultural - self employed,clerical,household and domestic,not working,professional/technical/managerial,sales,services,skilled manual,unskilled manual
v024,v025,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
central,rural,34.54,0.06,1.77,46.06,2.55,8.39,0.56,0.2,5.87
central,urban,0.72,1.06,4.93,53.55,6.88,24.19,4.39,0.76,3.51
copperbelt,rural,24.03,0.0,2.18,49.64,2.05,13.59,1.27,0.59,6.65
copperbelt,urban,2.2,0.65,6.1,55.97,6.3,17.63,4.63,1.67,4.86
eastern,rural,34.1,0.14,1.44,39.89,1.73,8.65,0.42,0.98,12.64
eastern,urban,9.01,0.73,3.51,40.21,9.77,21.65,5.57,1.96,7.6
luapula,rural,43.38,0.0,0.84,34.4,1.86,13.59,0.4,0.45,5.09
luapula,urban,13.76,0.66,4.97,49.12,5.32,18.48,2.69,0.82,4.17
lusaka,rural,7.22,0.0,7.13,60.24,4.56,12.79,1.37,0.26,6.43
lusaka,urban,1.12,1.6,8.47,46.2,7.61,20.58,4.56,1.69,8.17


In [11]:
# Occupation share (%) of women within each region, for comparison with
# table 3.7.1 of the DHS report.
# v005 is the women's sample weight stored with 6 decimals, hence the division
# by 1,000,000.
weighted_counts_women_region = (
    df.groupby(['v024', 'v717'])['v005'].sum().unstack(fill_value=0) / 1000000
)
# Drop the 'not working' column so the shares are computed over the remaining
# occupation groups only.
weighted_counts_women_region = weighted_counts_women_region.drop(columns=['not working'])
# Normalise every row so that its occupations add up to 100 %.
occupation_share_women_DHSreport = (
    weighted_counts_women_region.div(weighted_counts_women_region.sum(axis=1), axis=0) * 100
)
# Format each value as a string with two decimals. Series.map is applied column
# by column instead of DataFrame.applymap, which is deprecated since pandas 2.1.
occupation_share_women_DHSreport = occupation_share_women_DHSreport.apply(
    lambda column: column.map('{:.2f}'.format)
)
occupation_share_women_DHSreport

v717,agricultural - self employed,clerical,household and domestic,professional/technical/managerial,sales,services,skilled manual,unskilled manual
v024,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
central,45.85,0.75,5.41,7.67,26.19,3.48,0.75,9.92
copperbelt,11.52,1.24,12.4,12.74,38.04,9.3,3.39,11.36
eastern,51.28,0.36,2.86,4.64,17.26,1.83,1.84,19.94
luapula,58.6,0.25,2.91,4.31,23.72,1.51,0.86,7.85
lusaka,3.63,2.68,15.96,13.88,37.67,7.99,2.91,15.28
muchinga,57.32,0.6,3.13,2.56,25.43,3.47,1.98,5.51
north western,44.17,1.25,2.86,7.72,30.55,1.88,1.87,9.7
northern,55.83,0.23,3.31,4.08,26.2,1.99,1.26,7.1
southern,29.75,0.61,14.61,4.09,36.45,4.88,0.98,8.64
western,52.53,0.23,2.38,3.87,27.08,1.54,1.38,11.0


In [12]:
# Share (%) of each occupation within each region and type of place of
# residence for men, based on v705 (husband/partner's grouped occupation).
# NOTE(review): the rows are weighted with v005, the women's sample weight, so
# the shares describe the respondents' partners - confirm this is intended.
weighted_counts_men = (
    df.groupby(['v024', 'v025', 'v705'])['v005'].sum().unstack(fill_value=0) / 1000000
)
# Normalise every row so that its occupations add up to 100 %.
occupation_share_men = weighted_counts_men.div(weighted_counts_men.sum(axis=1), axis=0) * 100
# Format each value as a string with two decimals. Series.map is applied column
# by column instead of DataFrame.applymap, which is deprecated since pandas 2.1.
occupation_share_men = occupation_share_men.apply(
    lambda column: column.map('{:.2f}'.format)
)
occupation_share_men

Unnamed: 0_level_0,v705,agricultural - self employed,clerical,did not work,don't know,household and domestic,professional/technical/managerial,sales,services,skilled manual,unskilled manual
v024,v025,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
central,rural,53.29,0.24,5.45,0.33,3.98,4.39,6.17,3.15,11.55,11.44
central,urban,5.7,0.37,4.77,0.0,8.41,14.81,16.2,10.95,32.39,6.4
copperbelt,rural,30.18,0.71,11.36,0.0,7.31,8.08,4.71,4.84,13.09,19.72
copperbelt,urban,5.93,0.17,6.61,0.43,7.16,14.54,9.06,7.56,39.82,8.7
eastern,rural,37.28,0.41,18.94,0.43,2.8,3.12,8.97,2.41,12.12,13.53
eastern,urban,9.33,1.94,7.17,0.52,5.09,12.56,16.97,5.82,31.3,9.3
luapula,rural,69.56,0.16,2.88,0.16,1.33,5.34,5.55,1.01,6.31,7.69
luapula,urban,19.66,1.2,1.13,0.29,3.91,14.42,13.51,8.97,24.01,12.9
lusaka,rural,19.26,0.09,11.62,0.5,6.74,10.09,5.09,9.05,18.78,18.77
lusaka,urban,1.85,0.85,4.38,0.0,5.69,17.09,16.26,8.52,37.73,7.63


In [14]:
# Write both occupation-share tables to CSV files inside the data folder.
occupation_share_men.to_csv(f"{data_folder}employee_survey_men.csv")
occupation_share_women.to_csv(f"{data_folder}employee_survey_women.csv")