In [1]:

# =============================================================================
# Agricultural and Income Questionnaire: Checks and Balances
# =============================================================================
import numpy as np
import pandas as pd
import os 
os.chdir('C:/Users/rodri/Dropbox/Chied_Field_June_19/Data/Income')

# =============================================================================
# Import data
# =============================================================================
data_test = pd.read_stata("Agriculture testing-dataset.dta", convert_categoricals=False)
data_test.rename(columns={'householdid':'hhid'}, inplace=True) 
data_test.rename(columns={'interviewername':'enumerator'}, inplace=True) 
#data20 = pd.read_stata("income_200719.dta", convert_categoricals=False)
# Last version already includes previous interviews. Except for the testing.
data21 = pd.read_stata("income_230719.dta", convert_categoricals=False)
data22 = pd.read_stata("income_240719.dta", convert_categoricals=False)

datasets = [data_test, data21, data22]

data = pd.concat(datasets, sort=False)


## Look at duplicates:
pd.value_counts(data['hhid'])
# 216 seems double-entry but seem survey. Just remove one.
data.drop((data.loc[(data['hhid']==216) & (data['duration']==3666)]).index, inplace=True)
data.loc[data['intervieweename']=='Estery Yson', 'hhid'] = 602
data.loc[data['intervieweename']=='A chiwa Yahaya', 'hhid'] = 8
data.reset_index(drop=True, inplace=True)

#data.to_csv('income_data.csv')

percentiles = [0.05, 0.1, .25, .5, .75, 0.8, 0.9, 0.95, 0.99]

list_crops = ['maize', 'groundnut', 'groundbean', 'sweetpotatoe', 'fingermillet', 'sorghum', 'pearlmillet', 'soyabean', 'pigeonpeas', 'cotton', 'nkhwani', 'cassava',  'sugarcane',  'tomatoes',  'therereokra', 'tanaposi' ] 

# Rename some variables
data.rename(columns={'unitssoldpearlmillet2':'unitssoldpearlmilletout2'}, inplace=True) 
data.rename(columns={'unitssoldsoyabean2':'unitssoldsoyabeanout2'}, inplace=True)
data.rename(columns={'soldquantitygroundbeanin':'soldquantitygroundbeanin'}, inplace=True)

## Remove 9999 observations=====================================
data.replace([9999, 9999.00], np.nan, inplace=True)




# =============================================================================
# Check number, size, and value plots
# =============================================================================
data['total_plots'] = data['manyplot'].fillna(0) + data['rentinmany'].fillna(0)
sumplots = data[['manyplot','rentinmany','total_plots']].describe(percentiles=percentiles)
N_plots = int(data[['total_plots']].max())

print('===========================================')
print('Summary number of plots')
print(sumplots)
##### STOP RUN    


## Check plot size and value =========================================
#units area plots
units_plot = pd.value_counts(data['unitsareaplot_1'])
# small futbol fields are around 1 acre
# square meters to acres: 0.000247105

for i in range(1,N_plots+1):
    data['area_plot_acr_'+str(i)] = data['areaplot_'+str(i)]
    data.loc[data['unitsareaplot_'+str(i)]==2.0, 'area_plot_acr_'+str(i)] = data.loc[data['unitsareaplot_'+str(i)]==2.0, 'areaplot_'+str(i)]*2.47105
    data.loc[data['unitsareaplot_'+str(i)]==4.0, 'area_plot_acr_'+str(i)] = data.loc[data['unitsareaplot_'+str(i)]==4.0, 'areaplot_'+str(i)]*0.000247105

for i in range(1,N_plots+1):
    data['ratio_value_rent_'+str(i)] = np.nan
    data['p_acre_plot_'+str(i)]= np.nan

#Check ratio value vs rentout: 
for i in range(1,N_plots+1):
    data['ratio_value_rent_'+str(i)]  =  data['valueplot_'+str(i)] / data['rentoutplot_'+str(i)] 

## Check price per acre:
for i in range(1,N_plots+1):
    data['p_acre_plot_'+str(i)]  =  data['valueplot_'+str(i)] / data['area_plot_acr_'+str(i)] 
    
# hh aggregate variables
data['hh_area_plots'] = 0
data['hh_rentout_plots'] = 0
data['hh_value_plots'] = 0

    
### Add at household level:    
for i in range(1,N_plots+1):
    data['hh_area_plots'] += data['area_plot_acr_'+str(i)].fillna(0)
    data['hh_rentout_plots'] += data['rentoutplot_'+str(i)].fillna(0)
    data['hh_value_plots'] += data['valueplot_'+str(i)].fillna(0)


data['hh_p_acre_plots']  =  data['hh_value_plots'] / data['hh_area_plots'] 
data['hh_ratio_value_rent'] = data['hh_value_plots'] / data['hh_rentout_plots'] 


    
print('===============================================================')
print('Check: Distribution First Reported Plot')
print('===============================================================')    
sum_1plot = data[['area_plot_acr_1','rentoutplot_1','valueplot_1', 'ratio_value_rent_1', 'p_acre_plot_1']].describe(percentiles=percentiles)
print(sum_1plot)
### STOP RUN

print('===============================================================')
print('Check: Distribution Second Reported Plot')
print('===============================================================')    
sum_2plot = data[['area_plot_acr_2','rentoutplot_2','valueplot_2', 'ratio_value_rent_2','p_acre_plot_2']].describe(percentiles=percentiles)
print(sum_2plot)
##### STOP RUN


# =============================================================================
# Check: land area, rentout value, and land value at household level
# =============================================================================
sum_hhplots = data[['hh_area_plots','hh_rentout_plots','hh_value_plots', 'hh_ratio_value_rent', 'hh_p_acre_plots']].describe(percentiles=percentiles)
print('')
print('===============================================================')
print('Check: Distribution land at household level')
print('===============================================================')
print(sum_hhplots)



Summary number of plots
         manyplot  rentinmany  total_plots
count  139.000000        11.0   139.000000
mean     1.805755         1.0     1.884892
std      1.006340         0.0     0.993305
min      0.000000         1.0     0.000000
5%       1.000000         1.0     1.000000
10%      1.000000         1.0     1.000000
25%      1.000000         1.0     1.000000
50%      2.000000         1.0     2.000000
75%      2.000000         1.0     2.000000
80%      2.000000         1.0     2.000000
90%      3.000000         1.0     3.000000
95%      3.000000         1.0     3.000000
99%      4.620000         1.0     4.620000
max      8.000000         1.0     8.000000
Check: Distribution First Reported Plot
       area_plot_acr_1  rentoutplot_1   valueplot_1  ratio_value_rent_1  \
count       138.000000     128.000000  1.150000e+02          113.000000   
mean          2.773829   16441.562500  2.927043e+05           20.573016   
std           8.643723   15797.601056  5.329122e+05           35.6

In [2]:

## REINTERVIEW THOSE HOUSEHOLDS WITH EXTREME VALUES:
big_areas = data.loc[data['hh_area_plots']>5,['enumerator','intervieweename','hh_area_plots','hh_rentout_plots','hh_value_plots']]
print('')
print('===============================================================')
print('Check: Potential outliers land size')
print('===============================================================')
big_areas


Check: Potential outliers land size


Unnamed: 0,enumerator,intervieweename,hh_area_plots,hh_rentout_plots,hh_value_plots
20,2,Lakia James,6.206575,17000.0,150000.0
23,6,Esnart James,25.0,30000.0,100000.0
30,5,Ainess Samson,5.0,122000.0,680000.0
34,5,Esme Kalimu,5.2,34000.0,500000.0
41,4,Lukiya Alaki,15.0,75000.0,1070000.0
45,2,Agness Amos,5.6,95000.0,420000.0
56,7,Zione kalipinde,7.7,82035.0,750000.0
60,6,Lena Julius,101.25,40000.0,350000.0
73,3,Mina Rashid,7.6,61000.0,1000000.0
75,7,Magret Kumwima,12.3,8000.0,1650000.0


In [3]:

# =============================================================================
#  Summarize land rights 
# =============================================================================

sum_landrights = data[['rightsellland', 'rightbequeathplot', 'chiefpreventsell', 'chiefpreventbequeat', 'landdispute' ]].describe()
print(sum_landrights)
### STOP RUN


#  sum_solverdispute = pd.value_counts(data['whosolveddispute'])
## most land disputes solved by the chief!! (previous year)

#  sum_expropiation = pd.value_counts(data['expropiation'])
## Expropiation techniques: Cultivate land, demarcate borders, report chief



       rightsellland  rightbequeathplot  chiefpreventsell  \
count     137.000000         137.000000        137.000000   
mean        1.518248           1.525547          1.875912   
std         0.501501           0.501179          0.330891   
min         1.000000           1.000000          1.000000   
25%         1.000000           1.000000          2.000000   
50%         2.000000           2.000000          2.000000   
75%         2.000000           2.000000          2.000000   
max         2.000000           2.000000          2.000000   

       chiefpreventbequeat  landdispute  
count           137.000000   137.000000  
mean              1.875912     1.854015  
std               0.330891     0.354387  
min               1.000000     1.000000  
25%               2.000000     2.000000  
50%               2.000000     2.000000  
75%               2.000000     2.000000  
max               2.000000     2.000000  


In [4]:
#%% Convert agricultural outputs to kgs
# =============================================================================

# Import conversion rates
crop_unit = pd.read_csv("crop_conversions_kg.csv")
crop_unit.set_index('unit', inplace=True)

#Generate empty variables
for crop in list_crops:
    data['total_kg_'+crop] = np.nan 
    data['sold_kg_'+crop] = np.nan 
    data['sold_insiders_kg_'+crop] = np.nan 
    data['sold_outsiders_kg_'+crop] = np.nan 
    data['sold_outside_kg_'+crop] = np.nan 
    data['store_kg_'+crop] = np.nan 
    data['lost_kg_'+crop] = np.nan
    data['total2_kg_'+crop] =np.nan
    data['sold_bigger_total_'+crop] = 0
    data['lost_bigger_total_'+crop] = 0
    data['store_bigger_total_'+crop] = 0
    data['soldloststore_bigger_total_'+crop] = 0
    data['p_'+crop] = np.nan
    data['y_'+crop] = 0
    data['y_agric'] = 0
    data[['unitstotal'+crop, 'unitssold'+crop, 'unitsstore'+crop,'unitslost'+crop]].replace(np.nan, 0, inplace=True)

    
       
# =============================================================================
# Main Loop: Conversion to kgs for all crops and questions    

data.replace(np.nan, 0, inplace=True)
for i in range(len(data)): 
    for crop in list_crops:
        data.iloc[i, data.columns.get_loc('total_kg_'+crop)] = data.iloc[i,data.columns.get_loc('totalamount'+crop)]*crop_unit.loc[int(data.iloc[i, data.columns.get_loc('unitstotal'+crop)]),'conversionkg']
        data.iloc[i, data.columns.get_loc('sold_kg_'+crop)] = data.iloc[i,data.columns.get_loc('soldquantity'+crop)]*crop_unit.loc[int(data.iloc[i, data.columns.get_loc('unitssold'+crop)]),'conversionkg']
        data.iloc[i, data.columns.get_loc('sold_insiders_kg_'+crop)] = data.iloc[i,data.columns.get_loc('soldquantity'+crop+'in')]*crop_unit.loc[int(data.iloc[i, data.columns.get_loc('unitssold'+crop+'in')]),'conversionkg']
        data.iloc[i, data.columns.get_loc('sold_outsiders_kg_'+crop)] = data.iloc[i,data.columns.get_loc('soldquantity'+crop+'out')]*crop_unit.loc[int(data.iloc[i, data.columns.get_loc('unitssold'+crop+'out')]),'conversionkg']
        data.iloc[i, data.columns.get_loc('sold_outside_kg_'+crop)] = data.iloc[i,data.columns.get_loc('soldquantity'+crop+'out2')]*crop_unit.loc[int(data.iloc[i, data.columns.get_loc('unitssold'+crop+'out2')]),'conversionkg']        
        data.iloc[i, data.columns.get_loc('store_kg_'+crop)] = data.iloc[i,data.columns.get_loc('store'+crop+'quantity')]*crop_unit.loc[int(data.iloc[i, data.columns.get_loc('unitsstore'+crop)]),'conversionkg']
        data.iloc[i, data.columns.get_loc('lost_kg_'+crop)] = data.iloc[i,data.columns.get_loc('lost'+crop+'quantity')]*crop_unit.loc[int(data.iloc[i, data.columns.get_loc('unitslost'+crop)]),'conversionkg']
        # per plot (wrong)
        #for plot in range(1,N_p+1):
            #data.iloc[i, data.columns.get_loc('kg_'+crop+'_'+plot)] = data.iloc[i,data.columns.get_loc(crop+'perplot_'+plot)]*crop_unit.loc[int(data.iloc[i, data.columns.get_loc('unitsplot'+crop+'_'+plot)]),'conversionkg']
           

for crop in list_crops:
    data['total2_kg_'+crop] = data['sold_kg_'+crop].fillna(0) +data['store_kg_'+crop].fillna(0) +data['lost_kg_'+crop].fillna(0) 
   

#Summary total output kg:
sum_kg = (data[['total_kg_maize', 'total_kg_groundnut', 'total_kg_groundbean', 'total_kg_sweetpotatoe', 'total_kg_fingermillet', 'total_kg_sorghum', 'total_kg_pearlmillet', 'total_kg_soyabean', 'total_kg_pigeonpeas', 'total_kg_cotton', 'total_kg_nkhwani', 'total_kg_cassava',  'total_kg_sugarcane',  'total_kg_tomatoes',  'total_kg_therereokra', 'total_kg_tanaposi']].replace(0,np.nan)).describe(percentiles=percentiles)

print('===============================================================')
print('Check: Distribution of crop production (in kg)')
print('===============================================================')  
sum_kg.dropna(axis=1, how='any')
## STOP RUN

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  method=method)


Check: Distribution of crop production (in kg)


Unnamed: 0,total_kg_maize,total_kg_groundnut,total_kg_groundbean,total_kg_sweetpotatoe,total_kg_fingermillet,total_kg_sorghum,total_kg_pigeonpeas,total_kg_nkhwani,total_kg_tomatoes,total_kg_therereokra
count,129.0,91.0,10.0,12.0,4.0,3.0,52.0,15.0,4.0,11.0
mean,363.294574,174.285714,32.25,156.666667,18.125,40.833333,46.798077,30.166667,18.125,22.272727
std,546.472172,163.478373,28.490983,197.107877,21.54211,51.981567,50.053722,18.039309,21.347814,23.250122
min,25.0,5.0,5.0,5.0,2.5,2.5,1.0,5.0,5.0,5.0
5%,75.0,16.25,5.0,16.0,3.625,4.25,5.0,5.0,5.375,5.0
10%,100.0,25.0,5.0,25.0,4.75,6.0,5.5,7.0,5.75,5.0
25%,150.0,50.0,15.625,81.25,8.125,11.25,23.75,16.25,6.875,5.0
50%,250.0,150.0,25.0,100.0,10.0,20.0,27.5,25.0,8.75,12.5
75%,400.0,237.5,43.75,162.5,20.0,60.0,50.0,50.0,20.0,32.5
80%,400.0,250.0,50.0,190.0,26.0,68.0,66.0,50.0,26.0,40.0


In [5]:

# Summary total sellings kg:
sum_sold_kg= (data[['sold_kg_maize', 'sold_kg_groundnut', 'sold_kg_groundbean', 'sold_kg_sweetpotatoe', 'sold_kg_fingermillet', 'sold_kg_sorghum', 'sold_kg_pearlmillet', 'sold_kg_soyabean', 'sold_kg_pigeonpeas', 'sold_kg_cotton', 'sold_kg_nkhwani', 'sold_kg_cassava',  'sold_kg_sugarcane',  'sold_kg_tomatoes',  'sold_kg_therereokra', 'sold_kg_tanaposi']].replace(0,np.nan)).describe(percentiles=percentiles)
print('===============================================================')
print('Check: Distribution of crop Sellings (in kg)')
print('===============================================================')  
sum_sold_kg.dropna(axis=1, how='any')
## STOP RUN

Check: Distribution of crop Sellings (in kg)


Unnamed: 0,sold_kg_maize,sold_kg_groundnut,sold_kg_sweetpotatoe,sold_kg_pigeonpeas,sold_kg_tomatoes
count,13.0,35.0,3.0,27.0,2.0
mean,56.923077,137.857143,54.166667,37.092593,80.0
std,42.843633,128.956706,43.898557,53.274694,56.568542
min,5.0,5.0,12.5,1.0,40.0
5%,5.0,8.5,16.25,3.6,44.0
10%,6.0,24.0,20.0,8.0,48.0
25%,30.0,50.0,31.25,10.0,60.0
50%,50.0,100.0,50.0,20.0,80.0
75%,90.0,150.0,75.0,32.5,100.0
80%,96.0,210.0,80.0,47.0,104.0


In [6]:
sum_sold_kg_inside = (data[['sold_insiders_kg_maize', 'sold_insiders_kg_groundnut', 'sold_insiders_kg_groundbean', 'sold_insiders_kg_sweetpotatoe', 'sold_insiders_kg_fingermillet', 'sold_insiders_kg_sorghum', 'sold_insiders_kg_pearlmillet', 'sold_insiders_kg_soyabean', 'sold_insiders_kg_pigeonpeas', 'sold_insiders_kg_cotton', 'sold_insiders_kg_nkhwani', 'sold_insiders_kg_cassava',  'sold_insiders_kg_sugarcane',  'sold_insiders_kg_tomatoes',  'sold_insiders_kg_therereokra', 'sold_insiders_kg_tanaposi']].replace(0,np.nan)).describe(percentiles=percentiles)
print('===============================================================')
print('Check: Distribution of crop Sellings to Villagers')
print('===============================================================')  
sum_sold_kg_inside.dropna(axis=1, how='any')


Check: Distribution of crop Sellings to Villagers


Unnamed: 0,sold_insiders_kg_maize,sold_insiders_kg_groundnut,sold_insiders_kg_pigeonpeas,sold_insiders_kg_tomatoes
count,10.0,6.0,4.0,2.0
mean,54.0,123.333333,45.25,70.0
std,47.128665,133.927841,70.618104,70.710678
min,5.0,5.0,1.0,20.0
5%,5.0,5.0,1.6,25.0
10%,5.0,5.0,2.2,30.0
25%,15.0,16.25,4.0,45.0
50%,50.0,75.0,15.0,70.0
75%,80.0,235.0,56.25,95.0
80%,92.0,280.0,75.0,100.0


In [7]:
#Summary sellings in village to outsiders
sum_sold_outsiders_kg_in = (data[['sold_outsiders_kg_maize', 'sold_outsiders_kg_groundnut', 'sold_outsiders_kg_groundbean', 'sold_outsiders_kg_sweetpotatoe', 'sold_outsiders_kg_fingermillet', 'sold_outsiders_kg_sorghum', 'sold_outsiders_kg_pearlmillet', 'sold_outsiders_kg_soyabean', 'sold_outsiders_kg_pigeonpeas', 'sold_outsiders_kg_cotton', 'sold_outsiders_kg_nkhwani', 'sold_outsiders_kg_cassava',  'sold_outsiders_kg_sugarcane',  'sold_outsiders_kg_tomatoes',  'sold_outsiders_kg_therereokra', 'sold_outsiders_kg_tanaposi']].replace(0,np.nan)).describe(percentiles=percentiles)
print('===============================================================')
print('Check: Distribution of crop Sellings to Outsiders in Village')
print('===============================================================')  
sum_sold_outsiders_kg_in.dropna(axis=1, how='any')

Check: Distribution of crop Sellings to Outsiders in Village


Unnamed: 0,sold_outsiders_kg_maize,sold_outsiders_kg_groundnut,sold_outsiders_kg_sweetpotatoe,sold_outsiders_kg_pigeonpeas
count,3.0,26.0,2.0,14.0
mean,66.666667,128.269231,31.25,42.357143
std,28.867513,123.182323,26.516504,62.598362
min,50.0,10.0,12.5,3.0
5%,50.0,18.75,14.375,7.55
10%,50.0,47.5,16.25,10.0
25%,50.0,50.0,21.875,15.0
50%,50.0,100.0,31.25,25.0
75%,75.0,150.0,40.625,45.0
80%,80.0,150.0,42.5,50.0


In [8]:
# Summary sellings outside village:
sum_sold_outside_kg_out = (data[['sold_outside_kg_maize', 'sold_outside_kg_groundnut', 'sold_outside_kg_groundbean', 'sold_outside_kg_sweetpotatoe', 'sold_outside_kg_fingermillet', 'sold_outside_kg_sorghum', 'sold_outside_kg_pearlmillet', 'sold_outside_kg_soyabean', 'sold_outside_kg_pigeonpeas', 'sold_outside_kg_cotton', 'sold_outside_kg_nkhwani', 'sold_outside_kg_cassava',  'sold_outside_kg_sugarcane',  'sold_outside_kg_tomatoes',  'sold_outside_kg_therereokra', 'sold_outside_kg_tanaposi']].replace(0,np.nan)).describe(percentiles=percentiles)

# Sum transportation costs
sum_transport_c = (data[['transcostmaizeout', 'transcostgroundnutout', 'transcostgroundbeanout', 'transcostsweetpotatoeout', 'transcostfingermilletout', 'transcostsorghumout', 'transcostpearlmilletout', 'transcostsoyabeanout', 'transcostpigeonpeasout', 'transcostcottonout', 'transcostnkhwaniout', 'transcostcassavaout',  'transcostsugarcaneout',  'transcosttomatoesout',  'transcosttherereokraout', 'transcosttanaposiout']].replace(0,np.nan)).describe(percentiles=percentiles)



# Summary loses kg:
sum_lost_kg= (data[['lost_kg_maize', 'lost_kg_groundnut', 'lost_kg_groundbean', 'lost_kg_sweetpotatoe', 'lost_kg_fingermillet', 'lost_kg_sorghum', 'lost_kg_pearlmillet', 'lost_kg_soyabean', 'lost_kg_pigeonpeas', 'lost_kg_cotton', 'lost_kg_nkhwani', 'lost_kg_cassava',  'lost_kg_sugarcane',  'lost_kg_tomatoes',  'lost_kg_therereokra', 'lost_kg_tanaposi']].replace(0,np.nan)).describe(percentiles=percentiles)
print('===============================================================')
print('Check: Distribution of crop Lost (in kg)')
print('===============================================================')  
sum_lost_kg.dropna(axis=1, how='any')
## STOP RUN

Check: Distribution of crop Lost (in kg)


Unnamed: 0,lost_kg_maize,lost_kg_groundnut,lost_kg_pigeonpeas,lost_kg_nkhwani,lost_kg_tomatoes
count,18.0,8.0,3.0,6.0,4.0
mean,49.5,35.0625,43.333333,41.666667,13.75
std,60.721641,68.557816,50.083264,33.565856,10.307764
min,1.0,1.0,5.0,5.0,5.0
5%,1.0,1.35,7.0,8.75,5.0
10%,1.35,1.7,9.0,12.5,5.0
25%,6.25,2.375,15.0,21.25,5.0
50%,37.5,7.5,25.0,37.5,12.5
75%,50.0,20.0,62.5,50.0,21.25
80%,80.0,34.0,70.0,50.0,22.0


In [9]:
# Summary Store kg:
sum_store_kg= (data[['store_kg_maize', 'store_kg_groundnut', 'store_kg_groundbean', 'store_kg_sweetpotatoe', 'store_kg_fingermillet', 'store_kg_sorghum', 'store_kg_pearlmillet', 'store_kg_soyabean', 'store_kg_pigeonpeas', 'store_kg_cotton', 'store_kg_nkhwani', 'store_kg_cassava',  'store_kg_sugarcane',  'store_kg_tomatoes',  'store_kg_therereokra', 'store_kg_tanaposi']].replace(0,np.nan)).describe(percentiles=percentiles)
print('===============================================================')
print('Check: Distribution of crop store (in kg)')
print('===============================================================')  
sum_store_kg.dropna(axis=1, how='any')
## STOP RUN

Check: Distribution of crop store (in kg)


Unnamed: 0,store_kg_maize,store_kg_groundnut,store_kg_groundbean,store_kg_fingermillet,store_kg_sorghum,store_kg_pigeonpeas,store_kg_nkhwani,store_kg_therereokra
count,111.0,77.0,9.0,4.0,2.0,48.0,3.0,4.0
mean,246.283784,92.727273,95.944444,16.25,52.5,23.989583,6.666667,4.375
std,569.806971,123.520975,200.580166,22.776084,67.175144,20.867232,2.886751,1.25
min,5.0,5.0,1.0,2.5,5.0,1.0,5.0,2.5
5%,25.0,5.0,2.6,2.5,9.75,2.5,5.0,2.875
10%,50.0,11.5,4.2,2.5,14.5,5.0,5.0,3.25
25%,87.5,25.0,12.5,2.5,28.75,10.0,5.0,4.375
50%,125.0,50.0,25.0,6.25,52.5,20.0,5.0,5.0
75%,217.5,100.0,44.999999,20.0,76.25,25.0,7.5,5.0
80%,250.0,145.0,66.999999,26.0,81.0,28.0,8.0,5.0


In [10]:
for crop in list_crops:
    data['sold_bigger_total_'+crop] = 1*(data['sold_kg_'+crop].fillna(0)> data['total_kg_'+crop].fillna(0)+5)
    data['lost_bigger_total_'+crop] = 1*(data['lost_kg_'+crop].fillna(0)> data['total_kg_'+crop].fillna(0)+5)
    data['store_bigger_total_'+crop] = 1*(data['store_kg_'+crop].fillna(0)> data['total_kg_'+crop].fillna(0)+5)
    data['soldloststore_bigger_total'+crop] = 1*((data['sold_kg_'+crop].fillna(0) +data['store_kg_'+crop].fillna(0) +data['lost_kg_'+crop].fillna(0))>data['total_kg_'+crop]+5)
    
check_sold_bigger_total = data[['sold_bigger_total_maize', 'sold_bigger_total_groundnut', 'sold_bigger_total_groundbean', 'sold_bigger_total_sweetpotatoe', 'sold_bigger_total_fingermillet', 'sold_bigger_total_sorghum', 'sold_bigger_total_pearlmillet', 'sold_bigger_total_soyabean', 'sold_bigger_total_pigeonpeas', 'sold_bigger_total_cotton', 'sold_bigger_total_nkhwani', 'sold_bigger_total_cassava',  'sold_bigger_total_sugarcane',  'sold_bigger_total_tomatoes',  'sold_bigger_total_therereokra', 'sold_bigger_total_tanaposi']]


#Get the households that reported larger amounts than total:

list_hh_check_sell = []
list_hh_check_lost = []
list_hh_check_store = []
list_hh_check = []

for crop in list_crops:
    liers_sell = data.loc[data['sold_bigger_total_'+crop]==1, 'intervieweename']
    liers_store = data.loc[data['store_bigger_total_'+crop]==1, 'intervieweename']
    liers_lost = data.loc[data['lost_bigger_total_'+crop]==1, 'intervieweename']
    liers = data.loc[data['soldloststore_bigger_total_'+crop]==1, 'intervieweename']
    
    list_hh_check_sell.append(liers_sell)
    list_hh_check_store.append(liers_store)
    list_hh_check_lost.append(liers_lost)
    list_hh_check.append(liers)

# sellings check:
hh_to_check_sell = pd.concat(list_hh_check_sell, axis=1)
hh_to_check_sell.columns = list_crops
print('')
print('===============================================================')
print('Check: Households-crop combination where SELLINGS larger than total produced')
print('===============================================================')
print(hh_to_check_sell.dropna(axis=1,  how='all'))
###STOP RUN



Check: Households-crop combination where SELLINGS larger than total produced
          maize        pigeonpeas       tomatoes
20          NaN       Lakia James            NaN
82   Elia Amini               NaN            NaN
84          NaN               NaN    Haroon Elia
86          NaN               NaN  Nelie Kuyenda
104         NaN  Samson Kalipende            NaN


In [11]:
# Store quantity check:
hh_to_check_store = pd.concat(list_hh_check_store, axis=1)
hh_to_check_store.columns = list_crops
print('')
print('===============================================================')
print('Check: Households-crop combination where STORED larger than total produced')
print('===============================================================')
print(hh_to_check_store.dropna(axis=1, how='all'))
### STOP RUN
    


Check: Households-crop combination where STORED larger than total produced
            maize      groundnut  groundbean        pigeonpeas
5             NaN            NaN         NaN     Chrisy Baison
70            NaN  Asiyatu Aloni         NaN               NaN
78   Emily Duncan            NaN         NaN               NaN
82            NaN            NaN  Elia Amini               NaN
104           NaN            NaN         NaN  Samson Kalipende
120           NaN   Patuma Mores         NaN               NaN


In [12]:
    
hh_to_check_lost = pd.concat(list_hh_check_lost, axis=1)
hh_to_check_lost.columns = list_crops
print('')
print('===============================================================')
print('Check: Households-crop combination where LOST larger than total produced')
print('===============================================================')
print(hh_to_check_lost.dropna(axis=1,  how='all'))
### STOP RUN


Check: Households-crop combination where LOST larger than total produced
       groundnut  pigeonpeas           cotton          nkhwani     tomatoes
6    Alesi Isaac         NaN              NaN              NaN          NaN
56           NaN         NaN  Zione kalipinde              NaN          NaN
64           NaN         NaN              NaN     Saidi Jawadu          NaN
84           NaN         NaN              NaN              NaN  Haroon Elia
103          NaN  Roda Eliya              NaN              NaN          NaN
110          NaN         NaN              NaN  Chrissy Mustafa          NaN


In [13]:
hh_to_check = pd.concat(list_hh_check, axis=1)
hh_to_check.columns = list_crops
print('')
print('===============================================================')
print('Check: Households-crop combination where SELL+LOST+STORED larger than total produced')
print('===============================================================')
print(hh_to_check.dropna(axis=1).to_string())



Check: Households-crop combination where SELL+LOST+STORED larger than total produced
Empty DataFrame
Columns: [maize, groundnut, groundbean, sweetpotatoe, fingermillet, sorghum, pearlmillet, soyabean, pigeonpeas, cotton, nkhwani, cassava, sugarcane, tomatoes, therereokra, tanaposi]
Index: []


In [14]:

# =============================================================================
#  get PRICES per kg
# =============================================================================
for crop in list_crops:
    data['p_'+crop] = (data['soldvalue'+crop].replace(0,np.nan)).dropna()  / (data['sold_kg_'+crop].replace(0,np.nan)).dropna() 
    #DF = data[['soldvalue'+crop, 'sold_kg_'+crop]].dropna()
sum_prices = data[['p_maize', 'p_groundnut', 'p_groundbean', 'p_sweetpotatoe', 'p_fingermillet', 'p_sorghum', 'p_pearlmillet', 'p_soyabean', 'p_pigeonpeas', 'p_cotton', 'p_nkhwani', 'p_cassava',  'p_sugarcane',  'p_tomatoes',  'p_therereokra', 'p_tanaposi']].describe()
print('') 
print('===============================================================')
print('Check: Distribution of prices')
print('===============================================================')  
sum_prices.dropna(axis=1)
    


Check: Distribution of prices


Unnamed: 0,p_maize,p_groundnut,p_sweetpotatoe,p_pigeonpeas,p_tomatoes
count,12.0,35.0,3.0,27.0,2.0
mean,128.148148,122.244321,86.666667,282.750088,300.0
std,40.660466,28.097649,35.118846,289.100375,282.842712
min,77.777778,80.0,50.0,20.0,100.0
25%,100.0,100.0,70.0,192.4,200.0
50%,110.0,120.0,90.0,200.0,300.0
75%,145.0,134.285714,105.0,245.0,400.0
max,200.0,200.0,120.0,1200.0,500.0


In [15]:
#Get monetary value:
for crop in list_crops:
    data['y_'+crop] =  np.nanmedian(data['p_'+crop])*data['total_kg_'+crop].fillna(0)
    data['y_agric'] += data['y_'+crop].fillna(0)

#to dollars:

data[['y_agric', 'y_maize', 'y_groundnut', 'y_pigeonpeas']] = 0.0013*data[['y_agric', 'y_maize', 'y_groundnut', 'y_pigeonpeas']]

    
sum_y = data[['y_agric','y_maize', 'y_groundnut', 'y_pigeonpeas']].describe(percentiles=percentiles)
    
## Descriptive statistics
sellings = data[['hhid','soldvaluemaize','soldvaluegroundnut', 'p_maize', 'p_groundnut']]


print('TO CHECK THE WHOLE DATASET AND NOT JUST SUMMARIES, CHECK FILE income_data.csv. WE SHOULD DO SO TO EXPLORE THE POSSIBLE OUTLIERS, OTHER UNITS ISSUES, ETC.')
data.to_csv('income_data.csv')

  r = func(a, **kwargs)


TO CHECK THE WHOLE DATASET AND NOT JUST SUMMARIES, CHECK FILE income_data.csv. WE SHOULD DO SO TO EXPLORE THE POSSIBLE OUTLIERS, OTHER UNITS ISSUES, ETC.


In [16]:
# =============================================================================
# Data per plot
# =============================================================================
data.drop((data.loc[(data['hhid']==216) & (data['duration']==3666)]).index, inplace=True)
data = data.stack().apply(pd.to_numeric, errors='ignore').fillna(0).unstack()

### I get a problem since a hosuehold was interviewed twice:
data_216 = data.loc[data['hhid']==10, ]  ## SOLVED: remove one entry

# Generate empty dataset
N_p= np.sum(data['total_plots'])
ones = np.ones((int(N_p),2))
data_plots = pd.DataFrame({'hhid':ones[:,0], 'plotid':ones[:,1]})

## Populate dataset with hhid and plotid
i=-1
for hhid in data['hhid']:
    for plot in range(1,int(data.loc[data['hhid']==hhid, 'total_plots'])+1):
        i+=1
        data_plots.iloc[i,0] = hhid
        data_plots.iloc[i,1] = plot
 
       
## generate variables:
# List of chosen crops.. If not chosen then the variables associated to not-chosen crop are unexistent. Update this list
# Everytime we get new data. Check sum_kg for a quick selection.
list_crops_selected = ['maize', 'groundnut', 'sorghum',  'pigeonpeas'] 
# Code also works with all the crops. This is just to avoid empty columns.

for crop in list_crops:
    data_plots[crop+'_kg'] = np.nan
    

data_plots['area'] = np.nan  #area is already converted in acres
data_plots['rentoutplot'] = np.nan
data_plots['valueplot'] = np.nan
data_plots['kg_fertilizer'] = np.nan

#### Loop for plot characteristics
i=-1
for hhid in data['hhid']:
    for plot in range(1,int(data.loc[data['hhid']==hhid, 'total_plots'])+1):
        i+=1
        data_plots.iloc[i, data_plots.columns.get_loc('area')] = float(data.loc[data['hhid']==hhid, 'area_plot_acr_'+str(plot)])
        ## problem: area of rented-in plots. In this case the one with area=0
        data_plots.iloc[i, data_plots.columns.get_loc('rentoutplot')] = float(data.loc[data['hhid']==hhid, 'rentoutplot_'+str(plot)])
        data_plots.iloc[i, data_plots.columns.get_loc('valueplot')] = float(data.loc[data['hhid']==hhid, 'valueplot_'+str(plot)])
        
#### Looop for fertilizer
i=-1
for hhid in data['hhid']:
    for plot in range(1,int(data.loc[data['hhid']==hhid, 'repeatplotsfertilizer_count'])+1):
        i+=1        
        data_plots.loc[(data_plots['hhid']==hhid) & (data_plots['plotid']== int(float(data.loc[data['hhid']==hhid,'fertilizerplotsselected_'+str(plot)]))),'kg_fertilizer'] = float(data.loc[data['hhid']==hhid, 'plotkgfertilizer_'+str(plot)])
        
 
#### Loop for crop production       

for hhid in data['hhid']:
        for crop in list_crops:
            for plot in range(1,int(data.loc[data['hhid']==hhid, 'repeatplots'+crop+'_count'])+1):
                #print(data.loc[data['hhid']==hhid,crop+'perplot_'+str(plot)]*crop_unit.loc[int(data.loc[data['hhid']==hhid, 'unitsplot'+crop+'_'+str(plot)]),'conversionkg'])
                data_plots.loc[(data_plots['hhid']==hhid) & (data_plots['plotid']== int(float(data.loc[data['hhid']==hhid,crop+'plotsselected_'+str(plot)]))), crop+'_kg'] = float(data.loc[data['hhid']==hhid,crop+'perplot_'+str(plot)]*crop_unit.loc[int(data.loc[data['hhid']==hhid, 'unitsplot'+crop+'_'+str(plot)]),'conversionkg'])
                #data_plots.loc[(data_plots['hhid']==hhid) & (data_plots['plotid']== int(float(data.loc[data['hhid']==hhid,crop+'plotsselected_'+str(plot)]))), crop+'_kg'] = i


#### Loop for labor input 
for member in range(1,int(np.max(data['manyhhlaborplot'])+1)):    
    data_plots['months_member_'+str(member)] = np.nan           
    data_plots['weeks_member_'+str(member)] = np.nan             
    data_plots['days_member_'+str(member)] = np.nan 
    data_plots['hours_member_'+str(member)] = np.nan 
    data_plots['hours_member_'+str(member)] = np.nan
for hhid in data['hhid']:
    for member in range(1,int(data.loc[data['hhid']==hhid, 'manyhhlaborplot'])+1):
        for plot in range(1,int(data.loc[data['hhid']==hhid, 'hhlaborperplotrepeat_count_'+str(member)])+1):
                          
            #print(data.loc[data['hhid']==hhid,crop+'perplot_'+str(plot)]*crop_unit.loc[int(data.loc[data['hhid']==hhid, 'unitsplot'+crop+'_'+str(plot)]),'conversionkg'])
                data_plots.loc[(data_plots['hhid']==hhid) & (data_plots['plotid']== int(float(data.loc[data['hhid']==hhid,'hhlaborplotsselected_'+str(member)+'_'+str(plot)]))), 'months_member_'+str(member)] = float(data.loc[data['hhid']==hhid,'monthshhplot_'+str(member)+'_'+str(plot)])
                data_plots.loc[(data_plots['hhid']==hhid) & (data_plots['plotid']== int(float(data.loc[data['hhid']==hhid,'hhlaborplotsselected_'+str(member)+'_'+str(plot)]))), 'weeks_member_'+str(member)] = float(data.loc[data['hhid']==hhid,'weekshhplot_'+str(member)+'_'+str(plot)])
                data_plots.loc[(data_plots['hhid']==hhid) & (data_plots['plotid']== int(float(data.loc[data['hhid']==hhid,'hhlaborplotsselected_'+str(member)+'_'+str(plot)]))), 'days_member_'+str(member)] = float(data.loc[data['hhid']==hhid,'dayshhplot_'+str(member)+'_'+str(plot)])
                data_plots.loc[(data_plots['hhid']==hhid) & (data_plots['plotid']== int(float(data.loc[data['hhid']==hhid,'hhlaborplotsselected_'+str(member)+'_'+str(plot)]))), 'hours_member_'+str(member)] = float(data.loc[data['hhid']==hhid,'hourshhplot_'+str(member)+'_'+str(plot)])

sum_member1 = data_plots[['months_member_1', 'weeks_member_1', 'days_member_1', 'hours_member_1']].describe(percentiles=percentiles)
print('===============================================================')
print('reported months,...,hours member 1')
print('===============================================================')
sum_member1
### STOP RUN

TypeError: cannot convert the series to <class 'int'>

In [17]:
data.drop((data.loc[(data['hhid']==216) & (data['duration']==3666)]).index, inplace=True)


In [18]:
# Generate empty dataset
N_p= np.sum(data['total_plots'])
ones = np.ones((int(N_p),2))
data_plots = pd.DataFrame({'hhid':ones[:,0], 'plotid':ones[:,1]})

## Populate dataset with hhid and plotid
i=-1
for hhid in data['hhid']:
    for plot in range(1,int(data.loc[data['hhid']==hhid, 'total_plots'])+1):
        i+=1
        data_plots.iloc[i,0] = hhid
        data_plots.iloc[i,1] = plot
 
       
## generate variables:
# List of chosen crops.. If not chosen then the variables associated to not-chosen crop are unexistent. Update this list
# Everytime we get new data. Check sum_kg for a quick selection.
list_crops_selected = ['maize', 'groundnut', 'sorghum',  'pigeonpeas'] 
# Code also works with all the crops. This is just to avoid empty columns.

for crop in list_crops:
    data_plots[crop+'_kg'] = np.nan
    

data_plots['area'] = np.nan  #area is already converted in acres
data_plots['rentoutplot'] = np.nan
data_plots['valueplot'] = np.nan
data_plots['kg_fertilizer'] = np.nan

#### Loop for plot characteristics
i=-1
for hhid in data['hhid']:
    for plot in range(1,int(data.loc[data['hhid']==hhid, 'total_plots'])+1):
        i+=1
        data_plots.iloc[i, data_plots.columns.get_loc('area')] = float(data.loc[data['hhid']==hhid, 'area_plot_acr_'+str(plot)])
        ## problem: area of rented-in plots. In this case the one with area=0
        data_plots.iloc[i, data_plots.columns.get_loc('rentoutplot')] = float(data.loc[data['hhid']==hhid, 'rentoutplot_'+str(plot)])
        data_plots.iloc[i, data_plots.columns.get_loc('valueplot')] = float(data.loc[data['hhid']==hhid, 'valueplot_'+str(plot)])
        
#### Looop for fertilizer
i=-1
for hhid in data['hhid']:
    for plot in range(1,int(data.loc[data['hhid']==hhid, 'repeatplotsfertilizer_count'])+1):
        i+=1        
        data_plots.loc[(data_plots['hhid']==hhid) & (data_plots['plotid']== int(float(data.loc[data['hhid']==hhid,'fertilizerplotsselected_'+str(plot)]))),'kg_fertilizer'] = float(data.loc[data['hhid']==hhid, 'plotkgfertilizer_'+str(plot)])
        
 
#### Loop for crop production       

for hhid in data['hhid']:
        for crop in list_crops:
            for plot in range(1,int(data.loc[data['hhid']==hhid, 'repeatplots'+crop+'_count'])+1):
                #print(data.loc[data['hhid']==hhid,crop+'perplot_'+str(plot)]*crop_unit.loc[int(data.loc[data['hhid']==hhid, 'unitsplot'+crop+'_'+str(plot)]),'conversionkg'])
                data_plots.loc[(data_plots['hhid']==hhid) & (data_plots['plotid']== int(float(data.loc[data['hhid']==hhid,crop+'plotsselected_'+str(plot)]))), crop+'_kg'] = float(data.loc[data['hhid']==hhid,crop+'perplot_'+str(plot)]*crop_unit.loc[int(data.loc[data['hhid']==hhid, 'unitsplot'+crop+'_'+str(plot)]),'conversionkg'])
                #data_plots.loc[(data_plots['hhid']==hhid) & (data_plots['plotid']== int(float(data.loc[data['hhid']==hhid,crop+'plotsselected_'+str(plot)]))), crop+'_kg'] = i


#### Loop for labor input 
for member in range(1,int(np.max(data['manyhhlaborplot'])+1)):    
    data_plots['months_member_'+str(member)] = np.nan           
    data_plots['weeks_member_'+str(member)] = np.nan             
    data_plots['days_member_'+str(member)] = np.nan 
    data_plots['hours_member_'+str(member)] = np.nan 
    data_plots['hours_member_'+str(member)] = np.nan
for hhid in data['hhid']:
    for member in range(1,int(data.loc[data['hhid']==hhid, 'manyhhlaborplot'])+1):
        for plot in range(1,int(data.loc[data['hhid']==hhid, 'hhlaborperplotrepeat_count_'+str(member)])+1):
                          
            #print(data.loc[data['hhid']==hhid,crop+'perplot_'+str(plot)]*crop_unit.loc[int(data.loc[data['hhid']==hhid, 'unitsplot'+crop+'_'+str(plot)]),'conversionkg'])
                data_plots.loc[(data_plots['hhid']==hhid) & (data_plots['plotid']== int(float(data.loc[data['hhid']==hhid,'hhlaborplotsselected_'+str(member)+'_'+str(plot)]))), 'months_member_'+str(member)] = float(data.loc[data['hhid']==hhid,'monthshhplot_'+str(member)+'_'+str(plot)])
                data_plots.loc[(data_plots['hhid']==hhid) & (data_plots['plotid']== int(float(data.loc[data['hhid']==hhid,'hhlaborplotsselected_'+str(member)+'_'+str(plot)]))), 'weeks_member_'+str(member)] = float(data.loc[data['hhid']==hhid,'weekshhplot_'+str(member)+'_'+str(plot)])
                data_plots.loc[(data_plots['hhid']==hhid) & (data_plots['plotid']== int(float(data.loc[data['hhid']==hhid,'hhlaborplotsselected_'+str(member)+'_'+str(plot)]))), 'days_member_'+str(member)] = float(data.loc[data['hhid']==hhid,'dayshhplot_'+str(member)+'_'+str(plot)])
                data_plots.loc[(data_plots['hhid']==hhid) & (data_plots['plotid']== int(float(data.loc[data['hhid']==hhid,'hhlaborplotsselected_'+str(member)+'_'+str(plot)]))), 'hours_member_'+str(member)] = float(data.loc[data['hhid']==hhid,'hourshhplot_'+str(member)+'_'+str(plot)])

sum_member1 = data_plots[['months_member_1', 'weeks_member_1', 'days_member_1', 'hours_member_1']].describe(percentiles=percentiles)
print('===============================================================')
print('reported months,...,hours member 1')
print('===============================================================')
sum_member1
### STOP RUN


reported months,...,hours member 1


Unnamed: 0,months_member_1,weeks_member_1,days_member_1,hours_member_1
count,216.0,216.0,216.0,216.0
mean,5.638889,3.481481,5.064815,4.097222
std,1.714281,0.969118,1.753525,1.748033
min,0.0,0.0,0.0,0.0
5%,2.0,1.0,2.0,1.0
10%,3.0,2.0,2.0,2.0
25%,5.0,3.0,4.75,3.0
50%,6.0,4.0,6.0,4.0
75%,7.0,4.0,6.0,5.0
80%,7.0,4.0,6.0,5.0


In [19]:
sum_member2 = data_plots[['months_member_2', 'weeks_member_2', 'days_member_2', 'hours_member_2']].describe(percentiles=percentiles)
print('===============================================================')
print('reported months,...,hours member 2')
print('===============================================================')
sum_member2

reported months,...,hours member 2


Unnamed: 0,months_member_2,weeks_member_2,days_member_2,hours_member_2
count,174.0,174.0,174.0,174.0
mean,5.5,3.603448,4.442529,3.867816
std,1.695796,0.845382,1.81981,1.38939
min,0.0,0.0,0.0,0.0
5%,2.0,2.0,2.0,2.0
10%,3.0,2.3,2.0,2.0
25%,5.0,4.0,2.0,3.0
50%,6.0,4.0,5.0,4.0
75%,7.0,4.0,6.0,5.0
80%,7.0,4.0,6.0,5.0


In [20]:
sum_member3 = data_plots[['months_member_3', 'weeks_member_3', 'days_member_3', 'hours_member_3']].describe(percentiles=percentiles)
print('===============================================================')
print('reported months,...,hours member 3')
print('===============================================================')
sum_member3

reported months,...,hours member 3


Unnamed: 0,months_member_3,weeks_member_3,days_member_3,hours_member_3
count,96.0,96.0,96.0,96.0
mean,5.302083,3.552083,3.59375,3.15625
std,2.037128,1.044986,1.938953,1.571644
min,0.0,0.0,0.0,1.0
5%,1.0,1.0,1.75,1.0
10%,2.0,2.0,2.0,1.0
25%,5.0,4.0,2.0,2.0
50%,6.0,4.0,3.0,3.0
75%,7.0,4.0,6.0,4.0
80%,7.0,4.0,6.0,4.0


In [21]:

data_plots['hh_labor_days'] = 0
data_plots['hh_labor_hours'] = 0

for member in range(1,int(np.max(data['manyhhlaborplot'])+1)):           
    data_plots['member_'+str(member)+'_labor_days'] =  (data_plots['months_member_'+str(member)].multiply(data_plots['weeks_member_'+str(member)],axis=0, fill_value=0)).multiply(data_plots['days_member_'+str(member)],axis=0, fill_value=0)  
    data_plots['member_'+str(member)+'_labor_hours'] = data_plots['member_'+str(member)+'_labor_days'].multiply(data_plots['days_member_'+str(member)],axis=0, fill_value=0)

for member in range(1,int(np.max(data['manyhhlaborplot'])+1)): 
    data_plots['hh_labor_days'] += data_plots['member_'+str(member)+'_labor_days']
    data_plots['hh_labor_hours'] += data_plots['member_'+str(member)+'_labor_hours']
    
    
print('===============================================================')
print('Distribution Household Labor in days')
print('===============================================================')
sum_labor_days = data_plots[['hh_labor_days', 'member_1_labor_days', 'member_2_labor_days', 'member_3_labor_days']].describe(percentiles=percentiles)
sum_labor_days


Distribution Household Labor in days


Unnamed: 0,hh_labor_days,member_1_labor_days,member_2_labor_days,member_3_labor_days
count,23.0,216.0,174.0,96.0
mean,362.826087,105.240741,94.212644,73.489583
std,213.852467,55.340699,54.24357,51.012691
min,0.0,0.0,0.0,0.0
5%,6.4,12.0,5.3,1.5
10%,84.0,24.0,24.0,4.0
25%,217.0,56.0,51.0,40.0
50%,370.0,120.0,96.0,56.0
75%,521.5,144.0,144.0,114.0
80%,556.4,168.0,144.0,120.0


In [22]:
print('PREVIOUS CELL IS WRONG. NOTE THAT NUMBER OF OBSERVATIONS HH LABOR IS TOO LOW. BELOW IS THE CORRECT TABLE:')

PREVIOUS CELL IS WRONG. NOTE THAT NUMBER OF OBSERVATIONS HH LABOR IS TOO LOW. BELOW IS THE CORRECT TABLE:


In [23]:

data_plots['hh_labor_days'] = 0
data_plots['hh_labor_hours'] = 0

for member in range(1,int(np.max(data['manyhhlaborplot'])+1)):           
    data_plots['member_'+str(member)+'_labor_days'] =  (data_plots['months_member_'+str(member)].multiply(data_plots['weeks_member_'+str(member)],axis=0, fill_value=0)).multiply(data_plots['days_member_'+str(member)],axis=0, fill_value=0)  
    data_plots['member_'+str(member)+'_labor_hours'] = data_plots['member_'+str(member)+'_labor_days'].multiply(data_plots['days_member_'+str(member)],axis=0, fill_value=0)

for member in range(1,int(np.max(data['manyhhlaborplot'])+1)): 
    
    data_plots['hh_labor_days'] += data_plots['member_'+str(member)+'_labor_days'].fillna(0)
    data_plots['hh_labor_hours'] += data_plots['member_'+str(member)+'_labor_hours'].fillna(0)
    
    
print('===============================================================')
print('Distribution Household Labor in days')
print('===============================================================')
sum_labor_days = data_plots[['hh_labor_days', 'member_1_labor_days', 'member_2_labor_days', 'member_3_labor_days']].describe(percentiles=percentiles)
sum_labor_days

print('===============================================================')
print('Distribution Household Labor in hours')
print('===============================================================')
sum_labor_hours = data_plots[['hh_labor_hours', 'member_1_labor_hours', 'member_2_labor_hours', 'member_3_labor_hours']].describe(percentiles=percentiles)
sum_labor_hours


Distribution Household Labor in days
Distribution Household Labor in hours


Unnamed: 0,hh_labor_hours,member_1_labor_hours,member_2_labor_hours,member_3_labor_hours
count,261.0,216.0,174.0,96.0
mean,1038.183908,599.768519,498.051724,339.260417
std,873.67514,377.627911,380.850194,347.180561
min,0.0,0.0,0.0,0.0
5%,0.0,24.0,8.0,3.0
10%,0.0,72.0,48.0,9.0
25%,320.0,252.0,112.0,80.0
50%,874.0,624.0,500.0,112.0
75%,1520.0,893.0,864.0,591.0
80%,1728.0,1008.0,864.0,720.0


In [24]:
print('===============================================================')
print('Distribution Household Labor in days')
print('===============================================================')
sum_labor_days = data_plots[['hh_labor_days', 'member_1_labor_days', 'member_2_labor_days', 'member_3_labor_days']].describe(percentiles=percentiles)
sum_labor_days

Distribution Household Labor in days


Unnamed: 0,hh_labor_days,member_1_labor_days,member_2_labor_days,member_3_labor_days
count,261.0,216.0,174.0,96.0
mean,197.578544,105.240741,94.212644,73.489583
std,152.442469,55.340699,54.24357,51.012691
min,0.0,0.0,0.0,0.0
5%,0.0,12.0,5.3,1.5
10%,0.0,24.0,24.0,4.0
25%,88.0,56.0,51.0,40.0
50%,168.0,120.0,96.0,56.0
75%,288.0,144.0,144.0,114.0
80%,324.0,168.0,144.0,120.0


In [25]:
print('IN FILE data_plotlevel.csv there is hhid-plotid long format dataset. CHECK IT TO SEE IF VALUES MAKE SENSE. ESPECIALLY FERTILIZER!')
data_plots.to_csv('data_plotlevel.csv')

IN FILE data_plotlevel.csv there is hhid-plotid long format dataset. CHECK IT TO SEE IF VALUES MAKE SENSE. ESPECIALLY FERTILIZER!


In [26]:
data_plots_agg = data_plots.groupby(by='hhid').sum()
data_plots_agg.reset_index(inplace=True)

data_plots_agg = data_plots_agg[['hhid','maize_kg','groundnut_kg','sorghum_kg', 'pigeonpeas_kg', 'kg_fertilizer']]
data_plots_agg.columns = ['hhid','maize_kg','groundnut_kg','sorghum_kg', 'pigeonpeas_kg', 'kg_fertilizer']

data_kg_check = data[['hhid', 'intervieweename','hh_area_plots','total_kg_maize', 'total_kg_groundnut','total_kg_sorghum', 'total_kg_pigeonpeas', 'fertilizerkg','enumerator' ]]
data_kg_check['hhid'] = data_kg_check['hhid'].astype('float64')

data_kg_check = data_kg_check.merge(data_plots_agg, on='hhid')  



### create difference. report those households with big differences
list_crops_check = ['maize', 'groundnut', 'pigeonpeas']
for crop in list_crops_check:
    data_kg_check['check_diff_'+crop] = data_kg_check['total_kg_'+crop].fillna(0) - data_kg_check[crop+'_kg'].fillna(0)

data_kg_check['check_diff_fertilizer'] = data_kg_check['fertilizerkg'].fillna(0) - data_kg_check['kg_fertilizer'].fillna(0)

data_diff = data_kg_check[['hhid','intervieweename', 'enumerator','check_diff_maize','check_diff_groundnut','check_diff_pigeonpeas','check_diff_fertilizer']]

data_diff.replace([0,0.0], np.nan, inplace=True)
data_diff.dropna(subset=['check_diff_maize','check_diff_groundnut','check_diff_pigeonpeas','check_diff_fertilizer'], axis=0, how='all',inplace=True)

### These are the households to check:
print('')
print('===============================================================')
print('Check: Households that aggregate vs sum(plots) variables do not coincide')
print('===============================================================')
data_diff



Check: Households that aggregate vs sum(plots) variables do not coincide


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,hhid,intervieweename,enumerator,check_diff_maize,check_diff_groundnut,check_diff_pigeonpeas,check_diff_fertilizer
3,182.0,Niya Itimu,1,350.0,,,1.0
4,176.0,Idess Juma,5,,,,-98.0
11,3.0,Agness Idana,4,,,,-49.0
12,30.0,Eliza Idana,4,,,,-0.5
20,93.0,Lakia James,2,-50.0,,,
32,33.0,Hawa Chimwaza,5,,,,-12.0
39,7.0,Alesi Kawina,5,50.0,,,
40,135.0,Lukiya Alaki,4,,,-150.0,
53,41.0,Kulungano amidu,7,270.0,,,
61,68.0,Zione Piyasi,2,25.0,,,


In [27]:
sum_subsidy = data[['other_sour_income_3','other_sour_income_4']].describe(percentiles=percentiles)
print('===============================================================')
print('Conditional Cash Transfer Program Implementation in the Village.')
print('===============================================================')
sum_subsidy

Conditional Cash Transfer Program Implementation in the Village.


Unnamed: 0,other_sour_income_3,other_sour_income_4
count,138.0,138.0
unique,3.0,10.0
top,2.0,0.0
freq,121.0,129.0


In [28]:
## Need to reupload dataset since now was in string format.
data = pd.read_csv('income_data.csv')


In [29]:
sum_subsidy = data[['other_sour_income_3','other_sour_income_4']].describe(percentiles=percentiles)
print('===============================================================')
print('Conditional Cash Transfer Program Implementation in the Village.')
print('===============================================================')
sum_subsidy

Conditional Cash Transfer Program Implementation in the Village.


Unnamed: 0,other_sour_income_3,other_sour_income_4
count,139.0,139.0
mean,1.827338,1507.913669
std,0.495312,8808.996264
min,0.0,0.0
5%,0.9,0.0
10%,1.0,0.0
25%,2.0,0.0
50%,2.0,0.0
75%,2.0,0.0
80%,2.0,0.0
