In [1]:
import pandas as pd
import statsmodels.formula.api as smf
import numpy as np
import re
from tqdm.notebook import tqdm
from numba import jit

In [2]:
# Load the processed panel from the Stata file into a DataFrame.
panel_path = "../ethiopia_data/data/processed/full_panel.dta"
df = pd.read_stata(panel_path)

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.


In [3]:
# Inspect how the crop identifier column was decoded.
df.dtypes['crop_code']

CategoricalDtype(categories=[          'BARLEY',            'MAIZE',           'MILLET',
                              'OATS',             'RICE',          'SORGHUM',
                              'TEFF',            'WHEAT',            'ROMAN',
                           'CASSAVA',
                  ...
                       'TIMIZ KIMEM',     'OTHER SPICES',     'OTHER PULSES',
                    'OTHER OIL SEED',     'OTHER CEREAL', 'OTHER CASH CROPS',
                            'OTHERS',  'OTHER VEGETABLE',                124,
                                 126],
, ordered=True)

In [4]:
# Build one dummy column per adoption history of improved maize.
# For every maize field (holder_id, parcel_id, field_id) the non-missing
# `impmaize` values are collected into a list; only fields with exactly three
# values are kept. The list is stringified, one-hot encoded, and the column
# labels are cleaned so that e.g. "trajectories_[0.0, 1.0, 0.0]" becomes
# "trajectories_010".
# NOTE(review): the order of values inside each history is the row order of
# `df` -- confirm `df` is sorted by survey wave within field.
field_keys = ['holder_id', 'parcel_id', 'field_id']

trajectories = (
    df
    .query("crop_code == 'MAIZE'")
    .dropna(subset=['impmaize'])
    .groupby(field_keys)['impmaize']
    .agg(trajectories=list)
    .loc[lambda hist: hist['trajectories'].apply(len) == 3]
    # The string form of the list is the category that get_dummies encodes.
    .assign(trajectories=lambda hist: hist['trajectories'].astype(str))
    # dtype=int: pandas >= 2.0 produces bool dummies by default, which the
    # formula interface would treat as categorical and expand under different
    # coefficient names than the "trajectories_XXX" used in later cells.
    .pipe(pd.get_dummies, dtype=int)
    # Strip ".0", commas, brackets and spaces from the dummy labels.
    .rename(columns=lambda label: re.sub(r"\.0|[\[\], ]", "", label))
)

# Attach the trajectory dummies to every maize observation. The inner join
# drops fields that did not yield a complete three-value history above.
merged_df = (
    df
    .query("crop_code == 'MAIZE'")
    .merge(trajectories,
           how='inner',
           left_on=field_keys,
           right_index=True)
)



In [5]:
# Show the second through seventh trajectory dummy columns.
merged_df.filter(like="trajectories_").columns[1:7]

Index(['trajectories_001', 'trajectories_010', 'trajectories_011',
       'trajectories_100', 'trajectories_101', 'trajectories_110'],
      dtype='object')

In [6]:
# Specify one OLS model per yield outcome:
#   arcsinh(outcome) ~ trajectory dummies + (switcher dummies x impmaize), no intercept.
reg_dict = {}

outcomes = merged_df.columns[merged_df.columns.str.contains('YIELD')].tolist()

# Name the trajectory dummies explicitly instead of slicing by position, so a
# trajectory that is absent from the data raises an error rather than silently
# shifting which columns enter the model.
switcher_codes = ['001', '010', '011', '100', '101', '110']
h_switchers = [f"trajectories_{code}" for code in switcher_codes]
h_switchers_int = [f"{col}:impmaize" for col in h_switchers]
# All trajectory dummies except 111, which is left out of the model.
h_no_always = ['trajectories_000'] + h_switchers

missing_dummies = [col for col in h_no_always if col not in merged_df.columns]
if missing_dummies:
    raise KeyError(f"Trajectory dummies not found in merged_df: {missing_dummies}")

# Common estimation sample: rows complete in every outcome and every regressor.
merged_df_dropna = merged_df.dropna(subset=outcomes + ['impmaize'] + h_no_always)

rhs = ' + '.join(h_no_always + h_switchers_int)

for y in outcomes:
    reg_dict[y] = smf.ols(f"np.arcsinh({y}) ~ -1 + {rhs}",
                          data=merged_df_dropna)

In [7]:
# Fit every specified model with a cluster covariance grouped on holder_id
# and keep the fitted results keyed by outcome name.
reg_res_dict = {}
cluster_groups = merged_df_dropna['holder_id']

for outcome, model in reg_dict.items():
    print(f"Trying {outcome}")
    res = model.fit(cov_type='cluster', cov_kwds={'groups': cluster_groups})
    reg_res_dict[outcome] = res


Trying YIELD_cropcutfresh
Trying YIELD_cropcutdry
Trying YIELD_cropcutfresh_tr
Trying YIELD_cropcutdry_tr
Trying YIELD_selfr
Trying YIELD_selfr_tr


In [8]:
# Number of grid points for a phi grid on [-50000, 50000) with step 0.1.
len(np.arange(-50000, 50000, 0.1))

1000000

In [100]:
def weak_id_test(res, start=-100, stop=100, inc=0.1,
                 trajectories=("010", "011", "100", "101", "110"),
                 reference="001"):
    """Grid of per-trajectory t-test p-values over candidate values of phi.

    For every phi on the grid and every trajectory ``traj`` this tests the
    linear restriction

        trajectories_traj - trajectories_ref
            = phi * (trajectories_traj:impmaize - trajectories_ref:impmaize)

    on the coefficients of ``res`` and stores the p-value.

    Parameters
    ----------
    res : fitted results object exposing ``t_test(constraint_string)`` whose
        return value has a ``pvalue`` attribute, with coefficients named
        ``trajectories_<code>`` and ``trajectories_<code>:impmaize``.
    start, stop, inc : grid bounds and step; the grid covers [start, stop).
    trajectories : trajectory codes to compare against ``reference``.
    reference : trajectory code used as the comparison group.

    Returns
    -------
    pandas.DataFrame of p-values, indexed by phi, one column per trajectory.
    """
    trajectories = np.asarray(trajectories)

    # Build the grid as start + i*inc rather than np.arange(start, stop, inc):
    # a float step in arange accumulates rounding error, so grid points (and
    # the returned index) drift and may miss round values such as 0.0.
    n_steps = max(int(np.ceil((stop - start) / inc)), 0)
    ranger = start + inc * np.arange(n_steps)

    mat = np.zeros((ranger.size, trajectories.size))
    for i, phi in enumerate(tqdm(ranger)):
        for j, traj in enumerate(trajectories):
            test = (
                f"trajectories_{traj} - trajectories_{reference} = "
                f"{phi}*(trajectories_{traj}:impmaize - trajectories_{reference}:impmaize)"
            )
            # A single restriction yields one p-value; extract it as a scalar
            # regardless of the array shape it is wrapped in.
            mat[i, j] = np.asarray(res.t_test(test).pvalue).item()

    return pd.DataFrame(data=mat,
                        index=pd.Index(ranger),
                        columns=trajectories)

def weak_id_joint_test(res, start=-100, stop=100, inc=0.1,
                       trajectories=("010", "011", "100", "101", "110"),
                       reference="001"):
    """Grid of joint F-test p-values over candidate values of phi.

    For every phi on the grid this jointly tests, across all ``trajectories``,
    the linear restrictions

        trajectories_traj - trajectories_ref
            = phi * (trajectories_traj:impmaize - trajectories_ref:impmaize)

    on the coefficients of ``res`` and stores the p-value.

    Parameters
    ----------
    res : fitted results object exposing ``f_test(constraint_string)`` whose
        return value has a ``pvalue`` attribute, with coefficients named
        ``trajectories_<code>`` and ``trajectories_<code>:impmaize``.
    start, stop, inc : grid bounds and step; the grid covers [start, stop).
    trajectories : trajectory codes to compare against ``reference``.
    reference : trajectory code used as the comparison group.

    Returns
    -------
    pandas.DataFrame with a single column ``'joint'`` of p-values, indexed by phi.
    """
    trajectories = np.asarray(trajectories)

    # Build the grid as start + i*inc rather than np.arange(start, stop, inc):
    # a float step in arange accumulates rounding error, so grid points (and
    # the returned index) drift and may miss round values such as 0.0.
    n_steps = max(int(np.ceil((stop - start) / inc)), 0)
    ranger = start + inc * np.arange(n_steps)

    mat = np.zeros(ranger.size)
    for i, phi in enumerate(tqdm(ranger)):
        # One restriction per trajectory, tested jointly.
        joint_test = ' , '.join(
            f"(trajectories_{traj} - trajectories_{reference} = "
            f"{phi}*(trajectories_{traj}:impmaize - trajectories_{reference}:impmaize))"
            for traj in trajectories
        )
        mat[i] = np.asarray(res.f_test(joint_test).pvalue).item()

    return pd.DataFrame(data=mat,
                        index=pd.Index(ranger),
                        columns=['joint'])

def phi_ci(weak_id_df, alpha=0.05):
    """Bounds of the set of phi values that are not rejected at level ``alpha``.

    ``weak_id_df`` holds p-values indexed by candidate phi (one column per
    test). For each column the smallest and largest phi whose p-value exceeds
    ``alpha`` are reported. NaN p-values count as rejected.

    If the non-rejected set is not contiguous, the bounds are its outer
    envelope. A bound that lands on the first or last grid point means the
    set may extend beyond the grid.

    Parameters
    ----------
    weak_id_df : pandas.DataFrame of p-values indexed by phi.
    alpha : significance level; a phi is kept when its p-value is > alpha.

    Returns
    -------
    pandas.DataFrame with rows ``'min'`` and ``'max'`` and the same columns as
    ``weak_id_df``. Columns with no non-rejected phi are left as NaN.
    """
    phi_df = pd.DataFrame(
        index=['min', 'max'],
        columns=weak_id_df.columns
    )

    for col in weak_id_df.columns:
        # Select the phi values (index labels), not the p-values themselves:
        # the interval is the range of phi for which the test does not reject.
        not_rejected = (weak_id_df[col] > alpha).to_numpy()
        accepted_phi = weak_id_df.index[not_rejected]

        if accepted_phi.empty:
            print(f"No phi on the grid has a p-value above {alpha} for "
                  f"'{col}'; bounds left as NaN")
            continue

        phi_df.loc['min', col] = accepted_phi.min()
        phi_df.loc['max', col] = accepted_phi.max()

    return phi_df
        

In [76]:
# Per-trajectory p-value grid for the fresh crop-cut yield regression.
fresh_cropcut_res = reg_res_dict['YIELD_cropcutfresh']
weak_id = weak_id_test(fresh_cropcut_res)

  0%|          | 0/2000 [00:00<?, ?it/s]

In [91]:
# Joint-test p-value grid for the fresh crop-cut yield regression.
weak_id_joint = weak_id_joint_test(res=reg_res_dict['YIELD_cropcutfresh'])

  0%|          | 0/2000 [00:00<?, ?it/s]



In [99]:
# Summarise the joint-test grid into min/max phi rows.
phi_ci(weak_id_df=weak_id_joint)

Might be NaNs: 
                  phi_min = nan
                  phi_max = nan
                  


Unnamed: 0,joint
min,
max,


In [95]:
# Smallest joint p-value above 0.05; NaN means no phi on the grid clears 0.05.
# NOTE(review): the original line ended in a dangling "." (a syntax error);
# ".min()" is a guess consistent with the displayed Series output -- confirm.
weak_id_joint[weak_id_joint.apply(lambda x: x > 0.05)].min()

joint   NaN
dtype: float64