# Using the Switching Regression to be Robust to Misclassification for Ethiopian Maize Adoption

In [2]:
%load_ext autoreload

In [3]:
%autoreload 2
import pandas as pd
import numpy as np
from itertools import permutations
import sys
sys.path.append("../../uganda-uber-switching-reg/uganda_uber_switching_reg")
from mle import DriverSpecificProbUberMLE


In [4]:
# Load the full panel from the zipped Stata file.
df = pd.read_stata("../data/full_panel.zip")

# Build one adoption trajectory per holder and one-hot encode it.
# Only holders with exactly three non-missing `impmaize` observations are kept.
trajectories = (
    df
    .dropna(subset=['impmaize'])
    # groupby keeps rows in their incoming order inside each group, so order the
    # rows by wave first; otherwise the trajectory string would depend on the
    # row order of the file instead of on time.
    .sort_values('wave', kind='stable')
    .groupby(['holder_id'])['impmaize']
    .agg(trajectories=list)
    .assign(len_traj=lambda frame: frame['trajectories'].map(len))
    .query("len_traj == 3")
    .drop(['len_traj'], axis=1)
    # Stringify the list so that get_dummies treats each trajectory as one category.
    .assign(trajectories=lambda frame: frame['trajectories'].astype(str))
    .pipe(pd.get_dummies)
    # Strip the list punctuation from the dummy names,
    # e.g. "trajectories_[0.0, 1.0, 0.0]" -> "trajectories_010".
    .rename(lambda x: x.replace('.0', '').replace(',', '').replace('[', '').replace(']', '').replace(' ', ''), axis=1)
    )

# Attach the trajectory dummies to every panel row of the same holder.
# The inner join drops holders that have no three-period trajectory.
merged_df = (
    df
    .merge(trajectories,
           how='inner',
           left_on=['holder_id'],
           right_index=True)
    )




One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.


In [5]:
# Three alternative 2x2 misclassification matrices over the binary adoption
# states 0/1. Rows and columns are both labelled 0 and 1, and every row sums to 1.

purity_95 = pd.DataFrame(
    [[0.8630, 0.1370],
     [0.2958, 0.7042]],
    index=[0, 1],
    columns=[0, 1],
)

dna_dtm = pd.DataFrame(
    [[0.9709, 0.0291],
     [0.8678, 0.1322]],
    index=[0, 1],
    columns=[0, 1],
)

dna_2010 = pd.DataFrame(
    [[0.8590, 0.1410],
     [0.6281, 0.3719]],
    index=[0, 1],
    columns=[0, 1],
)

In [6]:
# Bare trajectory labels: the dummy column names with the "trajectories_" prefix removed.
trajectory_nums = [col.replace("trajectories_", '') for col in trajectories.columns]

In [7]:
# Turn the static per-period misclassification probabilities into a
# probability for each pair of trajectories. With a binary state observed over
# three periods there are at most 2^3 trajectories, so the trajectory-level
# matrix is at most 2^3 x 2^3 (only trajectories present in the data appear).
# `trajectory_nums` (computed in the previous cell) supplies the default labels.

def confusion_matrix(misclass_mat, trajectories=None):
    """Build a trajectory-level confusion matrix from a per-period one.

    Each cell is the product, over the positions of the two trajectory labels,
    of ``misclass_mat.loc[row_state, col_state]``. If every row of
    ``misclass_mat`` sums to 1 and all trajectories of a given length are
    present, every row of the result sums to 1 as well.

    Parameters
    ----------
    misclass_mat : pandas.DataFrame
        Square matrix whose index and columns are the integer states
        (0 and 1 for the matrices defined in this notebook).
    trajectories : sequence of str, optional
        Trajectory labels such as ``"010"``, one digit per period. Defaults to
        the module-level ``trajectory_nums``. All labels must have the same
        length; any length is supported.

    Returns
    -------
    pandas.DataFrame
        Float matrix indexed by trajectory label on both axes. The row label
        supplies the first index into ``misclass_mat`` and the column label
        the second.

    Raises
    ------
    ValueError
        If the labels do not all have the same length.
    """
    if trajectories is None:
        trajectories = trajectory_nums
    labels = list(trajectories)

    # Parse every label once instead of once per cell.
    states = {label: [int(digit) for digit in label] for label in labels}
    if len({len(seq) for seq in states.values()}) > 1:
        raise ValueError("all trajectory labels must have the same length")

    def _joint_probability(row_label, col_label):
        # Multiply the per-period probabilities from left to right.
        prob = 1.0
        for row_state, col_state in zip(states[row_label], states[col_label]):
            prob *= misclass_mat.loc[row_state, col_state]
        return prob

    cells = [[_joint_probability(row, col) for col in labels] for row in labels]

    # An explicit float dtype keeps `.values` a numeric array rather than an
    # object array, which is what filling an empty DataFrame cell by cell gives.
    return pd.DataFrame(cells, index=labels, columns=labels, dtype=float)
    

In [8]:
# Display the trajectory-level confusion matrix implied by purity_95.
confusion_matrix(purity_95)

Unnamed: 0,000,001,010,011,100,101,110,111
0,0.642736,0.102033,0.102033,0.016198,0.102033,0.016198,0.016198,0.002571
1,0.220303,0.524466,0.034973,0.083258,0.034973,0.083258,0.005552,0.013217
10,0.220303,0.034973,0.524466,0.083258,0.034973,0.005552,0.083258,0.013217
11,0.07551,0.179765,0.179765,0.42796,0.011987,0.028537,0.028537,0.067938
100,0.220303,0.034973,0.034973,0.005552,0.524466,0.083258,0.083258,0.013217
101,0.07551,0.179765,0.011987,0.028537,0.179765,0.42796,0.028537,0.067938
110,0.07551,0.011987,0.179765,0.028537,0.179765,0.028537,0.42796,0.067938
111,0.025882,0.061616,0.061616,0.146687,0.061616,0.146687,0.146687,0.349211


In [9]:
# List the columns available after merging the trajectory dummies onto the panel.
merged_df.columns.tolist()

['holder_id',
 'household_id',
 'seedtype',
 'harvest_kg',
 'parcesizeHA',
 'parcesizeHA_wiz',
 'urea_kg',
 'dap_kg',
 'hhlabor',
 'hiredmen',
 'hiredwomen',
 'hiredchildren',
 'hiredlabor',
 'labcostsmen',
 'labcostswomen',
 'labcostschildren',
 'laborcosts',
 'othhlabor',
 'hiredmen_harv',
 'hiredwomen_harv',
 'hiredchildren_harv',
 'hiredlabor_harv',
 'labcostsmen_harv',
 'labcostswomen_harv',
 'labcostschildren_harv',
 'laborcosts_harv',
 'hhlabor_harv',
 'othhlabor_harv',
 'cropcutfresh_KG',
 'cropcutdry_KG',
 'intercropshare',
 'damaged_share',
 'pctcropdamage',
 'intercrop',
 'prevented',
 'pesticide',
 'herbicide',
 'fungicide',
 'damaged',
 'wave',
 'cropmethod',
 'fieldslope',
 'irrigationsource',
 'durea',
 'ddap',
 'dcompost',
 'dorganicfert',
 'dirrigation',
 'typecrop',
 'lessthanplanted',
 'dcropdamage',
 'impmaize',
 'YIELD_cropcutfresh',
 'YIELD_cropcutdry',
 'YIELD_cropcutfresh_tr',
 'YIELD_cropcutdry_tr',
 'YIELD_selfr',
 'YIELD_selfr_tr',
 'impmaizehh',
 'age_head',

In [10]:
# Collapse the one-hot `trajectories_*` dummies back into a single
# `trajectories` label column (e.g. "010") on every panel row.
# Exactly one dummy is set per row, so idxmax across those columns returns the
# name of the active dummy. Computing the label in place avoids a self-merge on
# (holder_id, wave), which would multiply rows if that key were ever not unique.

trajectory_df = (
    merged_df
    .assign(
        trajectories=lambda frame: (
            frame
            .filter(like='trajectories_')
            .idxmax(axis=1)
            .str.replace('trajectories_', '', regex=False)
        )
    )
    # Keep the keys and the label first, followed by the remaining columns.
    .pipe(lambda frame: frame[
        ['holder_id', 'wave', 'trajectories']
        + [col for col in frame.columns
           if col not in ('holder_id', 'wave', 'trajectories')]
    ])
    .reset_index(drop=True)
    )

In [56]:
# Pair every observation with each distinct (trajectory, impmaize) combination
# seen in the same wave. The observation's own values get the `_true` suffix and
# the candidate values get `_misclass`.
full_df = (
    trajectory_df
    .merge(
        trajectory_df.loc[:, ['wave', 'trajectories', 'impmaize']].drop_duplicates(),
        on=['wave'],
        suffixes=('_true', '_misclass'),
    )
    .set_index(['holder_id', 'wave', 'trajectories_misclass'])
    # Drop zero yields so the log below is finite.
    .loc[lambda frame: frame['YIELD_selfr_tr'] != 0]
    .assign(log_yield=lambda frame: np.log(frame['YIELD_selfr_tr']))
    # Keep only rows where the outcome and all model covariates are present.
    .dropna(subset=['log_yield', 'impmaize_misclass', 'yrseduc', "age_head", "sex_head",  'title', "parcesizeHA", "hhlabor", "hiredlabor"])
    )

In [57]:
# One row per (holder_id, wave) holding only the one-hot trajectory dummies.
classifier_pred = (
    full_df
    .filter(like='trajectories_')
    # The candidate-trajectory index level and the true label are not needed here.
    .reset_index('trajectories_misclass', drop=True)
    .drop(columns=['trajectories_true'])
    # The dummies repeat across candidate trajectories; keep one copy per key.
    .reset_index()
    .drop_duplicates()
    .set_index(['holder_id', 'wave'])
    )

In [59]:
# Build the model (DriverSpecificProbUberMLE, imported from the local `mle` module).
# The outcome is the inverse hyperbolic sine of YIELD_selfr_tr; the candidate adoption
# status `impmaize_misclass` enters with the other covariates listed in the formula.
# `cm` is the transpose of the trajectory-level confusion matrix built from `dna_dtm`.
# NOTE(review): presumably the transpose matches the orientation the model expects,
# and the columns of `classifier_pred` line up with the rows/columns of `cm` -- confirm in mle.py.
mod = DriverSpecificProbUberMLE.from_formula("np.arcsinh(YIELD_selfr_tr) ~ 1 + impmaize_misclass + yrseduc+ age_head+ sex_head + title + parcesizeHA + hhlabor + hiredlabor", 
                                       data=full_df, 
                                       classifier_pred = classifier_pred,
                                       check_absorbed=False,
                                       cm = confusion_matrix(dna_dtm).values.T)

# Fit with BFGS and a cluster covariance; `fit` returns two result objects, unpacked as `sr` and `pols`.
# The cluster groups are the `trajectories_true` labels from the rows where the candidate
# trajectory equals the true one.
# NOTE(review): this clusters on trajectory label, so there are at most as many clusters as
# distinct trajectories -- confirm this is the intended clustering level.
sr, pols = mod.fit(method='bfgs', cov_type='cluster', 
        cov_kwds = {'groups':full_df.reset_index('trajectories_misclass').query("trajectories_true==trajectories_misclass").trajectories_true})

Initializing...
Creating starting values...
Optimizing...


  return np.log((rnl*(class_ind@cm.T)).sum(axis=1))
  grad[k, :] = (f(*((x+ei,)+args), **kwargs) -
  return np.log((rnl*(class_ind@cm.T)).sum(axis=1))


Optimization terminated successfully.
         Current function value: 2.656954
         Iterations: 45
         Function evaluations: 58
         Gradient evaluations: 51




In [60]:
# Summary table for `sr`, the first result object returned by mod.fit.
sr.summary()

0,1,2,3
Dep. Variable:,np.arcsinh(YIELD_selfr_tr),Log-Likelihood:,-8316.3
Model:,DriverSpecificProbUberMLE,AIC:,16650.0
Method:,Maximum Likelihood,BIC:,16700.0
Date:,"Thu, 07 Jul 2022",,
Time:,17:37:40,,
No. Observations:,3130,,
Df Residuals:,3121,,
Df Model:,8,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4.5061,0.218,20.651,0.000,3.990,5.022
age_head,0.0051,0.004,1.310,0.231,-0.004,0.014
hhlabor,-0.0020,0.002,-1.175,0.278,-0.006,0.002
hiredlabor,0.0005,0.001,0.448,0.668,-0.002,0.003
impmaize_misclass,1.9257,0.110,17.569,0.000,1.667,2.185
parcesizeHA,-0.0785,0.013,-5.830,0.001,-0.110,-0.047
sex_head,-0.0831,0.090,-0.922,0.387,-0.296,0.130
title,0.9415,0.076,12.454,0.000,0.763,1.120
yrseduc,0.0598,0.019,3.089,0.018,0.014,0.106


In [44]:
# Summary for `pols`, the second result object returned by mod.fit (accessed as an attribute, not called).
# NOTE(review): the stored output below carries an earlier execution count (In [44]) than the
# fit cell (In [59]) and reports a different dependent variable (YIELD_cropcutdry_tr) than the
# formula fitted above (YIELD_selfr_tr) -- re-run this cell before relying on the table.
pols.summary

0,1,2,3
Dep. Variable:,np.arcsinh(YIELD_cropcutdry_tr),R-squared:,0.0102
Estimator:,PanelOLS,R-squared (Between):,0.0194
No. Observations:,3255,R-squared (Within):,0.0052
Date:,"Thu, Jul 07 2022",R-squared (Overall):,0.0102
Time:,17:30:44,Log-likelihood,-8461.0
Cov. Estimator:,Unadjusted,,
,,F-statistic:,4.1637
Entities:,1116,P-value,0.0001
Avg Obs:,2.9167,Distribution:,"F(8,3246)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Intercept,2.0205,0.2762,7.3148,0.0000,1.4789,2.5621
age_head,0.0044,0.0043,1.0223,0.3067,-0.0041,0.0129
hhlabor,0.0015,0.0010,1.5616,0.1185,-0.0004,0.0035
hiredlabor,0.0021,0.0016,1.3101,0.1903,-0.0010,0.0052
impmaize_misclass,-0.0034,0.1417,-0.0242,0.9807,-0.2813,0.2744
parcesizeHA,0.0410,0.0404,1.0145,0.3104,-0.0382,0.1202
sex_head,-0.4369,0.1629,-2.6822,0.0074,-0.7563,-0.1175
title,0.4511,0.1201,3.7546,0.0002,0.2155,0.6866
yrseduc,0.0218,0.0215,1.0124,0.3114,-0.0204,0.0640
