In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from functools import reduce
import statsmodels.formula.api as smf
from pkg import pearson_corr, nakagawa_r2, detrend_group
import os
# Working directory against which every relative "./data/..." path in this notebook resolves.
# Defaults to the original location; set the FAO_AG_CHECK_DIR environment variable to run the
# notebook from another checkout without editing this cell.
os.chdir(os.environ.get("FAO_AG_CHECK_DIR", "/Users/caropark/FAO_ag_check_code/"))

In [2]:
# GDP table: rename `value` to `gdp`, then average it per country (one row per iso_a3).
gdp = (
    pd.read_csv("./data/wb_gdp_per_cap.csv")
    .rename(columns={"value": "gdp"})
    .groupby(["iso_a3"])
    .agg(avg_gdp=("gdp", "mean"))
    .reset_index()
)

# Flag table: count the non-null `year` entries for each country/crop pair.
flag = (
    pd.read_csv("./data/faostat_all_flags.csv")
    .rename(columns={"Year": "year"})
    .groupby(["iso_a3", "cropname"])[["year"]]
    .agg(flag_sum=("year", "count"))
    .reset_index()
)

# MODIS-derived table: keep only the two covariates and the merge keys.
modis = pd.read_csv("./data/modis_vars.csv").loc[
    :, ["total_harvarea", "cropland_fraction", "cropname", "iso_a3"]
]

# Disabled corruption-score covariate, retained as written for reference.
# corrupt = pd.read_csv("./data/corruption.csv").filter(regex= "CPI|Country|ISO3")
# corrupt = pd.melt(corrupt, id_vars=['Country', "ISO3"]).drop("variable", axis=1)
# corrupt.columns= ["corr_country", "iso_a3", "corrupt_score"]
# country_key= pd.read_csv("./data/country_key.csv")[['iso_a3']]
# corrupt = corrupt.merge(country_key, how="left", on="iso_a3").iloc[:,1:].groupby("iso_a3").mean().reset_index()

# Climate table: mean `sm` and `tmax` for each country/crop pair.
clim = (
    pd.read_csv("./data/dt_clim_vars.csv")
    .groupby(["iso_a3", "cropname"])[["sm", "tmax"]]
    .agg(avg_sm=("sm", "mean"), avg_tmax=("tmax", "mean"))
    .reset_index()
)

In [None]:
# yields = pd.read_csv("./data/yield_comparison.csv")[["cropname", "year", "yield", "csif", "whichlag", "country", "iso_a3"]]
# yield_r2 = yields.groupby(['iso_a3', 'cropname', 'whichlag']).apply(lambda group: pd.Series({"r2": pearson_corr(group, x="csif",y="yield")**2})).reset_index()
# yield_r2 = yield_r2[yield_r2['r2']!=1]


In [3]:
# Read the yield/CSIF comparison table and run both raw series through `detrend_group`
# with log_transform=True, writing the results to `yield_log_dt` and `csif_log_dt`.
yields = (
    pd.read_csv("./data/yield_comparison.csv")[
        ["cropname", "year", "yield_og", "csif_og", "whichlag", "country", "iso_a3"]
    ]
    .pipe(detrend_group, "yield_og", "yield_log_dt", log_transform=True)
    .pipe(detrend_group, "csif_og", "csif_log_dt", log_transform=True)
)

# One squared correlation per (country, crop, lag) group, then discard groups whose r2 is
# exactly 1.
# NOTE(review): presumably r2 == 1 marks degenerate groups (e.g. only two points) — confirm.
yield_r2 = (
    yields.groupby(["iso_a3", "cropname", "whichlag"])
    .apply(
        lambda grp: pd.Series(
            {"r2": pearson_corr(grp, x="csif_log_dt", y="yield_log_dt") ** 2}
        )
    )
    .reset_index()
    .loc[lambda frame: frame["r2"] != 1]
)


In [4]:
def flexible_merge(left, right):
    """Outer-join two frames on `iso_a3`, adding `cropname` as a key when both have it.

    Parameters
    ----------
    left, right : pandas.DataFrame
        Frames that each contain an `iso_a3` column.

    Returns
    -------
    pandas.DataFrame
        The outer merge of `left` and `right` on the shared key columns.
    """
    both_have_crop = all("cropname" in frame.columns for frame in (left, right))
    join_cols = ["iso_a3", "cropname"] if both_have_crop else ["iso_a3"]
    return left.merge(right, how="outer", on=join_cols)

# Frames to combine, in merge order; `flexible_merge` picks the join keys for each step.
dfs = [yield_r2, gdp, flag, modis, clim]

# Outer-merge everything, then keep only rows that have both a response (r2) and soil moisture.
merged = (
    reduce(flexible_merge, dfs)
    .dropna(subset=["r2", "avg_sm"])
    .drop_duplicates()
    .reset_index(drop=True)
)
# Country/crop pairs with no row in the flag table come out of the outer merge as NaN;
# treat them as zero flags.
merged["flag_sum"] = merged["flag_sum"].fillna(0)

# Predictors used by the models below.
coeffs = ["avg_gdp","flag_sum","total_harvarea","cropland_fraction","avg_sm","avg_tmax", "whichlag"]
# Drop rows missing any predictor, the response, or the grouping variable.
# `list.extend` returns None, so the previous `coeffs.copy().extend([...])` passed subset=None
# and dropna silently fell back to checking every column; build the list by concatenation.
merged = merged.dropna(subset=coeffs + ["r2", "cropname"]).reset_index(drop=True)


In [7]:
# Display the raw predictors, leaving out the `whichlag` column.
merged[[name for name in coeffs if name != "whichlag"]]


Unnamed: 0,avg_gdp,flag_sum,total_harvarea,cropland_fraction,avg_sm,avg_tmax
0,415.494946,0.0,49.195750,0.044689,0.192102,19.485820
1,415.494946,0.0,37.996323,0.044689,0.131341,28.793948
2,415.494946,11.0,10.653798,0.044584,0.116790,27.590095
3,415.494946,0.0,92.858919,0.044689,0.141827,27.841444
4,415.494946,20.0,1.200465,0.044535,0.138429,29.859067
...,...,...,...,...,...,...
1739,1197.225299,1.0,50.380743,0.047256,0.191992,27.177286
1740,1197.225299,0.0,19.508768,0.045191,0.209904,27.364554
1741,1197.225299,0.0,9.823858,0.056159,0.239850,27.077391
1742,1197.225299,19.0,0.232696,0.045161,0.148736,27.714592


In [8]:
# Every predictor except `whichlag`; select them once instead of rebuilding the
# same sub-frame for each statement.
numeric_coeffs = [name for name in coeffs if name != "whichlag"]
raw_predictors = merged[numeric_coeffs]

# Min/max of the raw predictors. The original bare expression was not the cell's last
# statement, so its result was computed and thrown away; keep it in a variable for inspection.
predictor_ranges = raw_predictors.agg(["min", "max"])

# z-score each predictor: (x - mean) / std, with columns renamed `<name>_z`.
scaled = raw_predictors.apply(lambda col: (col - col.mean()) / col.std()).add_suffix("_z")

# Swap the raw predictor columns for their standardized versions (drop by explicit labels
# rather than passing a DataFrame as `columns`).
# NOTE(review): this rebinds `merged` without the raw columns, so the cell cannot be re-run
# (and the raw-predictor display cell cannot run after it) without re-running the merge cell.
merged = merged.drop(columns=numeric_coeffs).join(scaled)
merged

Unnamed: 0,iso_a3,cropname,whichlag,r2,avg_gdp_z,flag_sum_z,total_harvarea_z,cropland_fraction_z,avg_sm_z,avg_tmax_z
0,AFG,Barley,yield_dt,0.129926,-0.63623,-0.665275,-0.157670,-0.727013,-0.926261,-1.086311
1,AFG,Maize,yield_lag,0.004224,-0.63623,-0.665275,-0.169423,-0.727013,-1.975090,0.440403
2,AFG,Millet,yield_lead,0.053800,-0.63623,0.838866,-0.198118,-0.727560,-2.226270,0.242947
3,AFG,Potatoes,yield_lag,0.009843,-0.63623,-0.665275,-0.111847,-0.727013,-1.794098,0.284174
4,AFG,Pulses nes,yield_lag,0.012962,-0.63623,2.069526,-0.208039,-0.727815,-1.852744,0.615103
...,...,...,...,...,...,...,...,...,...,...
1739,ZWE,Sorghum,yield_dt,0.071812,-0.58857,-0.528535,-0.156426,-0.713676,-0.928160,0.175239
1740,ZWE,Soybeans,yield_lag,0.099182,-0.58857,-0.665275,-0.188825,-0.724405,-0.618977,0.205954
1741,ZWE,Sunflower seed,yield_lag,0.000097,-0.58857,-0.665275,-0.198989,-0.667429,-0.102059,0.158854
1742,ZWE,Sweet potatoes,yield_dt,0.005737,-0.58857,1.932786,-0.209055,-0.724559,-1.674833,0.263367


In [9]:
# Mixed model of r2 on standardized GDP only, grouped by crop.
mod_yields_r2_gdp = smf.mixedlm("r2 ~ avg_gdp_z", merged, groups="cropname")

# Fit once and reuse the result: the summary and the R² figures then come from the same
# fit, and the model is not estimated a second time.
res_yields_r2_gdp = mod_yields_r2_gdp.fit()
print(res_yields_r2_gdp.summary())
nakagawa_r2(res_yields_r2_gdp)


        Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: r2      
No. Observations: 1744    Method:             REML    
No. Groups:       19      Scale:              0.0220  
Min. group size:  47      Log-Likelihood:     829.3690
Max. group size:  150     Converged:          Yes     
Mean group size:  91.8                                
------------------------------------------------------
             Coef. Std.Err.   z    P>|z| [0.025 0.975]
------------------------------------------------------
Intercept    0.106    0.009 11.333 0.000  0.087  0.124
avg_gdp_z    0.017    0.004  4.643 0.000  0.010  0.024
cropname Var 0.001    0.004                           

Marginal R² = 0.05872018365132591
Conditional R² = 0.1147891385784761




In [10]:
# Mixed model of r2 on all standardized predictors plus `whichlag`, grouped by crop.
mod_yields_r2 = smf.mixedlm(
    "r2 ~ avg_gdp_z + flag_sum_z + whichlag + total_harvarea_z + cropland_fraction_z  + avg_sm_z + avg_tmax_z",
    merged,
    groups="cropname",
)

# Fit once and reuse the result: the summary and the R² figures then come from the same
# fit, and the model is not estimated a second time.
res_yields_r2 = mod_yields_r2.fit()
print(res_yields_r2.summary())
nakagawa_r2(res_yields_r2)

              Mixed Linear Model Regression Results
Model:                MixedLM     Dependent Variable:     r2      
No. Observations:     1744        Method:                 REML    
No. Groups:           19          Scale:                  0.0188  
Min. group size:      47          Log-Likelihood:         941.5102
Max. group size:      150         Converged:              Yes     
Mean group size:      91.8                                        
------------------------------------------------------------------
                       Coef.  Std.Err.    z    P>|z| [0.025 0.975]
------------------------------------------------------------------
Intercept               0.157    0.008  19.513 0.000  0.141  0.173
whichlag[T.yield_lag]  -0.088    0.008 -10.508 0.000 -0.104 -0.071
whichlag[T.yield_lead] -0.099    0.008 -12.147 0.000 -0.116 -0.083
avg_gdp_z               0.009    0.004   2.333 0.020  0.001  0.016
flag_sum_z             -0.006    0.004  -1.561 0.119 -0.013  0.001
total_harv



In [8]:
# Fit the full model and gather its estimates into one table for LaTeX export.
res = mod_yields_r2.fit()
params, conf = res.params, res.conf_int()

# NOTE(review): the statistic and p-value columns are labelled "t" / "P>|t|" here, while the
# model's own printed summary heads them "z" / "P>|z|" — confirm which label the table should carry.
table_columns = {
    "Coef.": params,
    "Std.Err.": res.bse,
    "t": res.tvalues,
    "P>|t|": res.pvalues,
    "CI Lower": conf[0],  # first column of conf_int()
    "CI Upper": conf[1],  # second column of conf_int()
}
summary_df = pd.DataFrame(table_columns)

latex_table = summary_df.to_latex(float_format="%.3f")
print(latex_table)


\begin{tabular}{lrrrrrr}
\toprule
{} &  Coef. &  Std.Err. &       t &  P>|t| &  CI Lower &  CI Upper \\
\midrule
Intercept              &  0.157 &     0.008 &  19.406 &  0.000 &     0.142 &     0.173 \\
whichlag[T.yield\_lag]  & -0.088 &     0.008 & -10.473 &  0.000 &    -0.105 &    -0.072 \\
whichlag[T.yield\_lead] & -0.100 &     0.008 & -12.135 &  0.000 &    -0.117 &    -0.084 \\
avg\_gdp\_z              &  0.008 &     0.004 &   2.147 &  0.032 &     0.001 &     0.016 \\
flag\_sum\_z             & -0.006 &     0.004 &  -1.598 &  0.110 &    -0.013 &     0.001 \\
total\_harvarea\_z       &  0.014 &     0.003 &   4.105 &  0.000 &     0.007 &     0.021 \\
cropland\_fraction\_z    &  0.019 &     0.004 &   5.347 &  0.000 &     0.012 &     0.027 \\
avg\_sm\_z               & -0.008 &     0.004 &  -2.191 &  0.028 &    -0.015 &    -0.001 \\
avg\_tmax\_z             & -0.007 &     0.005 &  -1.305 &  0.192 &    -0.016 &     0.003 \\
cropname Var           &  0.039 &     0.018 &   2.190 &  0.029 

  print(summary_df.to_latex(float_format="%.3f"))
