# Panel data for ENSO and country-level economics
#### Christopher Callahan
#### Christopher.W.Callahan.GR@dartmouth.edu

#### Mechanics
Dependencies

In [1]:
import xarray as xr
import numpy as np
import sys
import os
import datetime
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import matplotlib.colors as colors
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches
import seaborn as sns
from rasterio import features
from affine import Affine
import geopandas as gp
import descartes
import cartopy as cart
import cartopy.crs as ccrs
from cartopy.feature import ShapelyFeature
from scipy import signal, stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

Data locations

In [2]:
# Input/output directories; every path hangs off the same relative data root.
_data_root = "../Data/"
loc_gdp = _data_root + "WDI/"
loc_shp = _data_root + "ProcessedCountryShapefile/"
loc_regions = _data_root + "Regions/"
loc_pwt = _data_root + "PWT/"
loc_precip = _data_root + "CountryPrecip/"
loc_temp = _data_root + "CountryTemp/"
loc_enso = _data_root + "ENSO_Indices/"
loc_teleconnections = _data_root + "Teleconnections/"
loc_income_class = _data_root + "Income_Classes/"
loc_out = _data_root + "Panel/"


Shapefile

In [3]:
# Country shapefile and the array of values in its ISO3 column.
shp = gp.read_file(loc_shp)
iso_shp = shp["ISO3"].values

Years

In [4]:
# y1/y2 bound the panel years and appear in the input/output file names.
# y1_in/y2_in are not referenced by the cells shown here
# (presumably the span of the raw input data -- confirm before relying on it).
y1_in, y2_in = 1950, 2019
y1, y2 = 1960, 2019

Warnings

In [5]:
import warnings
warnings.filterwarnings("ignore",category=FutureWarning,message="'base' in .resample()")

#### Analysis

Read ENSO indices

In [7]:
# Monthly ENSO indices for the analysis period, one variable per index.
enso_path = f"{loc_enso}obs_ENSO_indices_monthly_{y1}-{y2}.nc"
enso_in = xr.open_dataset(enso_path)
E, C = enso_in["e_index"], enso_in["c_index"]
nino3, nino34 = enso_in["nino3"], enso_in["nino34"]

Convert ENSO indices to DJF annual

In [8]:
def monthly_to_yearly_mean(x):
    """Return yearly means of monthly data, weighted by days in each month.

    x must be an xarray DataArray with a datetime ``time`` coordinate.

    A plain ``x.resample(time="YS").mean(dim="time")`` is not used because
    it would give every month the same weight regardless of its length.

    Each month's weight is its day count divided by the total day count of
    the months present in that calendar year. The weighted sum is then
    divided by the summed weights of the non-null entries, so missing
    months do not bias the mean (this relies on the resampled sum skipping
    NaN values, xarray's default for float data).
    """
    month_lengths = x.time.dt.days_in_month
    lengths_by_year = month_lengths.groupby("time.year")
    weights = lengths_by_year/lengths_by_year.sum()

    # 1.0 where x holds data and 0.0 where it is missing
    valid = xr.where(x.isnull(),0.0,1.0)

    weighted_total = (x*weights).resample(time="YS").sum(dim="time")
    valid_weight_total = (valid*weights).resample(time="YS").sum(dim="time")
    return weighted_total/valid_weight_total

In [9]:
# Boolean mask along the ENSO time axis selecting calendar years y1..y2.
enso_year = E.time.dt.year
enso_time_ind = (enso_year>=y1)&(enso_year<=y2)

In [10]:
def _shift_one_month(index_monthly):
    """Select years y1..y2, move every value one time step later, relabel time.

    After the shift each value sits one step later on the time axis, so for
    monthly data beginning in January the slots labelled Jan-Mar hold the
    preceding December plus January and February. The first step has no
    predecessor and is left empty by the shift. The time axis is relabelled
    as month starts from January y1 to December y2, which assumes the
    selected series has exactly that many monthly steps.
    """
    shifted = index_monthly[enso_time_ind].shift(time=1)
    shifted.coords["time"] = pd.date_range(start=str(y1)+"-01-01",end=str(y2)+"-12-31",freq="MS")
    return shifted


def _first_three_months_mean(shifted):
    """Yearly weighted mean over the months labelled January through March."""
    return monthly_to_yearly_mean(shifted[shifted.time.dt.month<=3])


Eshift = _shift_one_month(E)
E_yr = _first_three_months_mean(Eshift)

Cshift = _shift_one_month(C)
C_yr = _first_three_months_mean(Cshift)

nino3shift = _shift_one_month(nino3)
nino3_yr = _first_three_months_mean(nino3shift)

nino34shift = _shift_one_month(nino34)
nino34_yr = _first_three_months_mean(nino34shift)

Read in country temp and precip

In [20]:
# ObsEnsemble
# Monthly country-level temperature (Berkeley Earth) plus its "_std" companion file.
y1_temp, y2_temp = 1900, 2019
country_temp_monthly = xr.open_dataarray(
    f"{loc_temp}BerkeleyEarth_country_temp_monthly_{y1_temp}-{y2_temp}.nc")
country_temp_monthly_std = xr.open_dataarray(
    f"{loc_temp}BerkeleyEarth_country_temp_monthly_std_{y1_temp}-{y2_temp}.nc")

# Monthly country-level precipitation (GPCC) plus its "_std" companion file.
y1_precip, y2_precip = 1900, 2019
country_precip_monthly = xr.open_dataarray(
    f"{loc_precip}GPCC_country_precip_monthly_{y1_precip}-{y2_precip}.nc")
country_precip_monthly_std = xr.open_dataarray(
    f"{loc_precip}GPCC_country_precip_monthly_std_{y1_precip}-{y2_precip}.nc")

In [21]:
import warnings
warnings.filterwarnings("ignore",message="'base' in .resample",category=FutureWarning)

# Restrict the second axis (time) to the analysis window, then take
# month-length-weighted annual means.
analysis_window = slice(str(y1)+"-01-01",str(y2)+"-12-31")
country_temp = monthly_to_yearly_mean(country_temp_monthly.loc[:,analysis_window])
country_precip = monthly_to_yearly_mean(country_precip_monthly.loc[:,analysis_window])

Now read in GDP

In [22]:
# World Bank GDP per capita: wide table (one column per year) -> long table.
wdi_wide = pd.read_csv(loc_gdp+"API_NY.GDP.PCAP.KD_DS2_en_csv_v2_3630804.csv")
wdi_panel = wdi_wide.drop(columns=["Country Name","Indicator Name","Indicator Code"])
wdi_panel_long1 = wdi_panel.melt(id_vars="Country Code",var_name="year",value_name="gdppc")
wdi_panel_long1["year"] = wdi_panel_long1["year"].astype(int)

# keep only the analysis years and use "iso" as the country key
wdi_in_window = (wdi_panel_long1["year"]>=y1)&(wdi_panel_long1["year"]<=y2)
wdi_panel_long = wdi_panel_long1.rename(columns={"Country Code":"iso"}).loc[wdi_in_window,:]

Build panel

In [23]:
# One row per (country, year): the full year sequence is tiled once per
# country and each country code is repeated once per year.
gpc_iso = np.unique(wdi_panel_long["iso"].values)
nc = len(gpc_iso)
years = np.arange(y1,y2+1)
years_repeat = np.tile(years,nc).ravel()
iso_repeat = np.repeat(gpc_iso,years.size)

Create the empty country-year panel data frame

In [24]:
# Stacking years with ISO codes yields a single string-typed array,
# so the year column is converted back to integers afterwards.
panel_cells = np.array([years_repeat,iso_repeat]).T
panel = pd.DataFrame(panel_cells,columns=["year","iso"])
panel["year"] = panel["year"].to_numpy().astype(int)

Add ENSO indices

In [26]:
def _yearly_index_frame(index_yr, column):
    """Re-index a yearly ENSO series by integer year.

    Returns the year-indexed DataArray (named ``column``) together with its
    two-column data frame ("year", ``column``). Each series supplies its own
    year coordinate; previously the C index borrowed the E index's time axis.
    """
    index_panel = xr.DataArray(index_yr.values,
                               coords=[index_yr.time.dt.year.values],
                               dims=["year"])
    index_panel.name = column
    return index_panel, index_panel.to_dataframe().reset_index()


E_yr_panel, E_yr_df = _yearly_index_frame(E_yr,"e")
C_yr_panel, C_yr_df = _yearly_index_frame(C_yr,"c")
nino3_yr_panel, nino3_yr_df = _yearly_index_frame(nino3_yr,"nino3")
nino34_yr_panel, nino34_yr_df = _yearly_index_frame(nino34_yr,"nino34")

# ENSO indices vary by year only, so every country receives the same values.
for enso_df in (E_yr_df,C_yr_df,nino3_yr_df,nino34_yr_df):
    panel = pd.merge(panel,enso_df,on=["year"],how="left")

Add teleconnections

In [27]:
# Year span used in the teleconnection file name.
y1_tc, y2_tc = 1960, 2019

In [28]:
tc_ds = xr.open_dataset(loc_teleconnections+"ENSO_observed_teleconnections_DJF_"+str(y1_tc)+"-"+str(y2_tc)+".nc")

# (dataset variable, panel column) name templates; "{}" is replaced by the
# ENSO index ("e" or "c"). The order below is the order in which the columns
# are merged into the panel.
tc_columns = [
    # precipitation with sign preserved, not absolute value
    ("p_corr2_{}","p_corr2_{}"),
    # combined correlation
    ("combined_corr_{}","t_p_corr_{}"),
    # combined regression coefficient
    ("combined_reg_{}","t_p_reg_{}"),
    # combined correlation coefficient running
    ("combined_corr_{}_running","t_p_corr_running_{}"),
    # temp running correlation coefficient
    ("t_corr_running_{}","t_corr_running_{}"),
    # precip running correlation coefficient
    ("p_corr_running_{}","p_corr_running_{}"),
    # combined regression coefficient running
    ("combined_reg_{}_running","t_p_reg_running_{}"),
    # cumulative running correlation coefficient
    ("combined_corr_{}_sum","t_p_corr_sum_{}"),
    # cumulative running reg coefficient
    ("combined_reg_{}_sum","t_p_reg_sum_{}"),
    # temp cumulative running correlation coefficient
    ("t_corr_{}_sum","t_corr_sum_{}"),
    # precip cumulative running correlation coefficient
    ("p_corr_{}_sum","p_corr_sum_{}"),
    # only statistically significant sum
    ("combined_corr_{}_sum_sig","t_p_corr_sum_sig_{}"),
]

for ind in ["e","c"]:
    for source_template, column_template in tc_columns:
        tc_var = tc_ds.data_vars[source_template.format(ind)]
        # the DataArray name becomes the column name in the data frame
        tc_var.name = column_template.format(ind)
        tc_var_df = tc_var.to_dataframe().reset_index()
        panel = pd.merge(panel,tc_var_df,on=["iso"],how="left")

Add temp and precip

In [29]:
def _country_year_frame(yearly, column):
    """Re-index a country-by-time yearly array by (iso, integer year).

    Returns the re-indexed DataArray (named ``column``) together with its
    long-format data frame with columns "iso", "year" and ``column``.
    """
    by_year = xr.DataArray(yearly.values,
                           coords=[yearly.iso,yearly.time.dt.year.values],
                           dims=["iso","year"])
    by_year.name = column
    return by_year, by_year.to_dataframe().reset_index()


country_temp_panel, country_temp_df = _country_year_frame(country_temp,"t")
country_precip_panel, country_precip_df = _country_year_frame(country_precip,"p")

for climate_df in (country_temp_df,country_precip_df):
    panel = pd.merge(panel,climate_df,on=["iso","year"],how="left")

Add GDP per capita (GPC) from the World Bank

In [30]:
# The World Bank series enters the panel under the column name "gpc".
wdi_gpc = wdi_panel_long.rename(columns={"gdppc":"gpc"})
panel = panel.merge(wdi_gpc,how="left",on=["iso","year"])

Read in Penn World Tables and add to panel

In [31]:
pwt_in = pd.read_csv(loc_pwt+"pwt10-0.csv",engine="python")
pwt_iso = pwt_in["countrycode"].values
pwt_yr = pwt_in["year"].values

# population is originally in millions; the GDP series are scaled by 1e6 too
pwt_in["population"] = pwt_in["pop"]*1e6
for per_capita_name, gdp_name in (("gpc","rgdpna"),("gpc_ppp","rgdpo")):
    pwt_in[per_capita_name] = (pwt_in[gdp_name]*1e6)/pwt_in["population"]

# PWT variables of interest:
# gdp = rgdpna
# pop = pop
# human capital = hc
# capital stock = rkna

# real consumption in millions = ccon
# real domestic absorption in millions = cda

# more info on PWT (specifically the capital data):
# https://www.rug.nl/ggdc/docs/pwt100-user-guide-to-data-files.pdf
# https://www.rug.nl/ggdc/docs/pwt91_whatsnew.pdf
# https://www.rug.nl/ggdc/docs/pwt91_capitalservices_ipmrevision.pdf
# https://www.rug.nl/ggdc/docs/pwt91_user_guide_to_data_files.pdf
# also ag income

In [32]:
# PWT columns carried into the panel, and the names they take there.
pwt_keep = ["year","countrycode","population","gpc","rgdpna","rkna","rtfpna",
            "hc","labsh","delta","ctfp","emp","rnna","gpc_ppp"]
pwt_new_names = {"countrycode":"iso",
                 "population":"pop_pwt",
                 "gpc":"gpc_pwt",
                 "rgdpna":"gdp_pwt",
                 "rkna":"capital",
                 "rtfpna":"tfp"}
pwt_for_panel = pwt_in.loc[:,pwt_keep].rename(columns=pwt_new_names)

# scale GDP by 1e6, and express rnna per person after the same scaling
pwt_for_panel["gdp_pwt"] = pwt_for_panel["gdp_pwt"]*1e6
pwt_for_panel["cspercap"] = (pwt_for_panel["rnna"]*1e6)/pwt_for_panel["pop_pwt"]

In [33]:
# Left join keeps every panel row; country-years absent from PWT get NaN.
panel = panel.merge(pwt_for_panel,how="left",on=["iso","year"])

Fixed effects: country-specific linear and quadratic time trends, plus a numeric country ID

In [34]:
countries = panel.loc[:,"iso"].values
countries_sorted = list(sorted(set(countries)))
years = panel.loc[:,"year"].values.astype(int)

# For country number i (position in the sorted ISO list) build
#   yi_linear_i    = years since y1 on that country's rows, 0 elsewhere
#   yi_quadratic_i = the square of the above
# together with countrynum, a 1-based numeric country identifier.
# All columns are collected first and attached with a single concat:
# inserting hundreds of columns one at a time fragments the DataFrame and
# triggers pandas' PerformanceWarning.
years_since_start = (years - y1).astype(float)
zrs_ctry = np.zeros(len(years))
trend_columns = {}
for i, iso_code in enumerate(countries_sorted):
    indices = countries == iso_code
    trend_columns["yi_linear_"+str(i)] = np.where(indices,years_since_start,0.0)
    trend_columns["yi_quadratic_"+str(i)] = np.where(indices,years_since_start**2,0.0)
    zrs_ctry[indices] = i+1
trend_columns["countrynum"] = zrs_ctry

trend_frame = pd.DataFrame(trend_columns,index=panel.index)
# Dropping any same-named columns first keeps the cell safe to re-run.
panel = pd.concat([panel.drop(columns=list(trend_columns),errors="ignore"),trend_frame],axis=1)

In [35]:
# countrynum was built as floats; store it as integers.
countrynum_int = panel["countrynum"].to_numpy().astype(int)
panel["countrynum"] = countrynum_int

In [36]:
# Natural logs of the level variables: (new log column, source column).
log_columns = (("lngpc","gpc"),
               ("lngpc_pwt","gpc_pwt"),
               ("lncs","capital"),
               ("lntfp","tfp"))
for log_name, level_name in log_columns:
    panel.loc[:,log_name] = np.log(panel.loc[:,level_name])

Growth (log difference) in GDP per capita, for both the World Bank and PWT series

In [37]:
# Growth = first difference of log GDP per capita within each country,
# taken in the panel's row order (rows are expected to be in ascending year
# order within a country). Each country's first row has no predecessor and
# is NaN. A single grouped diff replaces the two hand-written per-country
# loops that previously did the same thing.
log_diffs = panel.groupby("iso",sort=False)[["lngpc","lngpc_pwt"]].diff()

growth = log_diffs["lngpc"].values
panel.loc[:,"growth"] = growth

growth_pwt = log_diffs["lngpc_pwt"].values
panel.loc[:,"growth_pwt"] = growth_pwt

Additional fractional growth quantities

In [38]:
def add_growth(panel_var_name,final_name,panel):
    """Add the within-country fractional growth of a column to the panel.

    For every country (rows sharing an "iso" value) the new column holds
    (value - previous value) / previous value, where "previous" is the
    preceding row of the same country in the panel's current row order.
    Rows are therefore expected to be in ascending year order within each
    country. The first row of every country has no predecessor and is NaN.

    Parameters
    ----------
    panel_var_name : str
        Name of the existing column to take growth of.
    final_name : str
        Name of the column that receives the growth values.
    panel : pandas.DataFrame
        Panel with an "iso" column; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``panel`` object, with ``final_name`` added or overwritten.
    """
    # Grouped shift/diff do the per-country work in one pass instead of
    # building a boolean mask over the whole panel for every country.
    by_country = panel.groupby("iso",sort=False)[panel_var_name]
    previous = by_country.shift(1)
    change = by_country.diff()
    panel.loc[:,final_name] = (change/previous).to_numpy()
    return panel

In [39]:
# (source column, fractional-growth column) pairs, added in this order.
fractional_growth_columns = (("gpc_pwt","gr_pwt_frac"),
                             ("pop_pwt","gr_pop_frac"),
                             ("capital","gr_cs_frac"),
                             ("tfp","gr_tfp_frac"),
                             ("cspercap","gr_cspc_frac"))
for source_column, growth_column in fractional_growth_columns:
    panel = add_growth(source_column,growth_column,panel)

Lat and lon for spatial clustering if desired

In [40]:
# Attach each country's LAT/LON from the shapefile by ISO3 code; countries
# missing from the shapefile stay NaN. When a code occurs more than once in
# the shapefile, its first row is used.
# One lookup per column replaces a loop that visited every panel row and
# re-assigned the whole country on each visit.
shp_iso = shp["ISO3"].values
shp_first_rows = ~pd.Index(shp_iso).duplicated(keep="first")
for coord_column, shp_column in (("lat","LAT"),("lon","LON")):
    coord_by_iso = pd.Series(shp[shp_column].values[shp_first_rows],
                             index=shp_iso[shp_first_rows])
    # float dtype matches a column that starts out as all-NaN floats
    panel[coord_column] = panel["iso"].map(coord_by_iso).astype(float)

Add regions and year-region combinations

In [41]:
# Region table; the cells below read its ISO3 and RegionCode columns.
regions_path = f"{loc_regions}WPP2019_Regions_Processed.csv"
regions = pd.read_csv(regions_path)

In [42]:
# Attach each country's RegionCode by ISO3 code; countries absent from the
# regions table stay NaN. When a code occurs more than once, its first row
# is used.
# One lookup replaces a loop that built a full-panel boolean mask for every
# single row.
region_iso = regions["ISO3"].values
region_first_rows = ~pd.Index(region_iso).duplicated(keep="first")
region_by_iso = pd.Series(regions["RegionCode"].values[region_first_rows],
                          index=region_iso[region_first_rows])
# Cast to float so the column keeps the dtype of a NaN-initialised column
# even when every country is matched.
panel["region"] = panel["iso"].map(region_by_iso).astype(float)

In [43]:
# Single numeric key per year-region pair: year*1000 + region code.
year_part = panel["year"].values*1000
panel["yr_reg"] = year_part + panel["region"].values

Add world bank low-income/high-income classifications

In [44]:
# Income classification table; the cell below reads its "Code" and
# "Income group" columns.
income_class_path = f"{loc_income_class}incomeclasses.csv"
income_class = pd.read_csv(income_class_path,engine="python")

In [45]:
iso_uq = np.unique(panel.iso.values)

# Collapse the four income classes into two groups. Any other class, and any
# country missing from the table, is left as NaN.
income_group_by_class = {"Low income":"low",
                         "Lower middle income":"low",
                         "High income":"high",
                         "Upper middle income":"high"}

# When a code occurs more than once in the table, its first row is used.
class_codes = income_class["Code"].values
class_first_rows = ~pd.Index(class_codes).duplicated(keep="first")
class_by_iso = pd.Series(income_class["Income group"].values[class_first_rows],
                         index=class_codes[class_first_rows])

# Mapping builds the string column directly. Filling a float NaN column with
# strings relies on silent upcasting, which pandas has deprecated.
panel["income_group"] = panel["iso"].map(class_by_iso).map(income_group_by_class)

In [46]:
# Write the finished panel, with the year span in the file name.
panel_path = f"{loc_out}ENSO_Growth_Panel_{y1}-{y2}.csv"
panel.to_csv(panel_path)