# Project II: Economic Growth 

This notebook will help you get started analyzing the growth dataset, `growth.csv`.

In [546]:
import pandas as pd 
import numpy as np 
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import sklearn
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
import numpy.linalg as la
from scipy.stats import norm
from sklearn.preprocessing import PolynomialFeatures
import plotly.express as px

%load_ext autoreload
%autoreload 2

# Import the py-file
import assignment_2 as a2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Read data 

In [520]:
# Country-level growth data: one row per country, one column per variable
dat = pd.read_csv('growth.csv')

# Human-readable variable labels, indexed by variable name
lbldf = pd.read_csv('labels.csv', index_col='variable')
lbl_all = lbldf['label'].to_dict()  # mapping: variable name -> label

print(f'The data contains {dat.shape[0]} rows (countries) and {dat.shape[1]} columns (variables).')

The data contains 214 rows (countries) and 85 columns (variables).


# Collections of variables

In order to make the analysis simpler, it may be convenient to collect variables in sets that belong together naturally. 

In [521]:
# Thematic groups of candidate regressors.
# Alternative institutions set that was considered:
#vv_institutions = ['marketref', 'dem', 'demCGV', 'demBMR', 'demreg', 'legor_fr', 'legor_uk']
vv_institutions = ['marketref', 'polity', 'legor_fr', 'legor_uk', 'dem', 'demreg', 'ln_yst']
vv_geography = [
    'tropicar', 'distr', 'distcr', 'distc', 'suitavg', 'temp', 'suitgini', 'elevavg', 'elevstd',
    'kgatr', 'precip', 'area', 'abslat', 'cenlong', 'area_ar', 'rough', 'landlock',
    # continent dummies; 'europe' is the reference category
    'africa', 'asia', 'oceania', 'americas',
]
vv_geneticdiversity = ['pdiv', 'pdiv_aa', 'pdivhmi', 'pdivhmi_aa']
# 'pd1' and 'pop1' are left out because they are often missing
vv_historical = ['gdp_pc_initial', 'lpop_initial']
vv_religion = ['pprotest', 'pcatholic', 'pmuslim']
vv_danger = ['yellow', 'malfal', 'uvdamage']
vv_resources = ['oilres', 'goldm', 'iron', 'silv', 'zinc']
# secondary and tertiary schooling; primary ('lp_bl') is excluded to avoid rank failure
vv_educ = ['ls_bl', 'lh_bl']
vv_econ = ['investment_rate', 'pop_growth']

# Group name -> list of variable names (insertion order matters for 'all' below)
vv_all = dict(
    institutions=vv_institutions,
    geography=vv_geography,
    geneticdiversity=vv_geneticdiversity,
    historical=vv_historical,
    religion=vv_religion,
    danger=vv_danger,
    resources=vv_resources,
    econ=vv_econ,
    educ=vv_educ,
)

# Flatten every group into one list, in group order, then register it under 'all'.
# The flattened list is built completely before the new key is inserted.
list_of_lists = vv_all.values()
flattened = []
for group in list_of_lists:
    flattened.extend(group)
vv_all['all'] = flattened
del flattened

In [522]:
# Intercept regressor: a column of ones, one entry per row of the dataset
dat['constant'] = np.ones(len(dat))

# Our selection of variables

In [523]:
# Keep only countries where both the outcome (gdp_growth) and initial log GDP are observed
dat = dat.dropna(axis=0, how='any', subset=['gdp_growth', 'lgdp_initial'])
print(f'The data contains {dat.shape[0]} rows (countries) and {dat.shape[1]} columns (variables).')

The data contains 102 rows (countries) and 86 columns (variables).


In [524]:
# Working set: identifier, outcome, initial GDP, five thematic groups and the intercept column
vv_selected = [
    'code', 'gdp_growth', 'lgdp_initial',
    *vv_all['institutions'],
    *vv_all['resources'],
    *vv_all['educ'],
    *vv_all['historical'],
    *vv_all['econ'],
    'constant',
]

# Independent copy so later edits do not touch the full dataset
dat_reduced = dat.loc[:, vv_selected].copy()
print(f'The reduced data contains {dat_reduced.shape[0]} rows (countries) and {dat_reduced.shape[1]} columns (variables).')

The reduced data contains 102 rows (countries) and 22 columns (variables).


In [525]:
# Report, per remaining variable, how many countries have a non-missing value
for col, n_obs in dat_reduced.notnull().sum().items():
    print(f'{col}: {n_obs} observations')

code: 102 observations
gdp_growth: 102 observations
lgdp_initial: 102 observations
marketref: 84 observations
polity: 86 observations
legor_fr: 102 observations
legor_uk: 102 observations
dem: 90 observations
demreg: 99 observations
ln_yst: 96 observations
oilres: 97 observations
goldm: 97 observations
iron: 97 observations
silv: 97 observations
zinc: 97 observations
ls_bl: 88 observations
lh_bl: 88 observations
gdp_pc_initial: 102 observations
lpop_initial: 102 observations
investment_rate: 99 observations
pop_growth: 102 observations
constant: 102 observations


In [526]:
# Identify the complete cases: countries with no missing value in any selected variable.
# 'code' is only an identifier, so it is left out of the completeness check.
all_vars = [v for v in vv_selected if v != 'code']

# Boolean mask over the rows of `dat`: True where every variable in all_vars is observed
I_all = dat[all_vars].notnull().all(axis=1)
print(f'The number of countries where all variables are available is {I_all.sum()}.')

# Country codes of the complete cases
countries_with_all_vars = dat.loc[I_all, 'code'].tolist()
print('Countries with all variables available:', countries_with_all_vars)

The number of countries where all variables are available is 71.
Countries with all variables available: ['ARG', 'AUS', 'AUT', 'BEL', 'BEN', 'BOL', 'BRA', 'CAF', 'CHL', 'CHN', 'CIV', 'CMR', 'COL', 'CRI', 'DNK', 'DOM', 'DZA', 'ECU', 'EGY', 'ESP', 'FIN', 'FRA', 'GAB', 'GBR', 'GHA', 'GMB', 'GRC', 'GTM', 'GUY', 'HND', 'IDN', 'IND', 'IRL', 'IRN', 'ITA', 'JAM', 'KEN', 'KOR', 'LKA', 'MAR', 'MEX', 'MLI', 'MMR', 'MRT', 'MYS', 'NER', 'NIC', 'NLD', 'NOR', 'NPL', 'PAK', 'PAN', 'PER', 'PHL', 'PRT', 'PRY', 'RWA', 'SAU', 'SEN', 'SLE', 'SLV', 'SWE', 'TGO', 'THA', 'TUN', 'TUR', 'URY', 'USA', 'ZAF', 'ZMB', 'ZWE']


In [527]:
# Restrict the working sample to complete cases: drop every row with at least one missing value
dat_reduced = dat_reduced.dropna(axis=0, how='any')
print(f'The reduced data contains {dat_reduced.shape[0]} rows (countries) and {dat_reduced.shape[1]} columns (variables).')

The reduced data contains 71 rows (countries) and 22 columns (variables).


In [533]:
# Remove the country identifier so only numeric variables remain for the analysis.
# errors='ignore' keeps this cell idempotent: re-running it after 'code' has already
# been dropped is a no-op instead of raising a KeyError.
dat_reduced = dat_reduced.drop(columns=['code'], errors='ignore')

# Simple OLS

In [534]:
# Inventory of the columns left in the cleaned sample
new_vars = list(dat_reduced.columns)
print(f'Number of variables in the reduced data: {len(new_vars)}')

# Row mask over dat_reduced: True where every remaining variable is observed
I = dat_reduced[new_vars].notnull().all(axis=1)

# Regressors: every column except the outcome
xs = [name for name in new_vars if name != 'gdp_growth']

# Design matrix and outcome as numpy arrays; growth is scaled by 100 so it reads in percent
X = dat_reduced.loc[I, xs].to_numpy()
y = 100. * dat_reduced.loc[I, 'gdp_growth'].to_numpy().reshape(-1, 1)

# OLS needs X'X to be invertible, i.e. X must have full column rank
K = X.shape[1]
assert np.linalg.matrix_rank(X) == K, 'X does not have full rank'

Number of variables in the reduced data: 21


In [535]:
# Compute the OLS estimator by solving the normal equations (X'X) b = X'y.
# Solving the linear system avoids forming the explicit inverse of X'X, which is
# less accurate when regressors are on very different scales or nearly collinear.
# The result has shape (K, 1), one coefficient per column of X.
betas = np.linalg.solve(X.T @ X, X.T @ y)

In [536]:
# Summary: average outcome, then the coefficient vector labelled by regressor name
print(f'Mean y = {y.mean(): 5.2f}% growth per year')
pd.Series(betas[:, 0], index=xs, name='β').to_frame().round(3)

Mean y =  1.56% growth per year


Unnamed: 0,β
lgdp_initial,-0.522
marketref,-0.003
polity,0.006
legor_fr,-0.852
legor_uk,-1.007
dem,0.684
demreg,-0.724
ln_yst,0.309
oilres,0.0
goldm,-0.031


# Prepare data

In [None]:
# Treatment variable d: initial log GDP as an (n, 1) column
d = dat_reduced.loc[I, ['lgdp_initial']].to_numpy()

# Controls Z: every column except the outcome, the treatment and the intercept column
Z = dat_reduced.loc[I].drop(columns=['gdp_growth', 'lgdp_initial', 'constant']).to_numpy()

# Expand the controls with all degree-2 terms (squares and pairwise interactions).
# include_bias=False: no column of ones is generated by the expansion.
Z = PolynomialFeatures(
    degree=2,
    include_bias=False,
).fit_transform(Z)

# Report how many control columns the expansion produced
print("The number of regressors in Z is {}".format(Z.shape[1]))

The number of regressors in Z is 189


In [542]:
# Full regressor matrix: treatment column first, then the expanded controls
X = np.concatenate([d, Z], axis=1)

# Outcome for the long regression: growth in percent (pandas Series)
g_LR = 100 * dat_reduced.loc[I, 'gdp_growth']

# NOTE: from here on `y` no longer holds growth. It is rebound to initial log GDP
# (the same variable as `d`, but as a pandas Series), and it is this `y` that the
# later cells pass to the a2 helper functions.
y = dat_reduced.loc[I, 'lgdp_initial']

# Sample size
N = len(X)

In [539]:
# Sanity check on dimensions of the regressor matrix, the controls and the treatment
for template, arr in (
    ("The shape of X is {}", X),
    ("The shape of Z is {}", Z),
    ("The shape of d is {}", d),
):
    print(template.format(arr.shape))

The shape of X is (71, 190)
The shape of Z is (71, 189)
The shape of d is (71, 1)


In [540]:
# Columns with zero standard deviation carry no information and cannot be standardized
# (division by zero), so they are removed first.
stds = X.std(axis=0)
nonconstant_cols = stds != 0
X = X[:, nonconstant_cols]
Z = Z[:, Z.std(axis=0) != 0]  # clean Z separately as well
d = d[:, d.std(axis=0) != 0]  # should not be necessary for a single column

print(f"Removed {(~nonconstant_cols).sum()} constant columns.")
print("Number of columns after cleaning:", X.shape[1])

# Standardize data (function in py-file); applied to X, Z and y in that order
X_stan, Z_stan, y_stan = (a2.standardize(arr) for arr in (X, Z, y))

# NaNs here would indicate a constant column that slipped through the cleaning above
print("Number of NaNs after standardization:", np.isnan(X_stan).sum())

Removed 1 constant columns.
Number of columns after cleaning: 189
Number of NaNs after standardization: 0


# Post-Double-Lasso using CV

In [543]:
# Post-Double-Lasso with the 'CV' penalty method (function in py-file).
# `y` is the initial log GDP series and `g_LR` is growth in percent at this point.
# Presumably the positional order is (controls, treatment, all regressors, outcome);
# confirm against assignment_2.py.
# The result names are shared with the other penalty cells, so each run overwrites them.
(alpha_PDL, se_PDL, CI_low_PDL, CI_high_PDL) = a2.post_double_lasso_analysis(
    Z_stan,
    y,
    X_stan,
    g_LR,
    penalty_method='CV',
)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

The first-stage penalty is =  0.06
The second-stage penalty is =  0.058
The number of non-zero coefficients in the first stage is 20
The number of non-zero coefficients in the second stage is = 17
beta_PDL_hat =  -0.514
SE for beta_PDL_hat  0.205
Confidence interval for alpha =  (-0.915, -0.112)


  model = cd_fast.enet_coordinate_descent(


# Post-Double-Lasso using BCCH-penalty

In [544]:
# Post-Double-Lasso with the 'BCCH' penalty method (function in py-file).
# `y` is the initial log GDP series and `g_LR` is growth in percent at this point.
# The result names are shared with the other penalty cells, so each run overwrites them.
(alpha_PDL, se_PDL, CI_low_PDL, CI_high_PDL) = a2.post_double_lasso_analysis(
    Z_stan,
    y,
    X_stan,
    g_LR,
    penalty_method='BCCH',
)

The first-stage penalty is =  1.102
The second-stage penalty is =  1.46
The number of non-zero coefficients in the first stage is 1
The number of non-zero coefficients in the second stage is = 0
beta_PDL_hat =  -0.193
SE for beta_PDL_hat  0.161
Confidence interval for alpha =  (-0.509, 0.123)


# Post-Double-Lasso using BRT

In [548]:
# Post-Double-Lasso with the 'BRT' penalty method (function in py-file).
# `y` is the initial log GDP series and `g_LR` is growth in percent at this point.
# The result names are shared with the other penalty cells, so each run overwrites them.
(alpha_PDL, se_PDL, CI_low_PDL, CI_high_PDL) = a2.post_double_lasso_analysis(
    Z_stan,
    y,
    X_stan,
    g_LR,
    penalty_method='BRT',
)

The first-stage penalty is =  0.658
The second-stage penalty is =  0.691
The number of non-zero coefficients in the first stage is 2
The number of non-zero coefficients in the second stage is = 2
beta_PDL_hat =  -0.26
SE for beta_PDL_hat  0.244
Confidence interval for alpha =  (-0.738, 0.219)


# Lasso Path

In [None]:
def plot_lasso_path(penalty_grid, coefs, legends, vlines: dict = None):
    """
    Plot the Lasso coefficient estimates as a function of the penalty parameter.

    Parameters:
    penalty_grid (array-like): The penalty values, one per row of `coefs`.
    coefs (array-like): 2-D collection of estimated coefficients; row i holds the
        estimates obtained with penalty_grid[i], column j the j-th coefficient.
    legends (list): Labels for the coefficients; legends[j] labels column j.
    vlines (dict, optional): Vertical reference lines to add. Keys are the names
        written next to the lines, values the penalty at which each line is drawn.

    Returns:
    None. The figure is shown and then closed.

    Raises:
    ValueError: If `coefs` cannot be interpreted as a 2-D array.
    """
    # Convert once instead of rebuilding the array for every plotted coefficient
    coef_matrix = np.asarray(coefs)
    if coef_matrix.ndim != 2:
        raise ValueError(
            f'coefs must be 2-dimensional (penalties x coefficients), got shape {coef_matrix.shape}'
        )

    # Initiate figure
    fig, ax = plt.subplots()

    # One line per coefficient, traced across the penalty grid
    handles = []
    for j in range(coef_matrix.shape[1]):
        line, = ax.plot(penalty_grid, coef_matrix[:, j], label=legends[j])
        handles.append(line)

    # Penalties span several orders of magnitude, so use a log-scaled x-axis
    ax.set_xscale('log')

    # Raw strings keep the LaTeX backslashes from being read as string escapes
    ax.set_xlabel(r'Penalty, $\lambda$')
    ax.set_ylabel(r'Estimates, $\widehat{\beta}_j(\lambda)$')
    ax.set_title('Lasso Path')

    # Legend placed to the right of the axes so it does not cover the paths
    ax.legend(handles, list(legends)[:len(handles)], loc=(1.04, 0))

    # Add vertical lines
    if vlines is not None:
        # x in data coordinates, y as a fraction of the axes height: the label stays
        # inside the plot whatever the scale of the coefficients is
        label_transform = ax.get_xaxis_transform()
        for name, penalty in vlines.items():
            ax.axvline(x=penalty, linestyle='--', color='grey')
            ax.text(penalty, 0.02, name, rotation=90, va='bottom', transform=label_transform)

    # Display plot, then release this figure
    plt.show()
    plt.close(fig)

  plt.xlabel('Penalty, $\lambda$')


In [None]:
# Log-spaced grid of 50 candidate penalties between 0.01 and 80000
penalty_grid = np.geomspace(start=0.01, stop=80000, num=50)
print(penalty_grid)

# BRT penalty computed with the function in the py-file
brt_penalty = a2.BRT(X_stan, y)
print(f'BRT penalty: {brt_penalty}')



[1.00000000e-02 1.38318219e-02 1.91319296e-02 2.64629443e-02
 3.66030732e-02 5.06287188e-02 7.00287420e-02 9.68625086e-02
 1.33978497e-01 1.85316670e-01 2.56326717e-01 3.54546549e-01
 4.90402471e-01 6.78315963e-01 9.38234557e-01 1.29774933e+00
 1.79502375e+00 2.48284488e+00 3.43422681e+00 4.75016136e+00
 6.57033858e+00 9.08797528e+00 1.25703255e+01 1.73870504e+01
 2.40494584e+01 3.32647824e+01 4.60112545e+01 6.36419477e+01
 8.80284084e+01 1.21759327e+02 1.68415332e+02 2.32949087e+02
 3.22211027e+02 4.45676554e+02 6.16451870e+02 8.52665247e+02
 1.17939138e+03 1.63131315e+03 2.25640329e+03 3.12101684e+03
 4.31693490e+03 5.97110746e+03 8.25912948e+03 1.14238808e+04
 1.58013084e+04 2.18560883e+04 3.02309521e+04 4.18149144e+04
 5.78376448e+04 8.00000000e+04]
BRT penalty: 0.6577989601371024
