# Project II: Economic Growth 

This notebook will help you get started with analyzing the growth dataset, `growth.csv`.

In [14]:
import pandas as pd 
import numpy as np 
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits import mplot3d
from sklearn import linear_model
from sklearn import datasets

%matplotlib inline
#plt.style.use('ggplot')

## Read data 

In [15]:
# Load the growth dataset and the accompanying table of variable labels
# (the label table is indexed by its 'variable' column).
dat = pd.read_csv('growth.csv')
lbldf = pd.read_csv('labels.csv', index_col='variable')

# Dictionary view of the 'label' column, keyed by the label table's index.
lbl_all = lbldf['label'].to_dict()

print(f'The data contains {dat.shape[0]} rows (countries) and {dat.shape[1]} columns (variables).')

The data contains 214 rows (countries) and 85 columns (variables).


# Descriptive plots

In [None]:
# Scatter of `gdp_growth` against `lgdp_initial`.
ax = dat.plot.scatter(x='lgdp_initial', y='gdp_growth')
# Label the axes so the figure can be read on its own. The x-label is looked up
# in the label dictionary, falling back to the raw variable name if it has no entry.
ax.set_xlabel(lbl_all.get('lgdp_initial', 'lgdp_initial'))
ax.set_ylabel('Annual growth in GDP per capita, 1970-2020');

Text(0, 0.5, 'Annual growth in GDP per capita, 1970-2020')

In [17]:
import seaborn as sns
# Scatter of `gdp_growth` against `lgdp_initial`, with points coloured by `malfal`.
sns.scatterplot(data=dat, x='lgdp_initial', y='gdp_growth', hue='malfal');

# Collections of variables

In order to make the analysis simpler, it may be convenient to collect variables in sets that belong together naturally. 

In [18]:
# Named groups of variables, one list per theme.
vv_institutions = ['marketref', 'dem', 'demCGV', 'demBMR', 'demreg']
vv_geography = [
    'tropicar', 'distr', 'distcr', 'distc', 'suitavg', 'temp', 'suitgini',
    'elevavg', 'elevstd', 'kgatr', 'precip', 'area', 'abslat', 'cenlong',
    'area_ar', 'rough', 'landlock',
    # 'europe' is the reference
    'africa', 'asia', 'oceania', 'americas',
]
vv_geneticdiversity = ['pdiv', 'pdiv_aa', 'pdivhmi', 'pdivhmi_aa']
# these are often missing and therefore left out: ['pd1', 'pop1']
vv_historical = ['pd1000', 'pd1500', 'pop1000', 'pop1500', 'ln_yst']
vv_religion = ['pprotest', 'pcatholic', 'pmuslim']
vv_danger = ['yellow', 'malfal', 'uvdamage']
vv_resources = ['oilres', 'goldm', 'iron', 'silv', 'zinc']
# secondary, tertiary: we exclude 'lp_bl' (primary) to avoid rank failure
vv_educ = ['ls_bl', 'lh_bl']

# Group name -> list of variables. Note that vv_educ is not registered here.
vv_all = {
    'institutions': vv_institutions,
    'geography': vv_geography,
    'geneticdiversity': vv_geneticdiversity,
    'historical': vv_historical,
    'religion': vv_religion,
    'danger': vv_danger,
    'resources': vv_resources,
}

# Concatenate every group into one flat list. The right-hand side is evaluated
# before the 'all' key is inserted, so 'all' never contains itself.
list_of_lists = vv_all.values()
vv_all['all'] = sum(list_of_lists, [])


In [19]:
# Keep an intercept column (all ones, one entry per row) in the dataset itself.
dat['constant'] = np.ones(len(dat))

# Simple OLS

In [20]:
# 1. avoiding missings: keep rows where both the outcome and the regressor are observed
I = dat[['gdp_growth', 'lgdp_initial']].notnull().all(axis=1)

# 2. extract dataset: y is a column vector (growth scaled by 100),
#    X holds the constant and lgdp_initial
y = dat.loc[I, 'gdp_growth'].values.reshape((-1,1)) * 100.0
X = dat.loc[I, ['constant','lgdp_initial']].values

# 3. run OLS: solve the normal equations (X'X) b = X'y directly instead of
#    forming the explicit inverse, which is the numerically preferred form
betahat = np.linalg.solve(X.T @ X, X.T @ y)
print(betahat)

[[ 2.24792211]
 [-0.08713416]]


# Adding more controls

In [21]:
# Regressors: three core variables plus the geography and religion groups.
vs = vv_all['geography'] + vv_all['religion']
xs = ['lgdp_initial', 'pop_growth', 'investment_rate'] + vs

# avoiding missings: keep rows where the outcome and every regressor are observed
all_vars = ['gdp_growth'] + xs
I = dat[all_vars].notnull().all(axis=1)

# extract data
X = dat.loc[I, xs].values
y = dat.loc[I,'gdp_growth'].values.reshape((-1,1)) * 100. #easier to read output when growth is in 100%

# add const. (unless this breaks the rank condition)
oo = np.ones((I.sum(),1))
X = np.hstack([X, oo])
xs.append('constant') # we put it in as the last element (all_vars is a separate list and is unaffected)

# check the rank condition: X'X is only invertible if X has full column rank
K = X.shape[1]
rank = np.linalg.matrix_rank(X)
assert rank == K, f'X does not have full rank: rank {rank} with {K} columns'

# compute the OLS estimator by solving the normal equations (X'X) b = X'y;
# this avoids forming the explicit inverse of X'X
betas = np.linalg.solve(X.T @ X, X.T @ y)

In [24]:
# Print the sample mean of the outcome, then show the coefficients as a
# one-column table indexed by regressor name, rounded to three decimals.
print(f'Mean y = {y.mean(): 5.2f}% growth per year')
coef_table = pd.DataFrame(betas[:, 0], index=xs, columns=['β'])
coef_table.round(3)


Mean y =  1.52% growth per year


Unnamed: 0,β
lgdp_initial,-1.118
pop_growth,6.83
investment_rate,0.068
tropicar,-0.411
distr,-0.0
distcr,0.001
distc,-0.003
suitavg,-1.063
temp,-0.204
suitgini,-0.771


In [47]:
# count missing values in each variable of the full dataset
for x in xs:
    missings = dat[x].isnull().sum()
    print(f'{x:15s} has {missings:3d} missings')

# Build the complete-case dataset: keep only the outcome and regressors, drop
# every row with a missing value, and renumber the rows. Selecting columns and
# calling dropna() already returns a new object, so `dat` itself is untouched
# and no up-front copy of the whole frame is needed.
dat_copy = dat[all_vars].dropna().reset_index(drop=True)
# all_vars does not contain the intercept column, so add it again here
dat_copy['constant'] = np.ones((dat_copy.shape[0],))
print(f'Balanced panel data has {dat_copy.shape[0]} rows (countries) and {dat_copy.shape[1]} columns (variables).')


lgdp_initial    has 105 missings
pop_growth      has  16 missings
investment_rate has  35 missings
tropicar        has  55 missings
distr           has  55 missings
distcr          has  55 missings
distc           has  55 missings
suitavg         has  59 missings
temp            has  30 missings
suitgini        has  54 missings
elevavg         has  30 missings
elevstd         has  53 missings
kgatr           has  54 missings
precip          has  30 missings
area            has   6 missings
abslat          has   9 missings
cenlong         has   6 missings
area_ar         has  18 missings
rough           has  30 missings
landlock        has  51 missings
africa          has   6 missings
asia            has   6 missings
oceania         has   6 missings
americas        has   6 missings
pprotest        has  13 missings
pcatholic       has  10 missings
pmuslim         has  10 missings
constant        has   0 missings
constant        has   0 missings
Balanced panel data has 89 rows (countries)

In [48]:
# Repeat the missing-value count on the row-dropped dataset.
for x in xs:
    column = dat_copy[x]
    missings = column.isna().sum()
    print(f'{x:15s} has {missings:3d} missings')

lgdp_initial    has   0 missings
pop_growth      has   0 missings
investment_rate has   0 missings
tropicar        has   0 missings
distr           has   0 missings
distcr          has   0 missings
distc           has   0 missings
suitavg         has   0 missings
temp            has   0 missings
suitgini        has   0 missings
elevavg         has   0 missings
elevstd         has   0 missings
kgatr           has   0 missings
precip          has   0 missings
area            has   0 missings
abslat          has   0 missings
cenlong         has   0 missings
area_ar         has   0 missings
rough           has   0 missings
landlock        has   0 missings
africa          has   0 missings
asia            has   0 missings
oceania         has   0 missings
americas        has   0 missings
pprotest        has   0 missings
pcatholic       has   0 missings
pmuslim         has   0 missings
constant        has   0 missings
constant        has   0 missings
