In [1]:
from numba import njit, jit

import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from numpy import random
from numpy import linalg
from scipy import sparse
from scipy import stats
import statsmodels.api as sm
import matplotlib
import itertools as it
from matplotlib import rc


import pandas as pd
# Render figure text with LaTeX. Both preamble settings must be plain strings:
# list values were deprecated in Matplotlib 3.3 and are rejected by later
# releases, so the individual preamble lines are joined with newlines.
matplotlib.rcParams['text.usetex'] = True
matplotlib.rcParams['text.latex.preamble'] = '\n'.join([
    r'\usepackage{amssymb}',
    r'\usepackage{amsmath}',
    r'\usepackage{xcolor}',
    r'\renewcommand*\familydefault{\sfdefault}'])
matplotlib.rcParams['pgf.texsystem'] = 'pdflatex'
matplotlib.rcParams['pgf.preamble'] = '\n'.join([
    r'\usepackage[utf8x]{inputenc}',
    r'\usepackage{amssymb}',
    r'\usepackage[T1]{fontenc}',
    r'\usepackage{amsmath}',
    r'\usepackage{sansmath}'])

# Show figures inline and request SVG output. The format call is issued after
# the `%matplotlib inline` magic on the line above it.
from IPython.display import set_matplotlib_formats
%matplotlib inline
set_matplotlib_formats('svg')

import warnings

# NOTE(review): this installs a blanket "ignore" filter, so every warning
# raised later in the session is hidden -- consider narrowing it to the
# specific categories that are known to be noise.
warnings.filterwarnings('ignore')

# Exercise 2


In [2]:
# Load the Exercise 2 data set; the path is relative, so the CSV is looked up
# in the kernel's current working directory.
data = pd.read_csv("ProblemSet1Exercise2data.csv")


### a) 
The regression results show that the effect of current GDP on GDP growth is negative, while life expectancy and the primary school enrollment rate have a positive effect on long-run growth. 

In [3]:
# Part (a): OLS of gamma on an intercept and the three baseline covariates,
# taken in the order in which they appear in the data set.
X = sm.add_constant(
    data[[name for name in data.columns
          if name in ('GDPSH60', 'LIFEE060', 'P60')]]
)
mod = sm.OLS(data['gamma'], X)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                  gamma   R-squared:                       0.288
Model:                            OLS   Adj. R-squared:                  0.221
Method:                 Least Squares   F-statistic:                     4.313
Date:                Fri, 24 Apr 2020   Prob (F-statistic):             0.0116
Time:                        15:48:43   Log-Likelihood:                 106.24
No. Observations:                  36   AIC:                            -204.5
Df Residuals:                      32   BIC:                            -198.2
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0615      0.024      2.534      0.0

### b)
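When we use all covariates in the dataset, we obtain a perfect fit with R-squared = 1. This fit is mechanical: the model uses up all 36 observations (zero residual degrees of freedom), so the standard errors are undefined and the coefficient estimates are not reliable. GDPSH60, LIFEE060, and P60 flip sign compared to the results found in (a). 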

In [4]:
# Part (b): OLS of gamma on an intercept and every column of the data set
# other than `code`, `country` and `gamma` itself.
X = sm.add_constant(
    data[[name for name in data.columns
          if name not in ('code', 'country', 'gamma')]]
)
mod = sm.OLS(data['gamma'], X)
res = mod.fit()
print(res.summary())


                            OLS Regression Results                            
Dep. Variable:                  gamma   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                    nan
Method:                 Least Squares   F-statistic:                       nan
Date:                Fri, 24 Apr 2020   Prob (F-statistic):                nan
Time:                        15:48:43   Log-Likelihood:                 1076.4
No. Observations:                  36   AIC:                            -2081.
Df Residuals:                       0   BIC:                            -2024.
Df Model:                          35                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0138        inf          0        n

### c)
We find that YrsOpen and PRIEXP70 are the two covariates that give the highest R-squared when they are added to the baseline covariates GDPSH60, LIFEE060, and P60. The signs of the baseline covariates are the same as those obtained in part (a). The signs of the coefficients on the newly added covariates show that the length of trade openness affects long-run growth positively, while the share of primary exports affects it negatively. These signs are plausible: being open to trade is growth-improving, whereas countries that mainly export raw materials suffer from declining terms of trade, which can hinder growth. 

In [5]:
# Part (c): for every pair of additional covariates, regress gamma on an
# intercept, the pair and the three baseline covariates, and record the R^2.
baseline_cols = ['GDPSH60', 'LIFEE060', 'P60']
candidate_cols = data.columns[
    ~data.columns.isin(['code', 'country', 'gamma'] + baseline_cols)
]

# Build the pairs once and iterate over that same list, so the index returned
# by argmax below is guaranteed to refer to the matching entry of `comb`.
comb = list(it.combinations(candidate_cols, 2))
rsq = np.zeros(shape=(len(comb), 1))
for count, pair in enumerate(comb):
    # Same column order as before: the pair first, then the baseline covariates.
    X = sm.add_constant(data[list(pair) + baseline_cols])
    modb = sm.OLS(data.gamma, X)
    resb = modb.fit()
    rsq[count] = resb.rsquared

maxrsq = np.argmax(rsq)
comb[maxrsq]

('YrsOpen', 'PRIEXP70')

In [6]:
# Refit the model for the best pair found above: gamma on an intercept, the
# selected pair and the three baseline covariates (columns in data-set order).
X = sm.add_constant(
    data[[name for name in data.columns
          if name in comb[maxrsq] or name in ('GDPSH60', 'LIFEE060', 'P60')]]
)
mod = sm.OLS(data['gamma'], X)
res = mod.fit()
print(res.summary())




                            OLS Regression Results                            
Dep. Variable:                  gamma   R-squared:                       0.703
Model:                            OLS   Adj. R-squared:                  0.654
Method:                 Least Squares   F-statistic:                     14.21
Date:                Fri, 24 Apr 2020   Prob (F-statistic):           3.70e-07
Time:                        15:48:47   Log-Likelihood:                 121.99
No. Observations:                  36   AIC:                            -232.0
Df Residuals:                      30   BIC:                            -222.5
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.1415      0.022      6.499      0.0

In [7]:
# Number of regressions that were run in the pair search.
rsq.size

1653

### d)
In part (c) we run $\frac{58!}{(58-2)!2!}=1653$ different regressions. If we wanted to choose 5 additional variables instead of 2, we would need $\frac{58!}{(58-5)!5!}=4582116$ regressions, and for 6 additional variables we would need $\frac{58!}{(58-6)!6!}=40475358$ regressions. 

# Exercise 3

In [8]:
np.random.seed(10)
def find_smallest_eig(rho,p,N,S):
    """Return the mean smallest eigenvalue of X'X over S simulated designs.

    In each replication an N-by-p matrix X is drawn row-wise from a zero-mean
    multivariate normal whose covariance has ones on the diagonal and ``rho``
    everywhere else; the smallest eigenvalue of X'X is recorded.

    Parameters
    ----------
    rho : float
        Common off-diagonal entry of the covariance matrix.
    p : int
        Number of columns (covariates) of X.
    N : int
        Number of rows (observations) of X.
    S : int
        Number of simulation replications.

    Returns
    -------
    float
        Average of the S smallest eigenvalues.

    Notes
    -----
    Draws come from NumPy's global random state, so results depend on the
    seed set before the call.
    """
    covar_mat = np.full((p, p), rho, dtype=float)
    np.fill_diagonal(covar_mat, 1)
    mean_vec = np.zeros(p)
    smallest_eig = np.zeros(S)
    for i in range(S):
        X = np.random.multivariate_normal(mean_vec, covar_mat, N)
        # X'X is symmetric, so use the symmetric solver: it skips the
        # eigenvectors, always returns real values, and sorts them ascending,
        # which makes the first entry the smallest eigenvalue.
        smallest_eig[i] = np.linalg.eigvalsh(X.T @ X)[0]
    return np.mean(smallest_eig)

# Simulation grid used by the cells below.
N = np.array([100,200,500,1000])   # sample sizes
rho = np.array([0,0.5,0.9])        # off-diagonal covariance values
p = 90                             # number of covariates in the fixed-p design
S = 10                             # replications per (N, rho) cell

### a-b)
The table below shows that the smallest eigenvalue decreases as $\rho$ increases: when collinearity between the variables is high, the variance of our estimates is higher and inverting the matrix may become problematic. Another feature we can observe is that, when the number of covariates is fixed, increasing the sample size yields larger eigenvalues and hence smaller variance. 

In [9]:
# Parts (a)-(b): mean smallest eigenvalue for a fixed number of covariates p;
# rows are sample sizes, columns are values of rho.
eig_mat = np.zeros((np.size(N),np.size(rho)))

for i in range(np.size(N)):
    for j in range(np.size(rho)):
        eig_mat[i,j] = find_smallest_eig(rho[j],p,N[i],S)

# The labels are built from raw strings: in an ordinary literal "\r" is a
# carriage return, so '$\rho = 0$' would silently lose the LaTeX command.
# Deriving them from N, p and rho also keeps them in sync with the grid.
eig_table = pd.DataFrame(
    eig_mat,
    index=["$N = {}, p={}$".format(n_obs, p) for n_obs in N],
    columns=[r"$\rho = {:g}$".format(r) for r in rho],
)

eig_table

Unnamed: 0,$\rho = 0$,$\rho = 0.5$,$\rho = 0.9$
"$N = 100, p=90$",0.36386,0.183166,0.038164
"$N = 200, p=90$",23.659328,11.745444,2.450264
"$N = 500, p=90$",175.541705,86.079697,17.126195
"$N = 1000, p=90$",504.993682,250.633126,50.826839


### c)
When the number of covariates grows in proportion to the sample size (here $p = 0.9N$), the smallest eigenvalue increases with the sample size much more slowly. This shows that increasing the number of covariates acts as an opposing force on the eigenvalues. 

In [10]:
# Part (c): the number of covariates grows proportionally with the sample
# size, p = int(0.9 * N); rows are sample sizes, columns are values of rho.
p_proportional = [int(0.9*n_obs) for n_obs in N]
eig_mat2 = np.zeros((np.size(N),np.size(rho)))

for i in range(np.size(N)):
    for j in range(np.size(rho)):
        eig_mat2[i,j] = find_smallest_eig(rho[j],p_proportional[i],N[i],S)

# The labels are built from raw strings: in an ordinary literal "\r" is a
# carriage return, so '$\rho = 0$' would silently lose the LaTeX command.
# The p shown in each row label is the value actually passed to the simulation.
eig_table2 = pd.DataFrame(
    eig_mat2,
    index=["$N = {}, p={}$".format(n_obs, p_n)
           for n_obs, p_n in zip(N, p_proportional)],
    columns=[r"$\rho = {:g}$".format(r) for r in rho],
)

eig_table2

Unnamed: 0,$\rho = 0$,$\rho = 0.5$,$\rho = 0.9$
"$N = 100, p=90$",0.406568,0.233357,0.040771
"$N = 200, p=180$",0.730736,0.315711,0.067149
"$N = 500, p=450$",1.640309,0.801294,0.145375
"$N = 1000, p=900$",2.809834,1.362788,0.28146


 ### d)
 When the number of variables used in the regression grows logarithmically rather than linearly with the sample size, the increase in the number of variables has a much smaller effect on the eigenvalues.  

In [11]:
# Part (d): the number of covariates grows logarithmically with the sample
# size, p = floor(19.55 * log(N)); rows are sample sizes, columns are rho.
p_logarithmic = [int(np.floor(19.55*np.log(n_obs))) for n_obs in N]
eig_mat3 = np.zeros((np.size(N),np.size(rho)))

for i in range(np.size(N)):
    for j in range(np.size(rho)):
        eig_mat3[i,j] = find_smallest_eig(rho[j],p_logarithmic[i],N[i],S)

# The labels are built from raw strings: in an ordinary literal "\r" is a
# carriage return, so '$\rho = 0$' would silently lose the LaTeX command.
# The p shown in each row label is the value actually passed to the simulation.
eig_table3 = pd.DataFrame(
    eig_mat3,
    index=["$N = {}, p={}$".format(n_obs, p_n)
           for n_obs, p_n in zip(N, p_logarithmic)],
    columns=[r"$\rho = {:g}$".format(r) for r in rho],
)

eig_table3

Unnamed: 0,$\rho = 0$,$\rho = 0.5$,$\rho = 0.9$
"$N = 100, p=90$",0.408932,0.233821,0.040689
"$N = 200, p=103$",17.611569,9.234064,1.729785
"$N = 500, p=121$",135.331114,67.982816,13.407499
"$N = 1000, p=135$",414.023602,206.798231,41.227573
