In [1]:
import pandas as pd
import numpy as np
from linearmodels.panel import PanelOLS
from linearmodels.panel import compare

pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv('growthdata_public.csv')
print(f'shape: {df.shape}')
print(f'countries: {df['country'].nunique()}')
print(f'periods: {sorted(df['period'].unique())}')


df = df.sort_values(['country', 'period'])
df = df.set_index(['country', 'period'])

shape: (2321, 113)
countries: 211
periods: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10)]


# generate required variables

In [3]:
#lagged dependent variable 
df['lagdependent'] = df.groupby('country')['lrgdpna_pc'].shift(1)

#first diff of log terms of trade 
df['dltot'] = df.groupby('country')['ltot'].diff()

#change in urban pop share 
df['durbanpop'] = df.groupby('country')['urbanpop'].diff()

#quared export diversification index
df['lEDI_ipol_sq'] = df['lEDI_ipol'] ** 2

#change in gini coeff 
df['dgini'] = df.groupby('country')['gini_mkt'].diff()

# rename for clarity
df['infra_index'] = df['infrastructure_index']

# define variables for each model

In [4]:
small_vars = [
    'lagdependent', 
    'linflation_na', 
    'lrer', 
    'ltraderesid', 
    'infra_index', 
    'dum_fincrisis', 
    'sd_temperature', 
    'dltot', 
    'lkg', 
    'sd_growth', 
    'durbanpop'
]

medium_vars = small_vars + [
    'lcredit',
    'lFDIstock_ipol',
    'lEDI_ipol',
    'lEDI_ipol_sq',
    'actotal'
]

large_vars = medium_vars + [
    'lhc', 
    'dgini'
]

# estimation function

In [5]:
def run_model(data, dep_var, indep_vars, model_name):
    ''' 
    run fixed effects panel regression with period dummies and clustered SEs 
    ''' 
    #select cols and drop na
    cols = [dep_var] + indep_vars
    subset = data[cols].dropna().copy()

    #create period dummies from index
    subset = subset.reset_index()
    period_dummies = pd.get_dummies(subset['period'], prefix='period', drop_first=True)
    subset = pd.concat([subset, period_dummies], axis=1)
    subset = subset.set_index(['country', 'period'])

    period_cols = [c for c in subset.columns if c.startswith('period_')]

    #prep y and X
    y = subset[dep_var]
    X = subset[indep_vars + period_cols].copy()
    X['const'] = 1

    #estimate with entity fixed fx and clustered SEs
    model = PanelOLS(y, X, entity_effects=True, drop_absorbed=True)
    res = model.fit(cov_type='clustered', cluster_entity=True)

    print(f'{"="*70}')
    print(f'{model_name}')
    print(f'{"="*70}')
    print(f'observations: {int(res.nobs)}')
    print(f'countries: {int(res.entity_info["total"])}')
    print(f'r2 (within): {res.rsquared_within:.3f}')
    print(f'\ncoefficients:')
    print(f'{"-"*70}')

    for var in indep_vars + ['const']:
        if var in res.params.index:
            coef = res.params[var]
            se = res.std_errors[var]
            pval = res.pvalues[var]

            stars = '' 
            if pval < 0.01:
                stars = '***' 
            elif pval < 0.05:
                stars = '**' 
            elif pval < 0.1:
                stars = '*' 
            print(f'{var:20s}: {coef:10.4f}{stars:4s} ({se:.4f})')
    return res 

# run all three models (for task c)

In [6]:
results_small = run_model(df, 'lrgdpna_pc', small_vars, 'SMALL MODEL (COL 1)')
results_medium = run_model(df, 'lrgdpna_pc', medium_vars, 'MEDIUM MODEL (COL 2)')
results_large = run_model(df, 'lrgdpna_pc', large_vars, 'LARGE MODEL (COL 3)')

SMALL MODEL (COL 1)
observations: 1507
countries: 168
r2 (within): 0.897

coefficients:
----------------------------------------------------------------------
lagdependent        :     0.7985***  (0.0241)
linflation_na       :    -0.2282***  (0.0424)
lrer                :     0.0030     (0.0027)
ltraderesid         :     0.0819***  (0.0261)
infra_index         :     0.0819***  (0.0159)
dum_fincrisis       :    -0.0467***  (0.0090)
sd_temperature      :    -0.0235     (0.0198)
dltot               :    -0.0256     (0.0398)
lkg                 :    -0.0469**   (0.0184)
sd_growth           :    -0.6218***  (0.2357)
durbanpop           :     0.0104***  (0.0040)
const               :     1.9220***  (0.2019)
MEDIUM MODEL (COL 2)
observations: 967
countries: 149
r2 (within): 0.894

coefficients:
----------------------------------------------------------------------
lagdependent        :     0.8022***  (0.0284)
linflation_na       :    -0.1108*    (0.0599)
lrer                :     0.0052**   (

# model extension (regularized regression - task d)

In [7]:
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.preprocessing import StandardScaler

In [8]:
def prepare_panel_for_regularization(data, dep_var, indep_vars):
    ''' 
    prepare panel data fro regularized regression by: 
    1. selecting complete cases 
    2. applying within-transformation (demeaning by country) to handle fixed effects 
    3. standardizing vaiables for proper penalization 
    ''' 

    cols = [dep_var] + indep_vars
    subset = data[cols].dropna().copy()

    #within trans: demean by country (eqv to fixed fx)
    subset_demeaned = subset.groupby('country').transform(lambda x: x - x.mean())

    #add period dummies (dont demean these, they handle time fixed fx)
    subset_reset = subset.reset_index()
    period_dummies = pd.get_dummies(subset_reset['period'], prefix='period', drop_first=True)
    period_dummies.index = subset_demeaned.index 

    #prep y and X
    y = subset_demeaned[dep_var].values 
    X_vars = subset_demeaned[indep_vars]
    X = pd.concat([X_vars, period_dummies], axis=1)

    #standardize X (important for reg, puts all vars on same scale)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    return X_scaled, y, X.columns.tolist(), scaler, subset_demeaned[indep_vars]

In [9]:
def run_regularized_model(data, dep_var, indep_vars, model_name, method='ridge'):
    ''' 
    run regularized regression with CV to select penalty param 
    methods: one of ridge, lasso, elasticnet 
    ''' 

    X_scaled, y, col_names, scaler, X_unstandardized = prepare_panel_for_regularization(
        data, dep_var, indep_vars
    )

    #choose reg method 
    if method == 'ridge':
        model = RidgeCV(alphas=np.logspace(-4, 4, 50), cv=5)
    elif method == 'lasso': 
        model = LassoCV(alphas=np.logspace(-4, 1, 50), cv=5, max_iter=10_000)
    elif method == 'elasicnet':
        model = ElasticNetCV(alphas=np.logspace(-4, 1, 50), l1_ratio=[0.1, 0.5, 0.9], cv=5, max_iter=10_000)

    model.fit(X_scaled, y)

    #get coefficients (for the non-period-dummy vars)
    n_vars = len(indep_vars)
    coefs_scaled = model.coef_[:n_vars]

    #convert back to original scale for interpretability
    #beta_original = b_scaled * (sig_y / sig_x)
    y_std = np.std(y)
    x_stds = scaler.scale_[:n_vars]
    coefs_original = coefs_scaled * (y_std / x_stds)

    print(f'{"="*70}')
    print(f'{model_name} ({method.upper()})')
    print(f'{"="*70}')
    print(f'optimal alpha (penalty): {model.alpha_:.4f}')
    print(f'r2: {model.score(X_scaled, y):.3f}')
    print(F'\ncoefficients (standardized - shows relative importance):')
    print(f'{"-"*70}')

    #sort by abs value to show most important 
    coef_df = pd.DataFrame({
        'variable' : indep_vars,
        'coef_standardized' : coefs_scaled,
        'coef_original_scale' : coefs_original
    })

    for _, row in coef_df.iterrows():
        print(f'{row["variable"]:20s}: {row["coef_standardized"]:10.4f} (original scale: {row["coef_original_scale"]:.4f})')

    return model, coef_df 

# run ridge regression for all three models 

In [10]:
print(f'RIDGE REGRESSION RESULTS')
print(f'{"="*70}')

ridge_small, coef_small = run_regularized_model(
    df, 'lrgdpna_pc', small_vars, 'SMALL MODEL', method='ridge'
)
ridge_medium, coef_medium = run_regularized_model(
    df, 'lrgdpna_pc', medium_vars, 'MEDIUM MODEL', method='ridge'
)
ridge_large, coef_large = run_regularized_model(
    df, 'lrgdpna_pc', large_vars, 'LARGE MODEL', method='ridge'
)

RIDGE REGRESSION RESULTS
SMALL MODEL (RIDGE)
optimal alpha (penalty): 1.7575
r2: 0.897

coefficients (standardized - shows relative importance):
----------------------------------------------------------------------
lagdependent        :     0.2819 (original scale: 0.2754)
linflation_na       :    -0.0271 (original scale: -0.0791)
lrer                :     0.0074 (original scale: 0.0010)
ltraderesid         :     0.0226 (original scale: 0.0283)
infra_index         :     0.0450 (original scale: 0.0285)
dum_fincrisis       :    -0.0178 (original scale: -0.0162)
sd_temperature      :    -0.0037 (original scale: -0.0082)
dltot               :    -0.0029 (original scale: -0.0089)
lkg                 :    -0.0141 (original scale: -0.0162)
sd_growth           :    -0.0206 (original scale: -0.2147)
durbanpop           :     0.0144 (original scale: 0.0036)
MEDIUM MODEL (RIDGE)
optimal alpha (penalty): 1.2068
r2: 0.893

coefficients (standardized - shows relative importance):
-------------------

# compare OLS vs ridge

In [11]:
def compare_ols_ridge(ols_results, ridge_coef_df, indep_vars, model_name):
    ''' 
    create side-by-side comparison of OLS and ridge coeff 
    ''' 

    comparison = []
    for var in indep_vars:
        ols_coef = ols_results.params[var] if var in ols_results.params.index else np.nan
        ols_se = ols_results.std_errors[var] if var in ols_results.std_errors.index else np.nan
        ols_pval = ols_results.pvalues[var] if var in ols_results.pvalues.index else np.nan

        ridge_row = ridge_coef_df[ridge_coef_df['variable'] == var]
        ridge_coef = ridge_row['coef_original_scale'].values[0] if len(ridge_row) > 0 else np.nan

        #calc shrinkage 
        if not np.isnan(ols_coef) and not np.isnan(ridge_coef) and ols_coef != 0:
            shrinkage = (1 - abs(ridge_coef) / abs(ols_coef)) * 100
        else:
            shrinkage = np.nan 

        comparison.append({
            'variable' : var,
            'OLS': f'{ols_coef:.4f}',
            'OLS_SE': f'({ols_se:.4f})',
            'ridge': f'{ridge_coef:.4f}',
            'shrinkage %': f'{shrinkage:.1f}%' if not np.isnan(shrinkage) else 'N/A'
        })

    comp_df = pd.DataFrame(comparison)
    print(f'\n{"="*70}')
    print(f'COMPARISON: OLS vs RIDGE - {model_name}')
    print(f'{"="*70}')
    print(comp_df.to_string(index=False))

    return comp_df 


In [12]:
comp_small = compare_ols_ridge(results_small, coef_small, small_vars, 'SMALL MODEL')
comp_medium = compare_ols_ridge(results_medium, coef_medium, medium_vars, 'MEDIUM MODEL')
comp_large = compare_ols_ridge(results_large, coef_large, large_vars, 'LARGE MODEL')


COMPARISON: OLS vs RIDGE - SMALL MODEL
      variable     OLS   OLS_SE   ridge shrinkage %
  lagdependent  0.7985 (0.0241)  0.2754       65.5%
 linflation_na -0.2282 (0.0424) -0.0791       65.3%
          lrer  0.0030 (0.0027)  0.0010       65.1%
   ltraderesid  0.0819 (0.0261)  0.0283       65.4%
   infra_index  0.0819 (0.0159)  0.0285       65.2%
 dum_fincrisis -0.0467 (0.0090) -0.0162       65.3%
sd_temperature -0.0235 (0.0198) -0.0082       65.4%
         dltot -0.0256 (0.0398) -0.0089       65.2%
           lkg -0.0469 (0.0184) -0.0162       65.4%
     sd_growth -0.6218 (0.2357) -0.2147       65.5%
     durbanpop  0.0104 (0.0040)  0.0036       65.6%

COMPARISON: OLS vs RIDGE - MEDIUM MODEL
      variable     OLS   OLS_SE   ridge shrinkage %
  lagdependent  0.8022 (0.0284)  0.2443       69.5%
 linflation_na -0.1108 (0.0599) -0.0352       68.2%
          lrer  0.0052 (0.0024)  0.0015       71.5%
   ltraderesid  0.1214 (0.0453)  0.0364       70.0%
   infra_index  0.0723 (0.0207)  0.

# lasso regression for variable selection perspective

In [13]:
lasso_large, lasso_coef_large = run_regularized_model(
    df, 'lrgdpna_pc', large_vars, 'LARGE MODEL', method='lasso'
)

LARGE MODEL (LASSO)
optimal alpha (penalty): 0.0017
r2: 0.920

coefficients (standardized - shows relative importance):
----------------------------------------------------------------------
lagdependent        :     0.1964 (original scale: 0.1920)
linflation_na       :    -0.0122 (original scale: -0.0318)
lrer                :     0.0207 (original scale: 0.0024)
ltraderesid         :     0.0133 (original scale: 0.0251)
infra_index         :     0.0376 (original scale: 0.0248)
dum_fincrisis       :    -0.0084 (original scale: -0.0059)
sd_temperature      :    -0.0049 (original scale: -0.0088)
dltot               :    -0.0025 (original scale: -0.0064)
lkg                 :     0.0035 (original scale: 0.0033)
sd_growth           :    -0.0052 (original scale: -0.0512)
durbanpop           :    -0.0051 (original scale: -0.0015)
lcredit             :     0.0062 (original scale: 0.0039)
lFDIstock_ipol      :     0.0093 (original scale: 0.0037)
lEDI_ipol           :    -0.0000 (original scale:

# show which vars lasso keeps vs drops

In [14]:
print("\nlasso var selection (large model):")
print("-" * 50)
for _, row in lasso_coef_large.iterrows():
    status = "KEPT" if abs(row['coef_standardized']) > 0.001 else "DROPPED"
    print(f"{row['variable']:20s}: {row['coef_standardized']:8.4f} [{status}]")



lasso var selection (large model):
--------------------------------------------------
lagdependent        :   0.1964 [KEPT]
linflation_na       :  -0.0122 [KEPT]
lrer                :   0.0207 [KEPT]
ltraderesid         :   0.0133 [KEPT]
infra_index         :   0.0376 [KEPT]
dum_fincrisis       :  -0.0084 [KEPT]
sd_temperature      :  -0.0049 [KEPT]
dltot               :  -0.0025 [KEPT]
lkg                 :   0.0035 [KEPT]
sd_growth           :  -0.0052 [KEPT]
durbanpop           :  -0.0051 [KEPT]
lcredit             :   0.0062 [KEPT]
lFDIstock_ipol      :   0.0093 [KEPT]
lEDI_ipol           :  -0.0000 [DROPPED]
lEDI_ipol_sq        :   0.0006 [DROPPED]
actotal             :  -0.0161 [KEPT]
lhc                 :   0.0128 [KEPT]
dgini               :  -0.0094 [KEPT]


# growth decomposition for norway (periods 8-9)

In [15]:
norway = df.loc['Norway'].copy()

#get coeff from small model 
params = results_small.params

#def vars and names 
variables = {
    'linflation_na': 'Inflation',
    'lrer': 'Real exchange rate',
    'ltraderesid': 'Trade openness',
    'infra_index': 'Infrastructure',
    'dum_fincrisis': 'Financial crisis',
    'sd_temperature': 'Climate change',
    'dltot': 'Terms of trade changes',
    'lkg': 'Government consumption',
    'sd_growth': 'Growth volatility',
    'durbanpop': 'Change in urban pop.'
}

period_7 = norway.loc[7]
period_8 = norway.loc[8]
period_9 = norway.loc[9]

growth_7_8 = period_8['lrgdpna_pc'] - period_7['lrgdpna_pc']
actual_growth = period_9['lrgdpna_pc'] - period_8['lrgdpna_pc']


In [16]:
contributions = []
for var, name in variables.items():
    coef = params[var]
    val_8 = period_8[var]
    val_9 = period_9[var]
    diff = val_9 - val_8
    contribution = coef * diff
    
    contributions.append({
        'Variable': name,
        'var_code': var,
        'Parameter': coef,
        'Period_8': val_8,
        'Period_9': val_9,
        'Difference': diff,
        'Contribution': contribution,
        'Contribution_pct': contribution * 100
    })


contrib_df = pd.DataFrame(contributions)

subtotal = contrib_df['Contribution'].sum()

#persistence = theta * g
theta = params['lagdependent']
persistence = theta * growth_7_8

period_8_dummy = params.get('period_8', 0)
period_9_dummy = params.get('period_9', 0)
period_dummy_contrib = period_9_dummy - period_8_dummy

predicted_growth = subtotal + persistence + period_dummy_contrib
residual = actual_growth - predicted_growth 

In [17]:
print("=" * 90)
print("table 3")
print("=" * 90)
print(f"{'variable':<30} {'(1)':<10} {'(2)':<12} {'(3)':<12} {'(4)':<12} {'(5)':<12}")
print(f"{'':<30} {'parameter':<10} {'period 8':<12} {'period 9':<12} {'diff':<12} {'contrib.':<12}")
print("-" * 90)

for _, row in contrib_df.iterrows():
    print(f"{row['Variable']:<30} {row['Parameter']:<10.3f} {row['Period_8']:<12.3f} {row['Period_9']:<12.3f} {row['Difference']:<12.3f} {row['Contribution_pct']:<11.2f}%")

print("-" * 90)
print(f"{'SUBTOTAL':<78} {subtotal*100:.2f}%")
print(f"{'+ persistence':<30} {theta:<10.3f} {'(growth 7-8)':<24} {growth_7_8:<12.4f} {persistence*100:.2f}%")
print(f"{'+ period dummy':<30} {'(=1)':<10} {period_8_dummy:<12.4f} {period_9_dummy:<12.4f} {period_dummy_contrib:<12.4f} {period_dummy_contrib*100:.2f}%")
print("-" * 90)
print(f"{'SUM (predicted growth, 5-year)':<78} {predicted_growth*100:.2f}%")
print(f"{'ANNUALIZED predicted':<78} {predicted_growth*100/5:.2f}% p.a.")
print("-" * 90)
print(f"{'ACTUAL growth (5-year)':<78} {actual_growth*100:.2f}%")
print(f"{'ANNUALIZED actual':<78} {actual_growth*100/5:.2f}% p.a.")
print(f"{'RESIDUAL':<78} {residual*100:.2f}%")
print("=" * 90)


table 3
variable                       (1)        (2)          (3)          (4)          (5)         
                               parameter  period 8     period 9     diff         contrib.    
------------------------------------------------------------------------------------------
Inflation                      -0.228     0.020        0.017        -0.003       0.07       %
Real exchange rate             0.003      -1.816       -1.634       0.182        0.05       %
Trade openness                 0.082      0.836        0.762        -0.074       -0.60      %
Infrastructure                 0.082      1.014        1.069        0.056        0.46       %
Financial crisis               -0.047     0.000        0.000        0.000        -0.00      %
Climate change                 -0.024     7.228        7.789        0.561        -1.32      %
Terms of trade changes         -0.026     0.014        0.011        -0.003       0.01       %
Government consumption         -0.047     -2.190       

# export data for excel graphing (matplotlib is too hard ðŸ¥°)

In [18]:
excel_data = pd.DataFrame({
    'Component': ['Contemporaneous changes', 'Persistence', 'Period dummy', 'Residual', 'Actual growth'],
    'Value_5yr_pct': [subtotal*100, persistence*100, period_dummy_contrib*100, residual*100, actual_growth*100],
    'Value_annual_pct': [subtotal*100/5, persistence*100/5, period_dummy_contrib*100/5, residual*100/5, actual_growth*100/5]
})


print(excel_data.to_string(index=False))

excel_data.to_csv('norway_growth_contributions.csv', index=False)
contrib_df.to_csv('norway_decomposition_details.csv', index=False)

              Component  Value_5yr_pct  Value_annual_pct
Contemporaneous changes      -2.026165         -0.405233
            Persistence       5.524862          1.104972
           Period dummy      -3.520896         -0.704179
               Residual      -1.496801         -0.299360
          Actual growth      -1.519000         -0.303800
