# Notebook 2: Conducting and Evaluating Regression Analysis

In [15]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import os
import tqdm
import glob
import pandas as pd
import sklearn
from src import ana_utils as utils

#np.set_printoptions(suppress=True)
from sklearn.linear_model import LinearRegression

# Seed NumPy's global random number generator so that any step drawing from
# np.random gives the same result on every run.
np.random.seed(7)

Import datasets that were preprocessed in Notebook 1

In [17]:
# Load the three tables produced by Notebook 1, indexed by country name and
# sorted by that index (chained sort_index() instead of in-place mutation).
wb_data = pd.read_csv("data/wb_data.csv", index_col="Country Name").sort_index()
wb_data_short = pd.read_csv("data/wb_data_short.csv", index_col="Country Name").sort_index()
whr_data = pd.read_csv("data/whr_data.csv", index_col="Country name").sort_index()

# Keep only the life satisfaction ladder score from the WHR data.
whr_scores = whr_data["Ladder score"]

# Sanity check: all three datasets must cover exactly the same countries.
# wb_data_short is included because it is the table that gets split into
# train/test below. Prints a single True/False.
whr_countries = sorted(whr_data.index)
print(all(sorted(frame.index) == whr_countries for frame in (wb_data, wb_data_short)))

True


Split the data into a train and a test set. We choose an 80/20 split, i.e. 120 countries in the training set and 30 countries in the test set.

In [None]:
test_size = 30
train, test, train_gt, test_gt = utils.split_data(wb_data_short, whr_scores, test_size)

# Sanity check: shapes of the four partitions.
print(train.shape, test.shape, train_gt.shape, test_gt.shape)

# Sanity check: data order and ground-truth order (and their indices) match.
print(
    train.index.tolist() == train_gt.index.tolist(),
    test.index.tolist() == test_gt.index.tolist(),
)


(120, 120) (30, 120) (120,) (30,)
True True


## Linear regression


Let's see how linear regression performs on wb_data and wb_data_short (redundant indicators removed). We choose 2000-fold validation after noticing quite some variance for lower n.

In [None]:
# For the full dense indicator data
loss_list, mean_loss, coef_list, avg_coefs = utils.n_fold_ceval(2000, wb_data, whr_scores, 30, "no_scaling")

print("Mean loss:", mean_loss)
print("The average size of the first ten coefficients:", avg_coefs[:10])

Mean loss: 21.477142963153113
The average size of the first ten coefficients: [ 0.0182 -0.0105 -0.     -0.     -0.0025  0.0609 -0.0282  0.0891  0.0918
 -0.0928]


In [None]:
# For the indicators with manually removed redundancies
loss_list, mean_loss, coef_list, avg_coefs = utils.n_fold_ceval(2000, wb_data_short, whr_scores, 30, "no_scaling")

print("Mean loss:", mean_loss)
print("The average size of the first ten coefficients:", avg_coefs[:10])

Mean loss: 5.210962045770421
The average size of the first ten coefficients: [ 0.02   -0.0067  0.     -0.     -0.0054  0.4548  0.188   0.0804 -0.0221
  0.014 ]


While linear regression performs better after manually removing redundancies, both results are still quite poor.
We suspect multicollinearity to be a main reason for the bad performance.

But before starting to deal with multicollinearity, we want to normalize/standardize the data. This is because, in the end, we aim to compare coefficients. Hence, performing the analysis on the normalized/standardized data along the way is also necessary to prevent us from developing a model that works only on non-normalized/non-standardized data.

### Normalization 

Normalization worsens the error from 0.7 to 1.3.

In [None]:
# Import the function explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.preprocessing` submodule has been loaded.
from sklearn.preprocessing import normalize

# NOTE: wb_data_cc is only created in the VIF cells further down in this
# notebook, so those cells have to be executed before this one.
#
# norm="l2" and axis=1 are normalize()'s defaults, spelled out here on purpose:
# every row (i.e. every country) is rescaled to unit Euclidean length. The
# individual indicator columns are NOT rescaled by this call.
wb_data_cc_norm = wb_data_cc.copy(deep=True)
wb_data_cc_norm[:] = normalize(wb_data_cc, norm="l2", axis=1)


### Standardization

Standardize world bank data: $\frac{x-\mu}{\sigma}$

In [None]:
from sklearn.preprocessing import StandardScaler


def _standardized(frame):
    """Return a deep copy of ``frame`` whose values are the StandardScaler output.

    A fresh scaler is fitted on ``frame`` itself. Index and column labels are
    kept; only the cell values are overwritten.
    """
    scaled = frame.copy(deep=True)
    scaled[:] = StandardScaler().fit_transform(frame)
    return scaled


wb_data_st = _standardized(wb_data)
wb_data_short_st = _standardized(wb_data_short)

## Pearson correlation coefficients

Compute the pairwise Pearson correlation coefficients between the indicators.

In [None]:
import scipy.stats

# Pairwise Pearson correlation between all indicators of the reduced dataset.
indicator_corr = wb_data_short.corr(method="pearson")

# Report, per target indicator, the indicators it is strongly correlated with.
# The bare mid-cell expressions that used to sit here displayed nothing
# (only the last expression of a cell is rendered) and were removed.
# NOTE(review): the call below was commented out although the recorded cell
# output is what it prints; see utils.print_corr for the exact selection rule.
threshold = 0.85
utils.print_corr(indicator_corr, threshold)



Target indicator:  Access to electricity (% of population)
Correlated Indicators:
Access to electricity, urban (% of urban population): 0.8917125593423901


Target indicator:  Access to electricity, urban (% of urban population)
Correlated Indicators:
Access to electricity (% of population): 0.8917125593423901


Target indicator:  Adjusted savings: energy depletion (current US$)
Correlated Indicators:


Target indicator:  Adjusted savings: mineral depletion (current US$)
Correlated Indicators:


Target indicator:  Adolescent fertility rate (births per 1,000 women ages 15-19)
Correlated Indicators:
Age dependency ratio, young (% of working-age population): 0.8517896042645


Target indicator:  Age dependency ratio, old (% of working-age population)
Correlated Indicators:
Population ages 55-59, male (% of male population): 0.8559541006519372
Population ages 60-64, female (% of female population): 0.8945384584882671
Population ages 60-64, male (% of male population): 0.9064395146404964
P

In [None]:
# Correlation matrix of the full indicator table and its matrix inverse,
# re-labelled with the indicator names on both axes.
wb_data_cor = wb_data.corr()
wb_inv_corr = pd.DataFrame(
    np.linalg.inv(wb_data_cor.to_numpy()),
    index=wb_data_cor.index,
    columns=wb_data_cor.columns,
)
# NOTE(review): the diagonal of an inverse correlation matrix equals the
# variance inflation factors, which is presumably what was meant to be
# inspected here (e.g. np.diag(wb_inv_corr)) -- confirm. With strongly
# collinear indicators the matrix may be (near-)singular, in which case the
# inverse is numerically unreliable.

In [None]:
# Run the VIF helper from src.ana_utils over every column of the full
# indicator table. The result is indexed as wb_vif["VIF"] in the next cell.
wb_vif = utils.sklearn_vif(wb_data.columns, wb_data)

  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)


In [None]:
# Number of indicators with the smallest VIF that are kept.
n_keep = 15

# Rank the indicators by VIF in ascending order and mark everything after the
# first n_keep positions for removal. .iloc makes the positional slice
# explicit, and the open-ended upper bound no longer depends on the exact
# number of indicator columns.
cc_indicators = list(wb_vif["VIF"].sort_values().iloc[n_keep:].index)

# Reduced dataset containing only the n_keep lowest-VIF indicators.
wb_data_cc = wb_data.drop(columns=cc_indicators)
wb_data_cc.shape

(150, 15)

In [None]:
# Re-run the VIF helper on the reduced indicator set; as the last expression
# of the cell, its return value becomes the cell output.
utils.sklearn_vif(wb_data_cc.columns, wb_data_cc)

In [None]:
# Additionally remove the sex-ratio indicator from the reduced set.
# errors="ignore" keeps this cell re-runnable: wb_data_cc is reassigned to
# itself, so on a second execution the column is already gone and a plain
# drop() would raise a KeyError.
wb_data_cc = wb_data_cc.drop(
    columns="Sex ratio at birth (male births per female births)",
    errors="ignore",
)