# Notebook 2: Conducting and Evaluating Regression Analysis

In [25]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import os
import tqdm
import glob
import pandas as pd
import sklearn
from src import utils

# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
from sklearn.linear_model import LinearRegression

# Seed NumPy's global random generator so that runs are repeatable
# (presumably relied on by the random splits in src.utils — TODO confirm).
np.random.seed(2)

Load the datasets that were preprocessed in Notebook 1.

In [2]:
# Load the World Bank tables (full and reduced) and the World Happiness Report table.
# Each frame is indexed by country name and sorted by that index, so the row order no
# longer depends on the order in the CSV files. Building the frames in one expression
# (instead of in-place index/drop/sort calls) keeps the cell free of in-place mutation.
wb_data = pd.read_csv("data/wb_data.csv", index_col="Country Name").sort_index()
wb_data_short = pd.read_csv("data/wb_data_short.csv", index_col="Country Name").sort_index()

# The WHR file spells its key column with a lower-case "name".
whr_data = pd.read_csv("data/whr_data.csv", index_col="Country name").sort_index()

In [3]:
# Sanity check: both tables must list exactly the same countries.
print(sorted(wb_data.index) == sorted(whr_data.index))

True


Standardize the World Bank data column-wise: $\frac{x-\mu}{\sigma}$

In [4]:
from sklearn.preprocessing import StandardScaler

# Standardize every indicator column of both World Bank frames.
# The result frames are built directly from the scaler output, reusing the original
# row and column labels. This avoids a throw-away deep copy and does not depend on how
# `df[:] = array` handles the dtypes of the existing columns.
wb_data_st = pd.DataFrame(
    StandardScaler().fit_transform(wb_data),
    index=wb_data.index,
    columns=wb_data.columns,
)
wb_data_short_st = pd.DataFrame(
    StandardScaler().fit_transform(wb_data_short),
    index=wb_data_short.index,
    columns=wb_data_short.columns,
)

In [5]:
# Confirm that standardization still yields DataFrames (not bare arrays).
print(*(type(frame) for frame in (wb_data_st, wb_data_short_st)))

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>


In [6]:
# test: the standardized frames must still cover the same countries as the WHR data.
# The raw wb_data index was already compared before standardization, so repeating
# that exact check would verify nothing new; the standardized frames are what the
# later cells consume.
whr_countries = sorted(whr_data.index)
print(all(sorted(frame.index) == whr_countries for frame in (wb_data_st, wb_data_short_st)))

True


In [7]:
# Keep only the life-satisfaction "Ladder score" column of the WHR table.
whr_scores = whr_data.loc[:, "Ladder score"]

## Pearson correlation coefficients
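Aim: get the correlation coefficient of each indicator with the WHR data. Note: the cell below currently computes the pairwise Pearson correlations among the World Bank indicators themselves (`wb_data_short.corr()`) and lists, for each indicator, the other indicators whose correlation with it exceeds the threshold; the correlation with the WHR ladder score is not computed here.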

In [8]:
import scipy.stats

# Pairwise Pearson correlations between the indicators of the reduced World Bank set.
indicator_corr = wb_data_short.corr(method="pearson")

# Only correlations strictly between this threshold and 1 are reported.
# NOTE(review): this skips strong negative correlations and pairs whose coefficient
# is exactly 1 — confirm that this is intended.
threshold = 0.85

# DataFrame.items() replaces iteritems(), which was deprecated in pandas 1.5 and
# removed in pandas 2.0. Iterating each column as (label, value) pairs also avoids
# positional `values[i]` lookups on a label-indexed Series and no longer overwrites
# the outer loop variable inside the inner loop.
for target_name, correlations in indicator_corr.items():
    print()
    print('\nTarget indicator: ', target_name)
    print('Correlated Indicators:')
    for other_name, value in correlations.items():
        if threshold < value < 1:
            print('{name}: {value}'.format(name=other_name, value=value))

indicator_corr
#TODO function that creates pretty colored corr matrix



Target indicator:  Access to electricity (% of population)
Correlated Indicators:
Access to electricity, urban (% of urban population): 0.8917125593423901


Target indicator:  Access to electricity, urban (% of urban population)
Correlated Indicators:
Access to electricity (% of population): 0.8917125593423901


Target indicator:  Adjusted savings: energy depletion (current US$)
Correlated Indicators:


Target indicator:  Adjusted savings: mineral depletion (current US$)
Correlated Indicators:


Target indicator:  Adolescent fertility rate (births per 1,000 women ages 15-19)
Correlated Indicators:
Age dependency ratio, young (% of working-age population): 0.8517896042645


Target indicator:  Age dependency ratio, old (% of working-age population)
Correlated Indicators:
Population ages 55-59, male (% of male population): 0.8559541006519372
Population ages 60-64, female (% of female population): 0.8945384584882671
Population ages 60-64, male (% of male population): 0.9064395146404964
P

Unnamed: 0,Access to electricity (% of population),"Access to electricity, urban (% of urban population)",Adjusted savings: energy depletion (current US$),Adjusted savings: mineral depletion (current US$),"Adolescent fertility rate (births per 1,000 women ages 15-19)","Age dependency ratio, old (% of working-age population)","Age dependency ratio, young (% of working-age population)","Birth rate, crude (per 1,000 people)","Contributing family workers, female (% of female employment) (modeled ILO estimate)","Contributing family workers, male (% of male employment) (modeled ILO estimate)",...,"Unemployment, female (% of female labor force) (modeled ILO estimate)","Unemployment, male (% of male labor force) (modeled ILO estimate)","Unemployment, youth female (% of female labor force ages 15-24) (modeled ILO estimate)","Unemployment, youth male (% of male labor force ages 15-24) (modeled ILO estimate)",Urban population (% of total population),Urban population growth (annual %),"Vulnerable employment, female (% of female employment) (modeled ILO estimate)","Vulnerable employment, male (% of male employment) (modeled ILO estimate)","Wage and salaried workers, female (% of female employment) (modeled ILO estimate)","Wage and salaried workers, male (% of male employment) (modeled ILO estimate)"
Access to electricity (% of population),1.000000,0.891713,0.160839,0.105644,-0.740773,0.523290,-0.828761,-0.811916,-0.540893,-0.601325,...,0.103030,0.058894,0.213829,0.203132,0.598118,-0.671482,-0.779662,-0.761876,0.774838,0.747612
"Access to electricity, urban (% of urban population)",0.891713,1.000000,0.124989,0.075946,-0.647775,0.402567,-0.684991,-0.670212,-0.459203,-0.475212,...,0.082769,0.025155,0.190671,0.150849,0.482946,-0.507225,-0.639917,-0.643312,0.636859,0.633030
Adjusted savings: energy depletion (current US$),0.160839,0.124989,1.000000,0.433387,-0.157275,0.009812,-0.152066,-0.149840,-0.125559,-0.144647,...,0.030175,-0.073687,0.102990,0.006815,0.184604,-0.098769,-0.178099,-0.178707,0.183051,0.194687
Adjusted savings: mineral depletion (current US$),0.105644,0.075946,0.433387,1.000000,-0.128784,0.045110,-0.119271,-0.122057,-0.012994,-0.055257,...,-0.045158,0.005189,-0.017388,0.041837,0.062962,-0.021018,-0.029289,-0.010006,0.027631,0.006622
"Adolescent fertility rate (births per 1,000 women ages 15-19)",-0.740773,-0.647775,-0.157275,-0.128784,1.000000,-0.642149,0.851790,0.840966,0.472296,0.560582,...,-0.044021,-0.059791,-0.139147,-0.210425,-0.526025,0.650080,0.795840,0.774322,-0.795792,-0.769994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Urban population growth (annual %),-0.671482,-0.507225,-0.098769,-0.021018,0.650080,-0.731605,0.801519,0.821776,0.515400,0.554892,...,-0.048523,-0.131250,-0.150398,-0.243170,-0.517612,1.000000,0.717432,0.657504,-0.715288,-0.650718
"Vulnerable employment, female (% of female employment) (modeled ILO estimate)",-0.779662,-0.639917,-0.178099,-0.029289,0.795840,-0.682698,0.833208,0.833539,0.754944,0.755263,...,-0.182673,-0.155460,-0.276438,-0.287490,-0.715048,0.717432,1.000000,0.964274,-0.999057,-0.959451
"Vulnerable employment, male (% of male employment) (modeled ILO estimate)",-0.761876,-0.643312,-0.178707,-0.010006,0.774322,-0.642693,0.806882,0.805703,0.765099,0.775011,...,-0.162966,-0.147758,-0.244519,-0.260594,-0.689344,0.657504,0.964274,1.000000,-0.961778,-0.993946
"Wage and salaried workers, female (% of female employment) (modeled ILO estimate)",0.774838,0.636859,0.183051,0.027631,-0.795792,0.678495,-0.828350,-0.828061,-0.754634,-0.756773,...,0.181031,0.151989,0.275226,0.284088,0.710290,-0.715288,-0.999057,-0.961778,1.000000,0.960930


Compute the variance inflation factor (VIF) of each indicator.

In [26]:
# Compute the VIF table for all World Bank indicators; later cells read its "VIF" column.
# NOTE(review): the recorded output shows repeated warnings raised on the helper's line
# `vif = 1/(1 - r_squared)`, presumably because r_squared equals 1 for some indicators
# (perfect collinearity) — confirm and decide how such indicators should be handled.
wb_vif = utils.sklearn_vif(wb_data.columns, wb_data)

  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)


Remove indicators that are too strongly correlated: only the 15 indicators with the lowest VIF are kept.

In [11]:
# Number of indicators to keep, counted from the lowest VIF upwards.
n_low_vif = 15

# Rank the indicators by VIF and drop everything after the first n_low_vif entries.
# An open-ended .iloc slice replaces the previous hard-coded upper bound of 156, which
# would silently keep surplus columns if the number of indicators ever changed. It also
# makes the positional (not label-based) slicing explicit.
vif_ranked = wb_vif["VIF"].sort_values()
cc_indicators = list(vif_ranked.iloc[n_low_vif:].index)

wb_data_cc = wb_data.drop(columns=cc_indicators)
wb_data_cc.shape

(150, 15)

In [12]:
#TODO test whether 2-step reduction makes a difference!

In [13]:
# Recompute the VIF table on the reduced indicator set to inspect the remaining collinearity.
utils.sklearn_vif(wb_data_cc.columns, wb_data_cc)

Unnamed: 0,VIF,Tolerance
"Access to electricity, urban (% of urban population)",1.7,0.5886
Adjusted savings: mineral depletion (current US$),1.2,0.8191
"Incidence of tuberculosis (per 100,000 people)",1.7,0.6015
"Merchandise exports by the reporting economy, residual (% of total merchandise exports)",1.2,0.8213
Merchandise exports to economies in the Arab World (% of total merchandise exports),1.2,0.8662
Merchandise exports to high-income economies (% of total merchandise exports),1.6,0.6288
Merchandise exports to low- and middle-income economies in South Asia (% of total merchandise exports),1.4,0.7248
"Merchandise imports by the reporting economy, residual (% of total merchandise imports)",1.1,0.9125
Merchandise imports from low- and middle-income economies outside region (% of total merchandise imports),1.4,0.6996
Population density (people per sq. km of land area),1.6,0.6393


In [14]:
# Correlation matrix of all World Bank indicators and its matrix inverse,
# labelled with the same indicator names on both axes.
# NOTE(review): presumably the diagonal of the inverse is meant to be read as VIFs — confirm.
wb_data_cor = wb_data.corr()
inverse_values = np.linalg.inv(wb_data_cor.to_numpy())
wb_inv_corr = pd.DataFrame(
    inverse_values,
    index=wb_data_cor.index,
    columns=wb_data_cor.columns,
)

In [15]:
# For every indicator, list the other indicators whose Pearson correlation with it
# lies strictly between `threshold` and 1.
# DataFrame.items() replaces iteritems(), which was deprecated in pandas 1.5 and
# removed in pandas 2.0. Iterating each column as (label, value) pairs also avoids
# positional `values[i]` lookups on a label-indexed Series and no longer overwrites
# the outer loop variable inside the inner loop.
for target_name, correlations in indicator_corr.items():
    print()
    print('\nTarget indicator: ', target_name)
    print('Correlated Indicators:')
    for other_name, value in correlations.items():
        if threshold < value < 1:
            print('{name}: {value}'.format(name=other_name, value=value))



Target indicator:  Access to electricity (% of population)
Correlated Indicators:
Access to electricity, urban (% of urban population): 0.8917125593423901


Target indicator:  Access to electricity, urban (% of urban population)
Correlated Indicators:
Access to electricity (% of population): 0.8917125593423901


Target indicator:  Adjusted savings: energy depletion (current US$)
Correlated Indicators:


Target indicator:  Adjusted savings: mineral depletion (current US$)
Correlated Indicators:


Target indicator:  Adolescent fertility rate (births per 1,000 women ages 15-19)
Correlated Indicators:
Age dependency ratio, young (% of working-age population): 0.8517896042645


Target indicator:  Age dependency ratio, old (% of working-age population)
Correlated Indicators:
Population ages 55-59, male (% of male population): 0.8559541006519372
Population ages 60-64, female (% of female population): 0.8945384584882671
Population ages 60-64, male (% of male population): 0.9064395146404964
P

## Split data into train and test set

In [27]:
# Size of the held-out test set passed to the split helper
# (the recorded output below shows 30 test rows and 120 training rows).
test_size = 30
# Split the standardized reduced indicators and the ladder scores into matching
# train/test parts; the *_gt variables hold the corresponding ground-truth scores.
train, test, train_gt, test_gt = utils.split_data(wb_data_short_st, whr_scores, test_size)

In [17]:
# Show the shape of every part of the split on one line.
print(*(part.shape for part in (train, test, train_gt, test_gt)))
# The test features and the test targets must refer to the same countries in the same order.
print(test.index.tolist() == test_gt.index.tolist())

(120, 120) (30, 120) (120,) (30,)
True


## Linear regression

Let's see how linear regression performs on wb_data and wb_data_short (redundant indicators removed).

In [19]:
# Exclude the sex-ratio-at-birth indicator from the reduced set before fitting.
# errors="ignore" keeps the cell idempotent: wb_data_cc is reassigned from itself, so
# without it a second run of this cell raises KeyError because the column is already gone.
wb_data_cc = wb_data_cc.drop(
    columns=["Sex ratio at birth (male births per female births)"],
    errors="ignore",
)

Normalization worsens the error from 0.7 to 1.3.

In [20]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee that
# `sklearn.preprocessing` is loaded, so the attribute access below would otherwise
# depend on another cell having imported it first.
import sklearn.preprocessing

# Normalized variant of the reduced indicator set, with the original labels reused.
# NOTE(review): normalize() with its default arguments rescales each ROW (country) to
# unit L2 norm, not each indicator column — confirm that row-wise scaling is intended.
wb_data_cc_norm = pd.DataFrame(
    sklearn.preprocessing.normalize(wb_data_cc),
    index=wb_data_cc.index,
    columns=wb_data_cc.columns,
)


In [28]:
# Evaluate the reduced indicator set against the ladder scores with the project helper,
# which returns one loss and one coefficient set per evaluation.
# (1500 and 30 are presumably the number of repetitions and the test-set size — TODO confirm.)
loss_list, coef_list = utils.n_fold_ceval(1500, wb_data_cc, whr_scores, 30)

# Average the collected losses.
loss_arr = np.array(loss_list)
mean_loss = np.mean(loss_arr)

print(mean_loss)

0.7152410658941222


In [22]:
# Average the collected coefficients along axis 0 (presumably one entry per
# evaluation run) and round to two decimals.
avg_coefs = np.mean(coef_list, axis=0).round(2)

# Display the indicator names of the reduced set.
wb_data_cc.columns

Index(['Access to electricity, urban (% of urban population)',
       'Adjusted savings: mineral depletion (current US$)',
       'Incidence of tuberculosis (per 100,000 people)',
       'Merchandise exports by the reporting economy, residual (% of total merchandise exports)',
       'Merchandise exports to economies in the Arab World (% of total merchandise exports)',
       'Merchandise exports to high-income economies (% of total merchandise exports)',
       'Merchandise exports to low- and middle-income economies in South Asia (% of total merchandise exports)',
       'Merchandise imports by the reporting economy, residual (% of total merchandise imports)',
       'Merchandise imports from low- and middle-income economies outside region (% of total merchandise imports)',
       'Population density (people per sq. km of land area)',
       'Secondary education, duration (years)',
       'Secure Internet servers (per 1 million people)',
       'Statistical performance indicators (SP

In [23]:
# Inspect a single averaged coefficient, addressed from the end of the array.
# NOTE(review): which indicator position -12 refers to is not evident here —
# confirm against wb_data_cc.columns.
avg_coefs[-12]
#avg_coefs

-0.0

For high n, the MSE loss is around 5. We print the first 50 entries of the loss array to check for outliers. It turns out that the variance is quite large and the loss lies roughly in the range [1, 25].