# Notebook 3: Dealing with Multicollinearity

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import os
import tqdm
import glob
import pandas as pd
import sklearn
from src import ana_utils as utils
import collections

#np.set_printoptions(suppress=True)
from sklearn.linear_model import LinearRegression

# Seed NumPy's global random number generator so that code drawing from
# np.random produces the same values on every run.
np.random.seed(7)

Import datasets that were preprocessed in Notebook 1

In [2]:
# Load the three preprocessed tables, each indexed by country and ordered
# alphabetically by that index.
wb_data = pd.read_csv("data/wb_data.csv", index_col="Country Name").sort_index()
wb_data_short = pd.read_csv(
    "data/wb_data_short.csv", index_col="Country Name"
).sort_index()
whr_data = pd.read_csv("data/whr_data.csv", index_col="Country name").sort_index()

# Only the life satisfaction ladder score is needed from the WHR data.
whr_scores = whr_data["Ladder score"]

### Pearson correlation coefficient analysis

First, we wanted to get an impression of the amount of multicollinearity present in the data. Therefore, we looked at the Pearson correlation coefficient of each indicator pair.

In [15]:
import scipy.stats

# Pairwise Pearson correlation between all World Bank indicators.
indicator_corr = wb_data.corr(method="pearson")

# utils.corr_counter evaluates the correlation matrix against the threshold
# and returns a dict keyed by indicator name.
# NOTE(review): the values are presumably the number of partners correlated
# above the threshold -- confirm in ana_utils.
threshold = 0.85
corr_dict = utils.corr_counter(indicator_corr, threshold, verbose=False)

# Order the (indicator, value) pairs from the largest value to the smallest.
corr_dict_sorted = sorted(
    corr_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# TODO: visualize
corr_dict_sorted

[('Adjusted savings: energy depletion (current US$)', 0),
 ('Adjusted savings: mineral depletion (current US$)', 0),
 ('Death rate, crude (per 1,000 people)', 0),
 ('Employment in industry, female (% of female employment) (modeled ILO estimate)',
  0),
 ('Incidence of tuberculosis (per 100,000 people)', 0),
 ('Labor force participation rate, male (% of male population ages 15-64) (modeled ILO estimate)',
  0),
 ('Land area (sq. km)', 0),
 ('Lower secondary school starting age (years)', 0),
 ('Merchandise exports by the reporting economy, residual (% of total merchandise exports)',
  0),
 ('Merchandise exports to economies in the Arab World (% of total merchandise exports)',
  0),
 ('Merchandise exports to high-income economies (% of total merchandise exports)',
  0),
 ('Merchandise exports to low- and middle-income economies in East Asia & Pacific (% of total merchandise exports)',
  0),
 ('Merchandise exports to low- and middle-income economies in Europe & Central Asia (% of total mer

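#Upon further research, we find that instead of removing redundancies by hand based on pairwise correlations, we can quantify the multicollinearity of each indicator with its variance inflation factor (VIF).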

In [4]:
# Build the VIF table for all World Bank indicators via the project helper.
# The result is used below as a table indexed by indicator with a "VIF" column.
# NOTE(review): the warnings printed under this cell point at
# `vif = 1/(1 - r_squared)` -- presumably a division by zero when
# r_squared == 1; confirm in ana_utils.
wb_vif = utils.sklearn_vif(wb_data.columns, wb_data)

  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)
  vif = 1/(1 - r_squared)


In [5]:
# Number of indicators to keep: those with the smallest VIF values.
N_LOW_VIF_INDICATORS = 15

# Rank indicators by ascending VIF; everything after the first
# N_LOW_VIF_INDICATORS positions gets dropped. An open-ended positional
# .iloc slice keeps this correct for any number of columns, whereas a
# hard-coded upper bound would silently retain the highest-VIF indicators
# on a wider table.
vif_ranked = wb_vif["VIF"].sort_values()
cc_indicators = list(vif_ranked.iloc[N_LOW_VIF_INDICATORS:].index)

# Reduced indicator table containing only the low-VIF indicators.
wb_data_cc = wb_data.drop(columns=cc_indicators)
wb_data_cc.shape

(150, 15)

In [6]:
# Recompute the VIF table on the reduced indicator set; as the last
# expression of the cell, the resulting table is displayed.
utils.sklearn_vif(wb_data_cc.columns, wb_data_cc)

Unnamed: 0,VIF,Tolerance
"Access to electricity, urban (% of urban population)",1.7,0.5886
Adjusted savings: mineral depletion (current US$),1.2,0.8191
"Incidence of tuberculosis (per 100,000 people)",1.7,0.6015
"Merchandise exports by the reporting economy, residual (% of total merchandise exports)",1.2,0.8213
Merchandise exports to economies in the Arab World (% of total merchandise exports),1.2,0.8662
Merchandise exports to high-income economies (% of total merchandise exports),1.6,0.6288
Merchandise exports to low- and middle-income economies in South Asia (% of total merchandise exports),1.4,0.7248
"Merchandise imports by the reporting economy, residual (% of total merchandise imports)",1.1,0.9125
Merchandise imports from low- and middle-income economies outside region (% of total merchandise imports),1.4,0.6996
Population density (people per sq. km of land area),1.6,0.6393


In [7]:
# Exclude the sex-ratio-at-birth indicator from the reduced indicator set.
# NOTE: this reassigns wb_data_cc, so running the cell a second time raises
# a KeyError because the column is already gone.
wb_data_cc = wb_data_cc.drop(
    columns="Sex ratio at birth (male births per female births)"
)

In [8]:
# Run the project helper utils.n_fold_ceval on the full dense indicator data
# (wb_data) together with the ladder scores (whr_scores).
# NOTE(review): 2000 and 30 are presumably the number of repetitions and the
# hold-out size, and "no_scaling" a scaling mode -- confirm against the
# helper's signature in ana_utils.
loss_list, mean_loss, coef_list, avg_coefs = utils.n_fold_ceval(2000, wb_data, whr_scores, 30, "no_scaling")

# Report the mean loss and the first ten entries of avg_coefs.
print("Mean loss:", mean_loss)
print("The average size of the first ten coefficients:", avg_coefs[:10])

Mean loss: 14.448144518650265
The average size of the first ten coefficients: [ 0.0153 -0.0082 -0.     -0.     -0.0029  0.0634 -0.0099  0.0733  0.0773
 -0.0862]
