# Notebook 2: Conducting and Evaluating Regression Analysis

In [222]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import os
import tqdm
import glob
import pandas as pd
import sklearn

# Seed NumPy's global random number generator so that code drawing from it
# (e.g. pandas' DataFrame.sample when called without random_state) is
# repeatable on a fresh Restart & Run All.
np.random.seed(1)

Load the datasets that were preprocessed in Notebook 1.

In [223]:
# Each CSV carries the country name twice (the second copy is read back as
# "<name>.1"): use the first copy as the row index, discard the duplicate,
# and sort the rows alphabetically by country so all tables share one order.

# World Bank indicators, full set.
wb_data = (
    pd.read_csv("data/wb_data.csv")
    .set_index("Country Name")
    .drop(columns=["Country Name.1"])
    .sort_index()
)

# World Bank indicators, shortened set.
wb_data_short = (
    pd.read_csv("data/wb_data_short.csv")
    .set_index("Country Name")
    .drop(columns=["Country Name.1"])
    .sort_index()
)

# World Happiness Report table (note the lower-case "name" in its column).
whr_data = (
    pd.read_csv("data/whr_data.csv")
    .set_index("Country name")
    .drop(columns=["Country name.1"])
    .sort_index()
)

In [224]:
# Sanity check: the World Bank and WHR tables must list exactly the same
# country labels (compared as sorted lists of their row indices).
wb_countries = sorted(wb_data.index)
whr_countries = sorted(whr_data.index)
print(wb_countries == whr_countries)

True


## Split data into training and test sets

In [225]:
# Keep only the "Ladder score" (life satisfaction) column of the WHR table;
# every other WHR column is left behind.
whr_scores = whr_data.loc[:, "Ladder score"]

In [226]:
# Number of rows held out for the test set (passed to split_data below).
test_size = 30

def split_data(data, gt, test_size, random_state=None):
    """
    Randomly split a feature table and its ground truth into train and test sets.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table with one row per sample; the row index labels identify
        the samples (labels are assumed to be unique).
    gt : pandas.Series or pandas.DataFrame
        Ground-truth values indexed by the same labels as ``data``. Row order
        does not have to match ``data``; labels missing from ``gt`` raise a
        ``KeyError``.
    test_size : int
        Number of rows of ``data`` to draw (without replacement) for the test set.
    random_state : int, numpy.random.RandomState, numpy.random.Generator or None
        Forwarded to ``DataFrame.sample``. With the default ``None`` the draw
        uses NumPy's global random state, i.e. it is controlled by
        ``np.random.seed`` exactly as before this parameter existed.

    Returns
    -------
    tuple
        ``(train_set, test_set, train_gt, test_gt)`` as pandas objects.
        ``train_gt`` / ``test_gt`` hold the ground truth for exactly the rows
        of ``train_set`` / ``test_set``, in the same row order.
    """
    test_set = data.sample(n=test_size, random_state=random_state)
    train_set = data.drop(index=test_set.index)

    # Look the targets up by the labels of the feature rows instead of dropping
    # the test labels from ``gt``: this keeps features and targets aligned
    # row-by-row even when ``gt`` is ordered differently or has extra labels.
    test_gt = gt.loc[test_set.index]
    train_gt = gt.loc[train_set.index]

    return train_set, test_set, train_gt, test_gt

# Hold out `test_size` countries of the shortened World Bank table; the WHR
# ladder scores are the regression target.
train, test, train_gt, test_gt = split_data(
    data=wb_data_short,
    gt=whr_scores,
    test_size=test_size,
)

In [227]:
# Shapes of the four pieces of the split, followed by a rich display of the
# training features.
print(*(part.shape for part in (train, test, train_gt, test_gt)))
train

(120, 120) (30, 120) (120,) (30,)


Unnamed: 0_level_0,Access to electricity (% of population),"Access to electricity, urban (% of urban population)",Adjusted savings: energy depletion (current US$),Adjusted savings: mineral depletion (current US$),"Adolescent fertility rate (births per 1,000 women ages 15-19)","Age dependency ratio, old (% of working-age population)","Age dependency ratio, young (% of working-age population)","Birth rate, crude (per 1,000 people)","Contributing family workers, female (% of female employment) (modeled ILO estimate)","Contributing family workers, male (% of male employment) (modeled ILO estimate)",...,"Unemployment, female (% of female labor force) (modeled ILO estimate)","Unemployment, male (% of male labor force) (modeled ILO estimate)","Unemployment, youth female (% of female labor force ages 15-24) (modeled ILO estimate)","Unemployment, youth male (% of male labor force ages 15-24) (modeled ILO estimate)",Urban population (% of total population),Urban population growth (annual %),"Vulnerable employment, female (% of female employment) (modeled ILO estimate)","Vulnerable employment, male (% of male employment) (modeled ILO estimate)","Wage and salaried workers, female (% of female employment) (modeled ILO estimate)","Wage and salaried workers, male (% of male employment) (modeled ILO estimate)"
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,97.700000,100.000000,2.074859e+07,0.000000e+00,61.3250,4.763616,77.346155,31.802,67.430000,11.32,...,13.81,10.19,20.90,16.00,25.754,3.323827,89.470001,76.699997,8.210000,20.330000
Albania,100.000000,100.000000,1.246877e+08,1.783617e+06,19.5028,20.764736,25.438786,11.620,29.440001,16.76,...,11.31,11.58,25.85,27.74,61.229,1.071414,51.050001,51.300001,47.869999,44.209999
Algeria,99.500000,99.800000,1.700532e+10,2.857004e+06,9.5966,10.418294,48.572197,23.583,2.310000,1.78,...,20.44,9.67,45.55,26.41,73.189,2.702067,23.460000,28.430000,74.519997,66.440002
Argentina,100.000000,100.000000,6.810015e+09,1.618795e+08,62.3488,17.523175,38.333859,16.833,0.820000,0.37,...,10.70,9.19,28.83,24.03,91.991,1.125020,21.010001,23.980001,76.599998,71.190002
Armenia,100.000000,100.000000,0.000000e+00,1.198809e+08,20.2820,16.949985,30.650660,13.646,1.950000,0.73,...,20.27,17.65,39.76,29.02,63.219,0.313404,29.200000,36.290001,70.480003,62.340000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Venezuela, RB",100.000000,100.000000,4.441300e+08,1.387500e+07,84.6214,11.711958,42.170218,17.566,2.670000,2.73,...,7.72,6.83,20.93,13.54,88.240,-1.257364,42.390001,42.130002,57.070000,55.930000
Vietnam,99.400000,100.000000,1.924669e+09,1.094470e+08,27.3746,10.910588,33.530246,16.454,19.080000,9.16,...,2.01,2.07,6.71,6.59,36.628,2.908946,57.760000,47.290001,41.099998,49.919998
"Yemen, Rep.",72.751701,93.146530,2.302761e+08,0.000000e+00,57.9728,5.014543,67.773174,29.873,32.130001,10.21,...,25.25,11.85,34.11,23.34,37.273,4.008005,60.650002,47.120000,38.430000,45.430000
Zambia,43.000000,79.878265,7.678265e+05,2.757511e+08,116.4976,3.959618,83.228564,35.776,35.730000,9.45,...,13.09,10.81,24.19,21.08,44.072,4.151133,81.970001,65.240001,17.820000,34.509998


## Linear regression

In [228]:
from sklearn.linear_model import LinearRegression
# Import the metric explicitly: a bare `import sklearn` does not guarantee that
# the `sklearn.metrics` submodule is loaded, so `sklearn.metrics.<name>` only
# works when some other import happens to pull it in.
from sklearn.metrics import mean_squared_error

# Fit ordinary least squares on the training countries and predict the ladder
# score of the held-out countries.
# NOTE(review): the shapes printed for the split above show as many feature
# columns as training rows, so the fit is probably under-determined — keep
# that in mind when reading the test error.
reg = LinearRegression().fit(train, train_gt)
test_pred = reg.predict(test)

# Mean squared error between true and predicted ladder scores on the test set.
loss = mean_squared_error(test_gt, test_pred)
print(loss)
print(test_pred, test_gt)
print(test.index)

4.101402661901457
[5.93505542 3.4660969  6.2748472  4.01934121 5.12010974 2.02424353
 7.49376807 6.62446315 4.71059348 1.77895637 3.5861384  4.16052915
 4.82751824 5.58571963 7.4716147  8.40043704 5.11823344 1.991149
 5.19681946 6.88820366 7.2282487  5.415718   0.39645634 5.80299256
 6.81258039 7.97066901 4.24613962 1.67137048 6.02883534 2.43347223] Country name
Bosnia and Herzegovina    5.6741
Nepal                     5.1372
Latvia                    5.9500
Brazil                    6.3756
Thailand                  5.9988
Hong Kong SAR, China      5.5104
United States             6.9396
Finland                   7.8087
Congo, Dem. Rep.          4.3110
Singapore                 6.3771
Morocco                   5.0948
Australia                 7.2228
Niger                     4.9096
Greece                    5.5150
Liberia                   4.5579
Eswatini                  4.3081
Mongolia                  5.4562
Jamaica                   5.8898
Costa Rica                7.1214
Czech Re