# Notebook 2: Conducting and Evaluating Regression Analysis

In [27]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import os
import tqdm
import glob
import pandas as pd
import sklearn

np.random.seed(1)

Import datasets that were preprocessed in Notebook 1

In [28]:
# Each table is read, indexed by its country column, stripped of the extra
# "….1" name column (presumably a duplicate written by Notebook 1 - confirm),
# and finally ordered by country so that all three frames line up row by row.
wb_data = (
    pd.read_csv("data/wb_data.csv")
    .set_index("Country Name")
    .drop(columns=["Country Name.1"])
    .sort_index()
)
wb_data_short = (
    pd.read_csv("data/wb_data_short.csv")
    .set_index("Country Name")
    .drop(columns=["Country Name.1"])
    .sort_index()
)

# note: the happiness report uses a lower-case "name" in its column header
whr_data = (
    pd.read_csv("data/whr_data.csv")
    .set_index("Country name")
    .drop(columns=["Country name.1"])
    .sort_index()
)

In [29]:
# sanity check: both datasets must cover exactly the same set of countries
print(sorted(wb_data.index) == sorted(whr_data.index))

True


## Split data into train and test set

In [30]:
# keep only the life satisfaction ladder score as the regression target
whr_scores = whr_data.loc[:, "Ladder score"]

In [31]:
# number of rows that split_data samples into the held-out test set
test_size = 30

def split_data(data, gt, test_size, random_state=None):
    """
    randomly split a dataset and its ground truth into a train and a test part

    args:
        data: pandas DataFrame of indicators, one row per index label
        gt: pandas object holding the ground truth, indexed by the same labels as data
        test_size: number of rows sampled (without replacement) into the test set
        random_state: optional seed or RandomState forwarded to DataFrame.sample;
            the default None draws from numpy's global random state, as before
    returns: tuple of pandas objects (train_set, test_set, train_gt, test_gt);
        test_gt is ordered like test_set, and the train parts keep the original
        order with the test labels removed
    """
    test_set = data.sample(n=test_size, random_state=random_state)
    test_labels = list(test_set.index.values)

    # everything that was not sampled into the test set is used for training
    train_set = data.drop(labels=test_labels)

    # select the ground truth by label so it stays aligned with the sampled rows
    test_gt = gt.loc[test_labels]
    train_gt = gt.drop(labels=test_labels)

    return train_set, test_set, train_gt, test_gt

# hold out `test_size` rows of the reduced indicator set together with their scores
train, test, train_gt, test_gt = split_data(
    data=wb_data_short, gt=whr_scores, test_size=test_size
)

In [32]:
# show the shape of every split, then confirm that test rows and test labels line up
print(*(part.shape for part in (train, test, train_gt, test_gt)))
print(test.index.tolist() == test_gt.index.tolist())

(120, 120) (30, 120) (120,) (30,)
True


## Linear regression

In [48]:
from sklearn.linear_model import LinearRegression

def n_fold_ceval(n, data, gt, test_size, loss=None):
    """
    perform n rounds of repeated random train/test validation for linear regression

    Every round draws a fresh random split with split_data, fits a
    LinearRegression on the training part and scores its predictions on the
    test part. The splits are drawn independently of each other, so the test
    sets of different rounds can overlap (this is not a partition into n folds).

    args:
        n: number of validation rounds
        data: dataset of indicators
        gt: groundtruth data
        test_size: size of the test set drawn in every round
        loss: optional callable loss(y_true, y_pred) returning the loss of one
            round; when omitted or None the mean squared error is used
    returns: list of length n, each entry contains the loss of one validation round
    """
    # imported explicitly so the function does not rely on a bare
    # `import sklearn` exposing the metrics submodule
    from sklearn.metrics import mean_squared_error

    # keep the caller's loss under its own name; rebinding `loss` inside the
    # loop previously discarded whatever had been passed in
    loss_fn = mean_squared_error if loss is None else loss

    loss_list = []
    for _ in range(n):
        train, test, train_gt, test_gt = split_data(data, gt, test_size)

        reg = LinearRegression().fit(train, train_gt)
        test_pred = reg.predict(test)
        loss_list.append(loss_fn(test_gt, test_pred))

    return loss_list

Let's see how linear regression performs on wb_data and wb_data_short (redundant indicators removed).

In [48]:
# evaluate linear regression on the full indicator set; the loss function is
# passed explicitly because n_fold_ceval declares it as a parameter
losses = np.array(
    n_fold_ceval(1500, wb_data, whr_scores, test_size, sklearn.metrics.mean_squared_error)
)
losses_mean = losses.mean()

print(losses_mean)
# the first 50 individual losses make outliers visible that the mean hides
print(losses[0:50])

14.945441120476232
[  6.51939572   6.26261574   7.58159913   1.63483737   5.97839956
   4.87304204   2.66028739   7.90599941   7.39089931  34.86942516
   2.26733357   3.66616242  10.3733022    2.6190404    6.72689131
 288.54859496   9.37142456   3.43677257  25.49719684   2.14758935
   7.67526786  44.05844199   9.77266899  29.71788408  13.15540793
   6.7018282    6.4644629   13.77506163   3.00843642   4.48618897
   4.94498779   2.26956486  40.50720183  62.64376693   3.68594704
  12.99721999  13.45737186   9.21572372  11.1891592    9.03848938
  57.69502699   8.30333794  82.60712957   2.82862587  20.08378955
  10.52933585   4.70173479   1.62237674   2.69534614   9.85536205]


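For high n, the mean MSE loss is around 15 (14.95 in the run above). We print the first 50 entries of the loss array to check for outliers. It turns out that the variance is quite large: most losses lie roughly in the range [1, 25], but a few splits produce much larger values (up to about 289 among the first 50), and these outliers pull the mean well above the typical loss.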