# Linear Regression

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import r2_score

In [2]:
# Load the pickled happiness dataset. The cells below read its 'Year', 'Score',
# 'Low', 'High' and per-component columns from this frame.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
df = pd.read_pickle('data/happiness_data.pkl')

In [3]:
# Split by year: rows from `years_train` fit the models, rows from `years_test` are held out.
years_train = [2015, 2016]
years_test = [2017]

is_train_year = df['Year'].isin(years_train)
is_test_year = df['Year'].isin(years_test)

# Discard the original index so each subset is numbered 0..n-1.
train_features = df.loc[is_train_year].reset_index(drop=True)
test_features = df.loc[is_test_year].reset_index(drop=True)

In [4]:
# Columns stripped from both splits: Country, Year, the Score with its Low/High
# columns, and the seven score-component columns.
non_feature_columns = [
    'Country', 'Year', 'Score', 'Low', 'High',
    'Economy', 'Family', 'Health', 'Freedom', 'Trust', 'Generosity', 'Dystopia',
]
df_train = train_features.drop(columns=non_feature_columns)
df_test = test_features.drop(columns=non_feature_columns)

In [5]:
# Pre-built inputs for the per-component regressions, one pickle per component and split.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
load_frame = pd.read_pickle

# Training inputs
df_economy_train = load_frame('data/economy_train.pkl')
df_family_train = load_frame('data/family_train.pkl')
df_health_train = load_frame('data/health_train.pkl')
df_freedom_train = load_frame('data/freedom_train.pkl')
df_trust_train = load_frame('data/trust_train.pkl')
df_generosity_train = load_frame('data/generosity_train.pkl')
df_dystopia_train = load_frame('data/dystopia_train.pkl')

# Test inputs
df_economy_test = load_frame('data/economy_test.pkl')
df_family_test = load_frame('data/family_test.pkl')
df_health_test = load_frame('data/health_test.pkl')
df_freedom_test = load_frame('data/freedom_test.pkl')
df_trust_test = load_frame('data/trust_test.pkl')
df_generosity_test = load_frame('data/generosity_test.pkl')
df_dystopia_test = load_frame('data/dystopia_test.pkl')

In [6]:
# Pull each regression target out of the train/test splits as a one-column DataFrame.

# Training targets
score_train = train_features[['Score']]
economy_train = train_features[['Economy']]
family_train = train_features[['Family']]
health_train = train_features[['Health']]
freedom_train = train_features[['Freedom']]
trust_train = train_features[['Trust']]
generosity_train = train_features[['Generosity']]
dystopia_train = train_features[['Dystopia']]

# Test targets
score_test = test_features[['Score']]
economy_test = test_features[['Economy']]
family_test = test_features[['Family']]
health_test = test_features[['Health']]
freedom_test = test_features[['Freedom']]
trust_test = test_features[['Trust']]
generosity_test = test_features[['Generosity']]
dystopia_test = test_features[['Dystopia']]

# 'Low' / 'High' columns of the test split, used for the final margin check
low_score = test_features[['Low']]
high_score = test_features[['High']]

In [7]:
# Economy: fit a linear regression on the training inputs, then evaluate on the test inputs.
economy_target_train = economy_train.values.ravel()
economy_target_test = economy_test.values.ravel()

# fit() returns the fitted estimator, so construction and fitting can be chained
lr = linear_model.LinearRegression().fit(df_economy_train, economy_target_train)
economy_predictions = lr.predict(df_economy_test)

# Per-row absolute error; `accuracy` holds the R^2 score here, not a percentage
errors = np.abs(economy_predictions - economy_target_test)
accuracy = r2_score(economy_target_test, economy_predictions)
print('R2 score: ', accuracy)
errors

R2 score:  0.5498024913843389


array([0.52843646, 0.01211164, 0.19137564, 0.24109757, 0.06483178,
       0.16629932, 0.00133748, 0.01873091, 0.29540194, 0.36212646,
       0.29825416, 0.29870644, 0.34543753, 0.19166525, 0.17464699,
       0.26810358, 0.18156471, 0.02965694, 0.1082    , 0.37530448,
       0.03542983, 0.43342168, 0.13741881, 0.51061768, 0.74415102,
       0.51862341, 0.03429014, 0.28900729, 0.13394786, 0.08540071,
       0.23733851, 0.02467048, 0.15918004, 0.03466059, 0.00527845,
       0.31616614, 0.08984589, 0.33810107, 0.5424277 , 0.07002363,
       0.16834374, 0.27645488, 0.31182933, 0.24682572, 0.49774891,
       0.10818721, 0.33225467, 0.17030927, 0.12954564, 0.33396441,
       0.02894786, 0.29866938, 0.04351354, 0.02188362, 0.1488242 ,
       0.06734045, 0.1245433 , 0.07110422, 0.18344575, 0.13746662,
       0.05132479, 0.19599738, 0.3462851 , 0.27609946, 0.14372236,
       0.10531293, 0.07241949, 0.44823336, 0.44068506, 0.01819545,
       0.27661181, 0.20357475, 0.43984078, 0.42921518, 0.37076

In [8]:
# Family: fit a linear regression on the training inputs, then evaluate on the test inputs.
family_target_train = family_train.values.ravel()
family_target_test = family_test.values.ravel()

# fit() returns the fitted estimator, so construction and fitting can be chained
lr = linear_model.LinearRegression().fit(df_family_train, family_target_train)
family_predictions = lr.predict(df_family_test)

# Per-row absolute error; `accuracy` holds the R^2 score here, not a percentage
errors = np.abs(family_predictions - family_target_test)
accuracy = r2_score(family_target_test, family_predictions)
print('R2 score: ', accuracy)
errors

R2 score:  -1.0919179534002383


array([0.21422708, 0.01180241, 0.2844878 , 0.40070823, 0.45210009,
       0.12684172, 0.1129137 , 0.36786535, 0.26365662, 0.45435062,
       0.14606912, 0.49972093, 0.60820183, 0.05185439, 0.48684677,
       0.39229314, 0.27753012, 0.50375132, 0.4185252 , 0.53317713,
       0.4248936 , 0.13396351, 0.31440984, 0.2803498 , 0.41256327,
       0.3347344 , 0.2681623 , 0.37950637, 0.60445422, 0.54523281,
       0.03732129, 0.18949118, 0.32580373, 0.45014426, 0.56860491,
       0.39053312, 0.31540684, 0.42741678, 0.14800714, 0.52435014,
       0.38783267, 0.23816526, 0.39524508, 0.4954442 , 0.15140456,
       0.33716227, 0.49462693, 0.18005185, 0.08212418, 0.16296483,
       0.25321213, 0.49616806, 0.43075305, 0.36956691, 0.12606261,
       0.39107991, 0.25625229, 0.44927112, 0.58116559, 0.41595047,
       0.43688122, 0.41768644, 0.47260047, 0.40236476, 0.22390965,
       0.25338498, 0.32597229, 0.26187158, 0.34855924, 0.42454662,
       0.55444266, 0.41671246, 0.00179836, 0.37228744, 0.44269

In [9]:
# Health: fit a linear regression on the training inputs, then evaluate on the test inputs.
health_target_train = health_train.values.ravel()
health_target_test = health_test.values.ravel()

# fit() returns the fitted estimator, so construction and fitting can be chained
lr = linear_model.LinearRegression().fit(df_health_train, health_target_train)
health_predictions = lr.predict(df_health_test)

# Per-row absolute error; `accuracy` holds the R^2 score here, not a percentage
errors = np.abs(health_predictions - health_target_test)
accuracy = r2_score(health_target_test, health_predictions)
print('R2 score: ', accuracy)
errors

R2 score:  0.8511355242011551


array([0.1086192 , 0.04024063, 0.03074141, 0.13519958, 0.00661738,
       0.04479138, 0.04379464, 0.00115169, 0.09597517, 0.06798074,
       0.02539205, 0.10991463, 0.0613128 , 0.12075352, 0.08726009,
       0.07490301, 0.03000144, 0.12131533, 0.02096762, 0.00709991,
       0.06859751, 0.14342228, 0.19038822, 0.11411089, 0.0398257 ,
       0.04506312, 0.0313523 , 0.08435044, 0.05197862, 0.0436802 ,
       0.06922358, 0.04905411, 0.0856859 , 0.03017224, 0.07500694,
       0.03384906, 0.13747636, 0.15674089, 0.07493815, 0.01451149,
       0.03589585, 0.09977148, 0.05313305, 0.02244277, 0.1519381 ,
       0.05752881, 0.05960686, 0.03218195, 0.01208101, 0.12281683,
       0.11651021, 0.01119814, 0.01457319, 0.1033978 , 0.12164284,
       0.01698482, 0.02473466, 0.06993076, 0.02771333, 0.07788863,
       0.05116303, 0.03006447, 0.03319367, 0.06296757, 0.15074087,
       0.19216138, 0.12518175, 0.01954213, 0.15196979, 0.14909307,
       0.00818032, 0.02932281, 0.04475725, 0.15401315, 0.03431

In [10]:
# Freedom: fit a linear regression on the training inputs, then evaluate on the test inputs.
freedom_target_train = freedom_train.values.ravel()
freedom_target_test = freedom_test.values.ravel()

# fit() returns the fitted estimator, so construction and fitting can be chained
lr = linear_model.LinearRegression().fit(df_freedom_train, freedom_target_train)
freedom_predictions = lr.predict(df_freedom_test)

# Per-row absolute error; `accuracy` holds the R^2 score here, not a percentage
errors = np.abs(freedom_predictions - freedom_target_test)
accuracy = r2_score(freedom_target_test, freedom_predictions)
print('R2 score: ', accuracy)
errors

R2 score:  0.128381495546237


array([0.04391005, 0.11444813, 0.0067218 , 0.14816279, 0.15723807,
       0.07398749, 0.10629139, 0.14463995, 0.03527879, 0.10686671,
       0.09320583, 0.05231925, 0.14093744, 0.10106741, 0.08646286,
       0.21302387, 0.0349177 , 0.27608904, 0.10090493, 0.13886515,
       0.02557741, 0.19081841, 0.27391411, 0.19416589, 0.15788186,
       0.03399811, 0.04616543, 0.01245663, 0.13324256, 0.28494682,
       0.09759367, 0.02321516, 0.04845909, 0.13936189, 0.18094684,
       0.03429895, 0.01707657, 0.01050826, 0.06874868, 0.2341152 ,
       0.09260156, 0.04743143, 0.05398392, 0.09713879, 0.24609805,
       0.03702356, 0.21651059, 0.09516197, 0.25701424, 0.0199765 ,
       0.2507843 , 0.11949128, 0.24319911, 0.14879962, 0.06721618,
       0.09996776, 0.03734821, 0.03586043, 0.22243087, 0.12676382,
       0.16773219, 0.02401675, 0.41368348, 0.15597739, 0.22082985,
       0.00677993, 0.00847945, 0.14236498, 0.00978906, 0.12399604,
       0.1406299 , 0.16775231, 0.11668506, 0.04745732, 0.02942

In [11]:
# Trust: fit a linear regression on the training inputs, then evaluate on the test inputs.
trust_target_train = trust_train.values.ravel()
trust_target_test = trust_test.values.ravel()

# fit() returns the fitted estimator, so construction and fitting can be chained
lr = linear_model.LinearRegression().fit(df_trust_train, trust_target_train)
trust_predictions = lr.predict(df_trust_test)

# Per-row absolute error; `accuracy` holds the R^2 score here, not a percentage
errors = np.abs(trust_predictions - trust_target_test)
accuracy = r2_score(trust_target_test, trust_predictions)
print('R2 score: ', accuracy)
errors

R2 score:  0.12917048829864286


array([1.65578582e-01, 5.48526739e-02, 2.52880530e-02, 1.38444798e-02,
       4.93553071e-02, 4.67135115e-02, 1.07515632e-01, 7.34020544e-02,
       6.76500873e-02, 1.00463590e-01, 6.91392355e-02, 6.23862128e-02,
       1.30328829e-01, 3.43254232e-02, 7.83606003e-02, 4.12611940e-02,
       1.57495384e-02, 3.09580600e-04, 8.21122451e-02, 7.11515219e-02,
       6.39589464e-02, 2.78682446e-02, 6.01409692e-02, 3.42491078e-02,
       3.82442858e-02, 5.03217405e-02, 5.83989357e-03, 3.19351301e-02,
       6.51923833e-03, 7.02524530e-02, 9.85775415e-03, 8.05646575e-02,
       1.66859834e-01, 2.41875085e-01, 1.04229488e-02, 2.74476693e-02,
       4.81961960e-02, 2.98630166e-03, 7.83133256e-02, 2.90255421e-01,
       4.19695219e-02, 1.22828215e-02, 1.60832039e-01, 1.19765472e-01,
       2.70723927e-02, 8.37933086e-02, 4.15174957e-02, 2.55882019e-02,
       1.20247044e-01, 4.75197764e-04, 5.92466988e-02, 4.96658218e-02,
       1.96684581e-01, 1.40381927e-02, 1.57911917e-01, 1.24669981e-01,
      

In [12]:
# Generosity: fit a linear regression on the training inputs, then evaluate on the test inputs.
generosity_target_train = generosity_train.values.ravel()
generosity_target_test = generosity_test.values.ravel()

# fit() returns the fitted estimator, so construction and fitting can be chained
lr = linear_model.LinearRegression().fit(df_generosity_train, generosity_target_train)
generosity_predictions = lr.predict(df_generosity_test)

# Per-row absolute error; `accuracy` holds the R^2 score here, not a percentage
errors = np.abs(generosity_predictions - generosity_target_test)
accuracy = r2_score(generosity_target_test, generosity_predictions)
print('R2 score: ', accuracy)
errors

R2 score:  0.14415855736692418


array([0.13566622, 0.09566784, 0.1449389 , 0.06287936, 0.0695991 ,
       0.08181328, 0.17449395, 0.07719838, 0.14531315, 0.06484085,
       0.02401252, 0.01095544, 0.02427624, 0.03671088, 0.23693112,
       0.00094794, 0.14923795, 0.10622881, 0.02689519, 0.14775807,
       0.02054769, 0.06759186, 0.11342378, 0.02284789, 0.11763992,
       0.04975437, 0.07236471, 0.120045  , 0.01136864, 0.08965232,
       0.12162666, 0.10704636, 0.172167  , 0.09780706, 0.04266177,
       0.06226024, 0.09259215, 0.13733009, 0.10513926, 0.08509187,
       0.04340504, 0.04607111, 0.08068725, 0.10331419, 0.12089748,
       0.153441  , 0.0027528 , 0.00412596, 0.21640139, 0.03164364,
       0.23844623, 0.01214416, 0.04908497, 0.33205663, 0.23156275,
       0.05486638, 0.04009255, 0.07595191, 0.01049337, 0.18185113,
       0.07248753, 0.04320815, 0.34354856, 0.18984577, 0.01282558,
       0.10540206, 0.06500753, 0.08204588, 0.07913752, 0.27489884,
       0.07528919, 0.16137735, 0.15440505, 0.03805177, 0.10240

In [13]:
# Dystopia: fit a linear regression on the training inputs, then evaluate on the test inputs.
dystopia_target_train = dystopia_train.values.ravel()
dystopia_target_test = dystopia_test.values.ravel()

# fit() returns the fitted estimator, so construction and fitting can be chained
lr = linear_model.LinearRegression().fit(df_dystopia_train, dystopia_target_train)
dystopia_predictions = lr.predict(df_dystopia_test)

# Per-row absolute error; `accuracy` holds the R^2 score here, not a percentage
errors = np.abs(dystopia_predictions - dystopia_target_test)
accuracy = r2_score(dystopia_target_test, dystopia_predictions)
print('R2 score: ', accuracy)
errors

R2 score:  -1.7886043969842569


array([0.66358564, 1.11879828, 0.27904172, 0.98837491, 0.1144321 ,
       0.67186776, 0.56058376, 0.51468192, 0.79211387, 0.66852576,
       0.65833303, 0.8743168 , 0.25814989, 0.73861916, 1.31901953,
       0.46570027, 0.40980384, 1.8246546 , 0.04690603, 1.30587334,
       0.27170057, 0.80181682, 1.03856103, 0.16038014, 0.06625357,
       0.60930713, 0.17246617, 0.50460647, 0.22319799, 0.38429572,
       0.64086837, 0.97011902, 0.25275686, 0.39070354, 1.41898781,
       0.50561828, 0.32732604, 1.04720129, 0.40330574, 0.33535172,
       0.80482451, 1.04196101, 0.79113859, 0.70689709, 0.63181696,
       0.69943916, 0.24116169, 0.41382344, 1.05192786, 0.68048791,
       1.04369455, 0.45681514, 1.01735226, 1.24829191, 2.2292705 ,
       0.46844707, 0.00868708, 0.79583255, 1.05039898, 1.58743839,
       1.07315653, 0.5332363 , 0.49530208, 0.78139304, 1.26087252,
       0.55348578, 0.6088518 , 1.24127478, 0.17694338, 0.37244375,
       0.34508926, 0.4340216 , 0.83242572, 0.17198841, 0.69684

In [14]:
# Predicted Score = sum of the seven per-component predictions.
# The terms are added left to right, one array after another.
predictions = (
    economy_predictions
    + family_predictions
    + health_predictions
    + freedom_predictions
    + trust_predictions
    + generosity_predictions
    + dystopia_predictions
)

score_actual = score_test.values.ravel()
errors = np.abs(predictions - score_actual)

# First metric: R^2 of the summed prediction against the actual Score
accuracy = r2_score(score_actual, predictions)
print('R2 score: ', accuracy)

# Second metric: `accuracy` is rebound to 100 minus the mean absolute percentage error
mape = 100 * (errors / score_actual)
accuracy = 100 - mape.mean()
print('Accuracy: ', accuracy)
errors

R2 score:  0.5060831866235858
Accuracy:  86.64559011245099


array([7.41036238e-01, 1.10947504e+00, 6.72578777e-01, 1.16099945e+00,
       3.18132067e-01, 9.58200906e-01, 1.69568715e-02, 1.68130519e-01,
       9.61890596e-01, 2.22197193e-01, 6.46915446e-01, 7.62496391e-01,
       1.00377921e+00, 7.62489944e-01, 6.91904316e-01, 1.61237037e-01,
       1.77488400e-01, 1.30277123e+00, 4.92010774e-01, 7.61922603e-01,
       3.67178966e-01, 1.47477432e+00, 4.50056133e-01, 2.53350970e-01,
       1.57617195e+00, 6.14338066e-01, 1.13396821e-01, 6.62640941e-01,
       3.10612363e-01, 1.33254974e+00, 8.76416768e-01, 7.03505976e-01,
       5.59104068e-01, 5.43138177e-01, 7.91881635e-01, 4.66037404e-01,
       2.66774561e-01, 5.83774892e-01, 6.20468831e-01, 7.42745638e-01,
       1.21194066e-01, 1.19143052e+00, 1.52518328e+00, 3.78357340e-01,
       7.35713002e-01, 3.03575427e-01, 1.26947679e+00, 4.45174549e-01,
       1.03179084e+00, 1.02591233e+00, 1.48468769e+00, 5.30818051e-01,
       1.55608177e-01, 5.09263058e-01, 2.37992934e+00, 1.17865693e-01,
      

In [15]:
# Percentage of test rows whose predicted Score falls inside that row's
# [Low, High] interval (both ends inclusive).
# The bounds are compared positionally as whole arrays, so the check does not
# depend on the frames carrying index labels 0..n-1 and avoids a per-row
# Python loop with scalar .loc lookups.
rows = len(predictions)
lower_bounds = low_score['Low'].values
upper_bounds = high_score['High'].values

within_margin = (predictions >= lower_bounds) & (predictions <= upper_bounds)
tp = int(np.count_nonzero(within_margin))  # plain int, so the printed value formats as before
print('Correct margin: ', tp/rows*100)

Correct margin:  6.106870229007633
