# Linear Regression

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import r2_score

In [2]:
# Load the pickled dataset that the rest of the notebook splits by 'Year' and
# reads 'Score', 'Low', 'High' and the per-factor columns from.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a trusted source.
df = pd.read_pickle('data/happiness_data.pkl')

In [3]:
# Split on the 'Year' column: rows from 2015-2016 are used for fitting,
# rows from 2017 for evaluation.
years_train = [2015, 2016]
years_test = [2017]

# Select each split and renumber its rows from 0. reset_index returns a new
# frame here instead of mutating the selection in place, so re-running the cell
# always starts from `df` and yields the same result. The 0-based labels matter:
# the margin check at the end of the notebook looks rows up by label with .loc.
train_features = df.loc[df['Year'].isin(years_train)].reset_index(drop=True)
test_features = df.loc[df['Year'].isin(years_test)].reset_index(drop=True)

In [4]:
# Columns stripped from both splits to build df_train / df_test. Kept in one
# list so the train and test frames can never drift apart.
columns_to_drop = ['Country', 'Year', 'Score', 'Low', 'High', 'Economy', 'Family', 'Health', 'Freedom', 'Trust', 'Generosity', 'Dystopia']

# `columns=` already names the axis being dropped from, so the redundant
# `axis=1` argument is omitted.
df_train = train_features.drop(columns=columns_to_drop)
df_test = test_features.drop(columns=columns_to_drop)

In [5]:
# Load the per-factor tables that the regression cells below pass to
# LinearRegression.fit / predict as inputs (one train and one test table per factor).
# NOTE(review): the contents of these pickles are not visible in this notebook;
# presumably they were produced by an earlier feature-building step -- confirm.
# Only unpickle files from a trusted source.
(
    df_economy_train,
    df_family_train,
    df_health_train,
    df_freedom_train,
    df_trust_train,
    df_generosity_train,
    df_dystopia_train,
) = (
    pd.read_pickle(train_path)
    for train_path in (
        'data/economy_train.pkl',
        'data/family_train.pkl',
        'data/health_train.pkl',
        'data/freedom_train.pkl',
        'data/trust_train.pkl',
        'data/generosity_train.pkl',
        'data/dystopia_train.pkl',
    )
)

(
    df_economy_test,
    df_family_test,
    df_health_test,
    df_freedom_test,
    df_trust_test,
    df_generosity_test,
    df_dystopia_test,
) = (
    pd.read_pickle(test_path)
    for test_path in (
        'data/economy_test.pkl',
        'data/family_test.pkl',
        'data/health_test.pkl',
        'data/freedom_test.pkl',
        'data/trust_test.pkl',
        'data/generosity_test.pkl',
        'data/dystopia_test.pkl',
    )
)

In [6]:
# Pull each target column out of the train/test splits as its own
# single-column DataFrame (column name preserved).

# Training split: overall score plus the seven per-factor targets.
score_train = train_features['Score'].to_frame()
economy_train = train_features['Economy'].to_frame()
family_train = train_features['Family'].to_frame()
health_train = train_features['Health'].to_frame()
freedom_train = train_features['Freedom'].to_frame()
trust_train = train_features['Trust'].to_frame()
generosity_train = train_features['Generosity'].to_frame()
dystopia_train = train_features['Dystopia'].to_frame()

# Test split: same columns, used as ground truth when scoring predictions.
score_test = test_features['Score'].to_frame()
economy_test = test_features['Economy'].to_frame()
family_test = test_features['Family'].to_frame()
health_test = test_features['Health'].to_frame()
freedom_test = test_features['Freedom'].to_frame()
trust_test = test_features['Trust'].to_frame()
generosity_test = test_features['Generosity'].to_frame()
dystopia_test = test_features['Dystopia'].to_frame()

# 'Low' / 'High' columns of the test split; the final cell checks whether each
# predicted score falls between them.
low_score = test_features['Low'].to_frame()
high_score = test_features['High'].to_frame()

In [7]:
# Economy: fit a linear regression on the economy feature table, then score it
# on the held-out test table.
economy_target_train = economy_train.values.ravel()  # flatten (n, 1) frame to 1-D
economy_target_test = economy_test.values.ravel()

lr = linear_model.LinearRegression()
lr.fit(df_economy_train, economy_target_train)
economy_predictions = lr.predict(df_economy_test)

# `accuracy` holds the R2 score (coefficient of determination) of the test predictions.
accuracy = r2_score(economy_target_test, economy_predictions)
# Per-row absolute error, displayed as the cell output.
errors = abs(economy_predictions - economy_target_test)
print('R2 score: ', accuracy)
errors

R2 score:  0.7757196637300185


array([0.11508091, 0.0566815 , 0.04460663, 0.07906117, 0.06049981,
       0.06894523, 0.03450263, 0.0724066 , 0.10572364, 0.27504105,
       0.26433636, 0.25867558, 0.25940505, 0.04241283, 0.03903602,
       0.19294306, 0.0227607 , 0.08753366, 0.2837745 , 0.12685154,
       0.08246193, 0.4179543 , 0.07897435, 0.12296542, 0.38077574,
       0.24610566, 0.01259532, 0.43222962, 0.09684152, 0.1617135 ,
       0.20656241, 0.00822341, 0.14541038, 0.144284  , 0.228473  ,
       0.126298  , 0.01305919, 0.0091733 , 0.52680178, 0.02835293,
       0.06531478, 0.16755938, 0.32450911, 0.19351269, 0.25383744,
       0.09388859, 0.27896585, 0.11896044, 0.07644626, 0.2827166 ,
       0.20815629, 0.24387854, 0.06144475, 0.0549592 , 0.09116221,
       0.02546406, 0.02924708, 0.03622096, 0.09414422, 0.31097178,
       0.24397749, 0.08126087, 0.15671542, 0.10084692, 0.01907614,
       0.09429327, 0.03257104, 0.04010145, 0.42172312, 0.08505269,
       0.10841169, 0.22405716, 0.20921015, 0.17920947, 0.10170

In [8]:
# Family: fit a linear regression on the family feature table, then score it
# on the held-out test table.
family_target_train = family_train.values.ravel()  # flatten (n, 1) frame to 1-D
family_target_test = family_test.values.ravel()

lr = linear_model.LinearRegression()
lr.fit(df_family_train, family_target_train)
family_predictions = lr.predict(df_family_test)

# `accuracy` holds the R2 score (coefficient of determination) of the test predictions.
accuracy = r2_score(family_target_test, family_predictions)
# Per-row absolute error, displayed as the cell output.
errors = abs(family_predictions - family_target_test)
print('R2 score: ', accuracy)
errors

R2 score:  -0.09689154472440009


array([0.10760838, 0.30622206, 0.08225006, 0.33495305, 0.37084426,
       0.01049055, 0.30148427, 0.22411104, 0.06087513, 0.11954008,
       0.16155102, 0.23097784, 0.20908552, 0.22210645, 0.37841084,
       0.36937876, 0.08334585, 0.32459256, 0.36082638, 0.25658279,
       0.28623124, 0.2052246 , 0.07518379, 0.16382529, 0.30257977,
       0.20177721, 0.07013785, 0.01960357, 0.33340778, 0.32473008,
       0.1886832 , 0.02143237, 0.19441381, 0.32874595, 0.53019078,
       0.17829248, 0.13619732, 0.21021246, 0.16005063, 0.33884239,
       0.14703162, 0.16530201, 0.51371567, 0.23994345, 0.11935006,
       0.10924678, 0.27435499, 0.0298326 , 0.10770801, 0.11902217,
       0.13691728, 0.33141574, 0.09213059, 0.22209338, 0.06315646,
       0.27505947, 0.17327707, 0.24592809, 0.33515621, 0.22418632,
       0.18420422, 0.2171397 , 0.26190782, 0.10641944, 0.01443872,
       0.28358617, 0.08038175, 0.28254497, 0.64973008, 0.29063343,
       0.1409688 , 0.01998147, 0.30247101, 0.07458985, 0.64532

In [9]:
# Health: fit a linear regression on the health feature table, then score it
# on the held-out test table.
health_target_train = health_train.values.ravel()  # flatten (n, 1) frame to 1-D
health_target_test = health_test.values.ravel()

lr = linear_model.LinearRegression()
lr.fit(df_health_train, health_target_train)
health_predictions = lr.predict(df_health_test)

# `accuracy` holds the R2 score (coefficient of determination) of the test predictions.
accuracy = r2_score(health_target_test, health_predictions)
# Per-row absolute error, displayed as the cell output.
errors = abs(health_predictions - health_target_test)
print('R2 score: ', accuracy)
errors

R2 score:  0.8543852946016751


array([1.56314808e-02, 2.44326889e-02, 1.96322372e-02, 1.22834311e-01,
       4.71923014e-03, 4.53208589e-02, 6.33455613e-02, 4.81165359e-02,
       5.61271592e-02, 1.00819408e-01, 2.71125019e-02, 1.10984014e-01,
       4.44713489e-02, 1.31884591e-01, 3.04283373e-02, 6.85419892e-03,
       1.23640705e-02, 1.05026464e-01, 5.39193430e-02, 4.70190869e-02,
       1.30048428e-02, 1.82093936e-01, 1.48789858e-01, 8.22609334e-02,
       5.24517231e-02, 8.30164493e-02, 3.99249957e-02, 6.87456742e-02,
       6.31178563e-02, 4.50002864e-02, 4.23935928e-02, 7.00769366e-02,
       4.71678586e-02, 1.78635138e-03, 9.54443617e-04, 1.47912037e-02,
       8.75395421e-02, 1.30110584e-01, 2.06364701e-02, 4.58413996e-02,
       6.90920586e-02, 7.36479500e-02, 6.10808717e-02, 5.14409866e-04,
       1.01553288e-01, 3.63819443e-02, 7.08185531e-02, 5.42469725e-02,
       8.39492777e-03, 5.75578210e-02, 8.33082366e-02, 2.00274908e-02,
       8.92637843e-04, 1.22459832e-01, 4.79890052e-02, 4.64422457e-03,
      

In [10]:
# Freedom: fit a linear regression on the freedom feature table, then score it
# on the held-out test table.
freedom_target_train = freedom_train.values.ravel()  # flatten (n, 1) frame to 1-D
freedom_target_test = freedom_test.values.ravel()

lr = linear_model.LinearRegression()
lr.fit(df_freedom_train, freedom_target_train)
freedom_predictions = lr.predict(df_freedom_test)

# `accuracy` holds the R2 score (coefficient of determination) of the test predictions.
accuracy = r2_score(freedom_target_test, freedom_predictions)
# Per-row absolute error, displayed as the cell output.
errors = abs(freedom_predictions - freedom_target_test)
print('R2 score: ', accuracy)
errors

R2 score:  0.25376855845231483


array([0.06045957, 0.01635336, 0.06715178, 0.22637094, 0.11398269,
       0.09305501, 0.12893615, 0.07398153, 0.07385893, 0.02969575,
       0.01749424, 0.13693013, 0.11538364, 0.09193158, 0.03432712,
       0.13303768, 0.10365447, 0.18944188, 0.09276518, 0.15151565,
       0.06253456, 0.23143793, 0.2404751 , 0.12381725, 0.20720485,
       0.07789776, 0.12811945, 0.05677043, 0.0794816 , 0.22006837,
       0.13986366, 0.03675535, 0.03525319, 0.13729393, 0.12383654,
       0.01352629, 0.0216843 , 0.01685474, 0.03949568, 0.18231179,
       0.05268209, 0.01085659, 0.06172807, 0.0735134 , 0.21733236,
       0.14185056, 0.10802708, 0.02147169, 0.36648888, 0.12541721,
       0.33961568, 0.06193261, 0.00916729, 0.02275224, 0.13333105,
       0.05158009, 0.08214893, 0.14060436, 0.08913246, 0.01749591,
       0.01873944, 0.00974024, 0.32497953, 0.01055797, 0.05766584,
       0.09000262, 0.02932742, 0.10660392, 0.09017941, 0.24066316,
       0.14504035, 0.16958563, 0.058009  , 0.12013547, 0.07635

In [11]:
# Trust: fit a linear regression on the trust feature table, then score it
# on the held-out test table.
trust_target_train = trust_train.values.ravel()  # flatten (n, 1) frame to 1-D
trust_target_test = trust_test.values.ravel()

lr = linear_model.LinearRegression()
lr.fit(df_trust_train, trust_target_train)
trust_predictions = lr.predict(df_trust_test)

# `accuracy` holds the R2 score (coefficient of determination) of the test predictions.
accuracy = r2_score(trust_target_test, trust_predictions)
# Per-row absolute error, displayed as the cell output.
errors = abs(trust_predictions - trust_target_test)
print('R2 score: ', accuracy)
errors

R2 score:  0.2562066409953969


array([0.01358488, 0.02632671, 0.02106539, 0.07857574, 0.05987267,
       0.06764476, 0.12459797, 0.03244395, 0.06915694, 0.01023142,
       0.03778273, 0.08875948, 0.10677674, 0.03049994, 0.01044977,
       0.02093475, 0.05094673, 0.1247421 , 0.04670786, 0.07124717,
       0.00517976, 0.01107576, 0.05199686, 0.06119486, 0.16056828,
       0.06016517, 0.08463935, 0.06614398, 0.10031544, 0.01217274,
       0.07336383, 0.09638334, 0.16663092, 0.18004734, 0.08013059,
       0.03119768, 0.06754102, 0.00277062, 0.01839477, 0.2394535 ,
       0.00871897, 0.17157774, 0.10819929, 0.06399891, 0.0877122 ,
       0.01025895, 0.02027845, 0.06858541, 0.01271205, 0.03322388,
       0.11505505, 0.00034863, 0.09946617, 0.13160367, 0.04977749,
       0.03447238, 0.05767461, 0.08748711, 0.01049936, 0.05870291,
       0.04988039, 0.10326013, 0.05869572, 0.06088593, 0.05072289,
       0.03400385, 0.23307126, 0.01548918, 0.01496185, 0.08067893,
       0.01061508, 0.14647586, 0.06574907, 0.15060882, 0.01275

In [12]:
# Generosity: fit a linear regression on the generosity feature table, then
# score it on the held-out test table.
generosity_target_train = generosity_train.values.ravel()  # flatten (n, 1) frame to 1-D
generosity_target_test = generosity_test.values.ravel()

lr = linear_model.LinearRegression()
lr.fit(df_generosity_train, generosity_target_train)
generosity_predictions = lr.predict(df_generosity_test)

# `accuracy` holds the R2 score (coefficient of determination) of the test predictions.
accuracy = r2_score(generosity_target_test, generosity_predictions)
# Per-row absolute error, displayed as the cell output.
errors = abs(generosity_predictions - generosity_target_test)
print('R2 score: ', accuracy)
errors

R2 score:  0.11032293421292771


array([0.15065629, 0.00307658, 0.14535256, 0.09423806, 0.08203847,
       0.06090423, 0.22160141, 0.06753393, 0.21354295, 0.1229688 ,
       0.09411671, 0.0363078 , 0.00653352, 0.11740193, 0.14819406,
       0.05271548, 0.11482983, 0.16784316, 0.01187722, 0.13196062,
       0.02451961, 0.05613494, 0.10864438, 0.00344559, 0.18229186,
       0.01948577, 0.05447983, 0.25357757, 0.12583285, 0.00595323,
       0.05756471, 0.07649755, 0.19780595, 0.07613485, 0.07651385,
       0.11334026, 0.17625772, 0.20206419, 0.03206416, 0.01853784,
       0.12411514, 0.13180587, 0.08600808, 0.06851495, 0.12293759,
       0.13530439, 0.013391  , 0.03330976, 0.16993039, 0.03548237,
       0.23051373, 0.11274514, 0.07881   , 0.33154984, 0.06374954,
       0.13059918, 0.08341009, 0.02573927, 0.03932445, 0.13638031,
       0.04855981, 0.06755375, 0.31496801, 0.17326583, 0.00354384,
       0.02275544, 0.06187538, 0.087389  , 0.07405569, 0.26653372,
       0.00437869, 0.09271679, 0.06522904, 0.09498283, 0.11509

In [13]:
# Dystopia: fit a linear regression on the dystopia feature table, then score
# it on the held-out test table.
dystopia_target_train = dystopia_train.values.ravel()  # flatten (n, 1) frame to 1-D
dystopia_target_test = dystopia_test.values.ravel()

lr = linear_model.LinearRegression()
lr.fit(df_dystopia_train, dystopia_target_train)
dystopia_predictions = lr.predict(df_dystopia_test)

# `accuracy` holds the R2 score (coefficient of determination) of the test predictions.
accuracy = r2_score(dystopia_target_test, dystopia_predictions)
# Per-row absolute error, displayed as the cell output.
errors = abs(dystopia_predictions - dystopia_target_test)
print('R2 score: ', accuracy)
errors

R2 score:  -0.440846356133868


array([0.1707121 , 0.67971817, 0.31836648, 0.59244134, 0.24631792,
       0.69288623, 0.63189538, 0.06133437, 0.46557958, 0.49572894,
       0.19544509, 0.44448934, 0.1560283 , 0.17297605, 0.94754328,
       0.04041504, 0.25658456, 1.85487168, 0.00772225, 1.00956311,
       0.36082371, 0.34476405, 0.90890286, 0.19783334, 0.21330874,
       0.08529594, 0.28985401, 0.92929615, 0.27165533, 0.57333252,
       0.44961997, 0.60467401, 0.319217  , 0.1513045 , 0.91159637,
       0.06331004, 0.52679135, 0.70660165, 0.38427994, 0.11341296,
       0.26872727, 1.00187194, 0.77219292, 0.13067416, 0.69825214,
       0.57089202, 0.78135759, 0.32072104, 0.4170693 , 0.08462013,
       0.45458595, 0.24221405, 0.80512063, 0.85714727, 0.89400406,
       0.20262678, 0.51335846, 0.49402163, 0.60857541, 0.83591949,
       0.3006196 , 0.49989264, 0.65040494, 0.24299328, 0.63045298,
       0.29125956, 0.64864918, 0.56681976, 0.55719362, 0.04593486,
       0.06409514, 0.37296064, 0.16197559, 0.02129884, 0.44447

In [14]:
# Predicted overall score = sum of the seven per-factor predictions made above.
predictions = (
    economy_predictions
    + family_predictions
    + health_predictions
    + freedom_predictions
    + trust_predictions
    + generosity_predictions
    + dystopia_predictions
)

# Ground-truth scores of the test split as a flat 1-D array.
actual_scores = score_test.values.ravel()
errors = abs(predictions - actual_scores)

# First report: R2 of the summed prediction against the actual score.
accuracy = r2_score(actual_scores, predictions)
print('R2 score: ', accuracy)

# Second report: `accuracy` is reassigned to 100 minus the mean absolute
# percentage error of the summed prediction.
mape = 100 * (errors / actual_scores)
accuracy = 100 - np.mean(mape)
print('Accuracy: ', accuracy)
errors

R2 score:  0.5713326044845852
Accuracy:  87.99292933557405


array([0.11698117, 0.91424298, 0.23114692, 0.70028413, 0.65484758,
       1.01783566, 0.17352242, 0.45708332, 0.92278604, 0.28527168,
       0.63318169, 0.6679514 , 0.884604  , 0.276542  , 0.4452049 ,
       0.28303607, 0.41520129, 1.65136162, 0.14263129, 1.02752755,
       0.11133982, 1.44822053, 0.76449312, 0.21575413, 1.07217581,
       0.20443591, 0.25451235, 1.82611335, 0.29836498, 0.99508837,
       1.0431203 , 0.60431643, 0.07843657, 1.01618929, 0.18441842,
       0.03092177, 0.43177523, 0.8339733 , 0.64044566, 0.90984382,
       0.04961073, 1.37079018, 1.71103353, 0.50964695, 0.68141658,
       0.61913192, 1.37903388, 0.58703863, 0.80209389, 0.49950854,
       1.29458765, 1.01285853, 0.94620535, 0.63543009, 1.06111015,
       0.30995818, 0.72622763, 0.38688359, 0.27061679, 0.31262044,
       0.09082685, 0.45047102, 0.06540954, 0.23292668, 0.63343826,
       0.37505618, 0.70202613, 0.12251604, 0.38486754, 0.43689187,
       0.39852977, 1.05646122, 0.4708034 , 0.07971428, 0.25564

In [15]:
# Share of test rows whose predicted score lies inside the row's
# ['Low', 'High'] interval (both ends inclusive), as a percentage.
rows = len(predictions)

# Rows are paired by position; this matches label-based lookup because the
# test split's index was reset to 0..n-1 when the split was created.
tp = sum(
    1
    for predicted, lower, upper in zip(predictions, low_score['Low'], high_score['High'])
    if lower <= predicted <= upper
)
print('Correct margin: ', tp/rows*100)

Correct margin:  6.870229007633588
