In [1]:
import pandas as pd
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

In [2]:
# Load the dataset from a path relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="./data/whr2017.csv")

# Preview ten randomly chosen rows.
# NOTE(review): the sample is unseeded, so the rows shown differ on every run;
# pass random_state if a reproducible preview is wanted.
df.sample(n=10)

Unnamed: 0,country,rank,score,high,low,gdp,family,lifexp,freedom,generosity,corruption,dystopia
139,Angola,140,3.795,3.951642,3.638358,0.858428,1.104412,0.049869,0.0,0.097926,0.06972,1.614482
48,Russia,49,5.963,6.030275,5.895725,1.281778,1.469282,0.547349,0.373783,0.052264,0.032963,2.205607
57,Bolivia,58,5.823,5.903977,5.742023,0.833757,1.227619,0.47363,0.558733,0.225561,0.060478,2.443279
11,Costa Rica,12,7.079,7.168112,6.989888,1.109706,1.416404,0.759509,0.580132,0.214613,0.100107,2.898639
83,Morocco,84,5.235,5.318341,5.151659,0.878115,0.774864,0.597711,0.408158,0.03221,0.087763,2.456189
42,Nicaragua,43,6.071,6.186584,5.955417,0.737299,1.287216,0.653096,0.447552,0.301674,0.130688,2.513931
131,Ukraine,132,4.096,4.18541,4.00659,0.894652,1.394538,0.575904,0.122975,0.270061,0.023029,0.814382
118,Ethiopia,119,4.46,4.542729,4.377271,0.339234,0.864669,0.35341,0.408843,0.312651,0.165456,2.015744
28,Guatemala,29,6.454,6.566874,6.341126,0.872002,1.255585,0.54024,0.531311,0.283488,0.077223,2.893891
147,Liberia,148,3.533,3.653756,3.412244,0.119042,0.872118,0.229918,0.332881,0.26655,0.038948,1.673286


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,rank,score,high,low,gdp,family,lifexp,freedom,generosity,corruption,dystopia
count,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0
mean,78.0,5.354019,5.452326,5.255713,0.984718,1.188898,0.551341,0.408786,0.246883,0.12312,1.850238
std,44.888751,1.13123,1.118542,1.14503,0.420793,0.287263,0.237073,0.149997,0.13478,0.101661,0.500028
min,1.0,2.693,2.864884,2.521116,0.0,0.0,0.0,0.0,0.0,0.0,0.377914
25%,39.5,4.5055,4.608172,4.374955,0.663371,1.042635,0.369866,0.303677,0.154106,0.057271,1.591291
50%,78.0,5.279,5.370032,5.193152,1.064578,1.253918,0.606042,0.437454,0.231538,0.089848,1.83291
75%,116.5,6.1015,6.1946,6.006527,1.318027,1.414316,0.723008,0.516561,0.323762,0.153296,2.144654
max,155.0,7.537,7.62203,7.479556,1.870766,1.610574,0.949492,0.658249,0.838075,0.464308,3.117485


In [4]:
# Print each column's name, non-null count and dtype, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   country     155 non-null    object 
 1   rank        155 non-null    int64  
 2   score       155 non-null    float64
 3   high        155 non-null    float64
 4   low         155 non-null    float64
 5   gdp         155 non-null    float64
 6   family      155 non-null    float64
 7   lifexp      155 non-null    float64
 8   freedom     155 non-null    float64
 9   generosity  155 non-null    float64
 10  corruption  155 non-null    float64
 11  dystopia    155 non-null    float64
dtypes: float64(10), int64(1), object(1)
memory usage: 14.7+ KB


In [6]:
# Feature matrix = the seven component columns; target = the overall score.
# NOTE(review): in the rows displayed earlier these seven columns appear to add
# up to `score` (up to rounding) - confirm that predicting a sum from its own
# parts is the intended exercise.
feature_columns = [
    "gdp",
    "family",
    "lifexp",
    "freedom",
    "generosity",
    "corruption",
    "dystopia",
]
X = df[feature_columns]
y = df["score"]

# Sanity-check the dimensions of the feature matrix and the target vector.
print(X.shape)
print(y.shape)

(155, 7)
(155,)


In [7]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [8]:
# Unregularised linear regression fitted on the training split (baseline model).
model_linear = LinearRegression().fit(X=X_train, y=y_train)

In [9]:
# Test-set predictions from the plain linear model.
y_predict_linear = model_linear.predict(X=X_test)

# Lasso regression with alpha=0.2: fit on the training split, then predict the
# test split. `fit` returns the estimator itself, so fitting in place is
# equivalent to chaining it onto the constructor.
modelLasso = Lasso(alpha=0.2)
modelLasso.fit(X=X_train, y=y_train)
y_predict_lasso = modelLasso.predict(X=X_test)

In [10]:
# Ridge regression with alpha=1: fit on the training split, then predict the
# test split. `fit` returns the estimator itself, so fitting in place is
# equivalent to chaining it onto the constructor.
modelRidge = Ridge(alpha=1)
modelRidge.fit(X=X_train, y=y_train)
y_predict_ridge = modelRidge.predict(X=X_test)

In [11]:
# Mean absolute error of each model's predictions on the held-out test split.
linear_loss = mean_absolute_error(y_true=y_test, y_pred=y_predict_linear)
lasso_loss = mean_absolute_error(y_true=y_test, y_pred=y_predict_lasso)
ridge_loss = mean_absolute_error(y_true=y_test, y_pred=y_predict_ridge)

# Report the three errors side by side for comparison.
print("Linear loss: ", linear_loss)
print("Lasso loss: ", lasso_loss)
print("Ridge loss: ", ridge_loss)

Linear loss:  0.0002783762217941917
Lasso loss:  0.5559475730523403
Ridge loss:  0.062347839959471005


In [12]:
# Fitted coefficients of the two regularised models, one value per feature
# column in the order the columns were passed to `fit`.
lasso_coefficients = modelLasso.coef_
ridge_coefficients = modelRidge.coef_

print("Coef Lasso: ", lasso_coefficients)
print("Coef Ridge: ", ridge_coefficients)

Coef Lasso:  [1.04336835 0.         0.         0.         0.         0.
 0.3165654 ]
Coef Ridge:  [1.07234856 0.97048582 0.85605399 0.87400159 0.73285696 0.68583271
 0.96206567]
