In [1]:
import pandas as pd
import numpy as np
import altair as alt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
%run -i columns.py
%run -i helper_functions.py

In [3]:
df = pd.read_csv('tmp/developing_countries.csv')

df['Residual-to-happiness ratio'] = df['Dystopia residual'] / df['Happiness score']

In [4]:
columns = ['Country', 'Country code', 'Dystopia residual', 'Residual-to-happiness ratio'] + URBANIZATION_INDICATORS + ANTI_URBANIZATION_INDICATORS

df = df[columns] 
df

Unnamed: 0,Country,Country code,Dystopia residual,Residual-to-happiness ratio,"Air transport, freight (million ton-km)","Air transport, passengers carried","Annual freshwater withdrawals, industry (% of total freshwater withdrawal)",CO2 emissions (metric tons per capita),"Commercial bank branches (per 100,000 adults)",Employment in industry (% of total employment) (modeled ILO estimate),...,"Railways, goods transported (million ton-km)","Railways, passengers carried (million passenger-km)",Urban population (% of total),Individuals using the Internet (% of population),"Educational attainment, competed at least Bachelor's or equivalent, population 25+, total (%) (cumulative)","Educational attainment, competed at least Master's or equivalent, population 25+, total (%) (cumulative)","Agriculture, forestry, and fishing, value added per worker (constant 2010 US$)","Annual freshwater withdrawals, agriculture (% of total freshwater withdrawal)",Employment in agriculture (% of total employment) (modeled ILO estimate),Forest area (% of land area)
0,Afghanistan,AFG,1.952100,0.546042,33.102039,1929907.0,,,2.190994,17.646999,...,,,24.803,8.260000,,,921.359625,,38.678001,2.067825
1,Afghanistan,AFG,2.145580,0.638565,29.010881,1917924.0,,,2.138151,17.424999,...,,,25.020,10.595726,,,930.799951,,38.818001,2.067825
2,Afghanistan,AFG,2.150801,0.566895,25.144211,1858558.0,,,2.138151,17.457001,...,,,25.250,11.447688,,,928.592663,,38.773998,
3,Afghanistan,AFG,2.196000,0.604626,,,,,,17.596001,...,,,,,,,,,38.588001,
4,Albania,ALB,1.898940,0.382928,,,,,21.851423,18.617001,...,,,57.434,63.252933,,,5378.217225,,41.362999,28.156934
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
452,Zambia,ZMB,1.667000,0.380854,,,,,,10.683000,...,,,,,,,,,53.916000,
453,Zimbabwe,ZWE,2.441910,0.529698,0.796107,370165.0,,,5.165438,7.290000,...,,,32.385,22.742818,,,294.738272,,67.121002,36.350006
454,Zimbabwe,ZWE,2.442700,0.582566,0.804947,378803.0,,,5.260135,7.218000,...,,,32.296,23.119989,,,275.073784,,67.192001,35.542457
455,Zimbabwe,ZWE,1.597970,0.412379,0.665035,282539.0,,,4.424425,7.254000,...,,,32.237,27.055488,,,294.061198,,67.072998,


In [5]:
threshold = 0.5
df_thresh = df.dropna(axis=1, thresh=int(df.shape[0] * threshold))

df_impute = df_thresh.fillna(df_thresh.mean())
df_impute

  df_impute = df_thresh.fillna(df_thresh.mean())


Unnamed: 0,Country,Country code,Dystopia residual,Residual-to-happiness ratio,"Air transport, freight (million ton-km)","Air transport, passengers carried","Commercial bank branches (per 100,000 adults)",Employment in industry (% of total employment) (modeled ILO estimate),Employment in services (% of total employment) (modeled ILO estimate),"Industry (including construction), value added per worker (constant 2010 US$)","Manufacturing, value added (% of GDP)",Urban population (% of total),Individuals using the Internet (% of population),"Agriculture, forestry, and fishing, value added per worker (constant 2010 US$)",Employment in agriculture (% of total employment) (modeled ILO estimate)
0,Afghanistan,AFG,1.952100,0.546042,33.102039,1.929907e+06,2.190994,17.646999,43.674999,2079.555916,11.420006,24.803000,8.260000,921.359625,38.678001
1,Afghanistan,AFG,2.145580,0.638565,29.010881,1.917924e+06,2.138151,17.424999,43.756001,1999.566153,11.370465,25.020000,10.595726,930.799951,38.818001
2,Afghanistan,AFG,2.150801,0.566895,25.144211,1.858558e+06,2.138151,17.457001,43.768002,1923.393447,11.102526,25.250000,11.447688,928.592663,38.773998
3,Afghanistan,AFG,2.196000,0.604626,985.946746,1.814317e+07,13.890841,17.596001,43.816002,23036.702831,12.666669,55.169177,42.805461,20761.228907,38.588001
4,Albania,ALB,1.898940,0.382928,985.946746,1.814317e+07,21.851423,18.617001,40.020000,13648.254549,5.671519,57.434000,63.252933,5378.217225,41.362999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
452,Zambia,ZMB,1.667000,0.380854,985.946746,1.814317e+07,13.890841,10.683000,35.401001,23036.702831,12.666669,55.169177,42.805461,20761.228907,53.916000
453,Zimbabwe,ZWE,2.441910,0.529698,0.796107,3.701650e+05,5.165438,7.290000,25.589001,5962.002633,11.888599,32.385000,22.742818,294.738272,67.121002
454,Zimbabwe,ZWE,2.442700,0.582566,0.804947,3.788030e+05,5.260135,7.218000,25.590000,5930.642813,11.596020,32.296000,23.119989,275.073784,67.192001
455,Zimbabwe,ZWE,1.597970,0.412379,0.665035,2.825390e+05,4.424425,7.254000,25.673000,5866.001706,11.017009,32.237000,27.055488,294.061198,67.072998


In [6]:
train, test = train_test_split(df_impute, test_size=0.2)

In [7]:
def split_X_Y(df: pd.DataFrame, output_col: str):
    cols = set(df.columns)
    independent_cols = set(URBANIZATION_INDICATORS + ANTI_URBANIZATION_INDICATORS)
    mutual = cols.intersection(independent_cols)
    independent_vars = df[list(mutual)]

    dependent_cols = list(cols - independent_cols)
    dependent_vars = df[output_col]

    return independent_vars, dependent_vars

**First Model**

In [8]:
X, Y = split_X_Y(train, 'Residual-to-happiness ratio')
x, y = split_X_Y(test, 'Residual-to-happiness ratio')

In [9]:
model = LinearRegression()
model.fit(X, Y)

LinearRegression()

In [10]:
predictions = model.predict(x)

print(predictions)
print(y)

[0.37159091 0.39010878 0.45471947 0.38476213 0.41843284 0.43691599
 0.38846796 0.39756472 0.45380724 0.43265947 0.41898203 0.4847718
 0.4374836  0.50788341 0.22615218 0.39518922 0.38166993 0.39694178
 0.47790871 0.38955828 0.4823943  0.41377409 0.34429139 0.35703576
 0.4462481  0.39805272 0.42239325 0.49800402 0.29440753 0.4921976
 0.51600373 0.41371737 0.38418351 0.46570829 0.47898986 0.43847495
 0.35236644 0.43650207 0.41903023 0.35197464 0.37424277 0.35743874
 0.35649193 0.40938872 0.29426472 0.41771817 0.43627898 0.40880066
 0.44125118 0.3538456  0.43118917 0.4311105  0.43503491 0.43676071
 0.52019688 0.37539839 0.40482687 0.38713211 0.37763509 0.37487907
 0.46332872 0.35826119 0.42482568 0.33962914 0.40621945 0.39346927
 0.43528451 0.36025377 0.33500014 0.39877261 0.48240259 0.50400558
 0.45515955 0.44039939 0.40490134 0.45987166 0.37318918 0.39123828
 0.42710821 0.50728496 0.44467058 0.40095948 0.46577715 0.39884692
 0.37878287 0.50870637 0.49075734 0.30440686 0.45961993 0.395562

In [11]:
coefficients = pd.DataFrame(X.columns, columns=['variable'])
coefficients['coefficient'] = model.coef_

coefficients

Unnamed: 0,variable,coefficient
0,"Industry (including construction), value added...",-6.707341e-07
1,"Air transport, passengers carried",2.615932e-11
2,"Agriculture, forestry, and fishing, value adde...",1.566931e-08
3,Employment in industry (% of total employment)...,-16.97379
4,"Air transport, freight (million ton-km)",1.042684e-06
5,Urban population (% of total),0.001659291
6,"Manufacturing, value added (% of GDP)",0.0006856087
7,Employment in services (% of total employment)...,-16.97174
8,"Commercial bank branches (per 100,000 adults)",0.000330243
9,Employment in agriculture (% of total employme...,-16.97025


In [12]:
model.intercept_

1697.5345955468952

In [13]:
model.score(x, y)

0.2462734484108534

In [14]:
metrics.mean_squared_error(y, predictions)

0.010797915165883697

**Second Model**

In [15]:
X, Y = split_X_Y(train, 'Dystopia residual')
x, y = split_X_Y(test, 'Dystopia residual')

model = LinearRegression()
model.fit(X, Y)

predictions = model.predict(x)

print(predictions)
print(y)

[2.08076541 1.84124556 1.98230096 2.37715069 2.08163223 2.26229386
 1.97963752 2.02362916 2.22985515 1.89191257 1.85027635 2.08731941
 1.89636572 2.15198635 1.36831343 2.00330595 1.91305615 2.37748812
 1.93301551 2.24673847 2.54053494 2.35562679 1.92122315 2.01727268
 2.18345124 1.9993568  2.41893877 2.05727659 1.8384279  2.12626727
 1.807436   2.16825482 2.03013118 2.11125779 2.17007184 1.89824758
 2.01665034 1.91948329 1.99399664 2.02835254 2.00059912 2.0700518
 1.79491669 2.32085759 1.83036536 2.21499023 1.92262054 1.91931872
 2.07312716 1.91569225 1.89003569 1.91869498 2.20766851 2.28124028
 1.96328507 2.32631909 2.04379666 1.98284456 1.59653872 1.98191828
 2.0098057  1.98249279 2.24470047 1.86438378 2.3924925  2.13345533
 2.09315477 1.8332863  1.8471535  2.11800166 2.10085369 1.78094348
 2.10807781 2.08133295 2.28112762 1.89971137 1.89544287 2.08978584
 1.86023705 1.8043269  2.23138954 1.89410685 1.90986462 2.0621292
 2.15309928 1.99881274 1.83390737 1.90470715 2.10499357 2.330009

In [16]:
coefficients = pd.DataFrame(X.columns, columns=['variable'])
coefficients['coefficient'] = model.coef_

coefficients

Unnamed: 0,variable,coefficient
0,"Industry (including construction), value added...",-3.633923e-06
1,"Air transport, passengers carried",9.152041e-10
2,"Agriculture, forestry, and fishing, value adde...",1.792602e-07
3,Employment in industry (% of total employment)...,-72.26738
4,"Air transport, freight (million ton-km)",-6.497597e-06
5,Urban population (% of total),0.009630363
6,"Manufacturing, value added (% of GDP)",0.01020685
7,Employment in services (% of total employment)...,-72.25247
8,"Commercial bank branches (per 100,000 adults)",0.005574521
9,Employment in agriculture (% of total employme...,-72.255


In [17]:
model.intercept_

7227.068158899305

In [18]:
model.score(x, y)

0.020127820817355424

In [19]:
metrics.mean_squared_error(y, predictions)

0.38501317695946613