In [1]:
import pandas as pd
import numpy as np
import altair as alt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
%run -i columns.py
%run -i helper_functions.py

In [3]:
def split_X_Y(df: pd.DataFrame, output_col: str):
    """Split a frame into a feature matrix and a target column.

    The features are the columns of ``df`` that appear in
    ``URBANIZATION_INDICATORS`` or ``ANTI_URBANIZATION_INDICATORS``; every
    other column (identifiers, other targets, ...) is ignored.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding indicator columns and the target column.
    output_col : str
        Name of the column to return as the dependent variable.

    Returns
    -------
    tuple
        ``(independent_vars, dependent_vars)``: a DataFrame of the indicator
        columns present in ``df`` and the Series ``df[output_col]``.

    Raises
    ------
    KeyError
        If ``output_col`` is not a column of ``df``.
    """
    indicator_names = set(URBANIZATION_INDICATORS + ANTI_URBANIZATION_INDICATORS)

    # Walk df.columns instead of intersecting two sets: set iteration order
    # for strings is not stable across interpreter sessions, which would
    # shuffle the feature (and coefficient) order from run to run.
    # The target is excluded so it can never be used to predict itself.
    feature_cols = [
        col for col in df.columns
        if col in indicator_names and col != output_col
    ]

    independent_vars = df[feature_cols]
    dependent_vars = df[output_col]

    return independent_vars, dependent_vars

In [4]:
def create_train_test_sets(df: pd.DataFrame, indicators: list,
                           threshold: float = 0.5, test_size: float = 0.2,
                           random_state=None):
    """Select, clean and split the modelling columns into train and test sets.

    Steps:
      1. keep the identifier/target columns plus ``indicators``;
      2. drop every column whose count of non-null values is below
         ``int(n_rows * threshold)``;
      3. fill remaining gaps in numeric columns with the column mean;
      4. split the rows with ``train_test_split``.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame; must contain 'Country', 'Country code',
        'Dystopia residual', 'Residual-to-happiness ratio' and every name in
        ``indicators``.
    indicators : list
        Names of the indicator columns to keep.
    threshold : float, default 0.5
        Fraction of rows that must be non-null for a column to be kept.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``; pass an int for a reproducible
        split. ``None`` keeps the original (unseeded) behaviour.

    Returns
    -------
    tuple
        ``(train, test)`` DataFrames.

    Notes
    -----
    The threshold filter is applied to all selected columns, so a sparse
    identifier or target column would be dropped as well.
    NOTE(review): means are computed over the full frame before splitting, so
    test rows contribute to the values imputed into the training rows.
    """
    columns = ['Country', 'Country code', 'Dystopia residual', 'Residual-to-happiness ratio'] + list(indicators)
    df = df[columns]

    df_thresh = df.dropna(axis=1, thresh=int(df.shape[0] * threshold))

    # numeric_only=True: the frame still holds the string columns 'Country'
    # and 'Country code'. Without it pandas < 2.0 emits a FutureWarning and
    # pandas >= 2.0 raises TypeError when averaging them.
    df_impute = df_thresh.fillna(df_thresh.mean(numeric_only=True))

    train, test = train_test_split(df_impute, test_size=test_size, random_state=random_state)

    return train, test

In [28]:
class LinearRegressionModel:
    """Linear regression fitted on a training frame and evaluated on a test frame.

    Attributes
    ----------
    output_label : name of the target column.
    train, test  : the frames passed to the constructor.
    X, Y         : training features / target (from ``split_X_Y``).
    x, y         : test features / target (from ``split_X_Y``).
    model        : the fitted ``LinearRegression`` estimator.
    predictions  : ``model.predict(x)``, computed once at construction.
    """

    def __init__(self, train: pd.DataFrame, test: pd.DataFrame, output_label: str):
        """Split both frames into features/target, fit on train, predict on test."""
        self.output_label = output_label
        self.train, self.test = train, test

        # Upper-case names hold the training split, lower-case the test split.
        self.X, self.Y = split_X_Y(train.copy(), output_label)
        self.x, self.y = split_X_Y(test.copy(), output_label)

        self.model = LinearRegression().fit(self.X, self.Y)
        self.predictions = self.model.predict(self.x)

    def get_predictions_table(self) -> pd.DataFrame:
        """Return the test features with predicted and actual target columns appended."""
        appended = {
            f'Predicted {self.output_label}': self.predictions,
            f'Actual {self.output_label}': self.y,
        }
        # assign() builds a new frame, so self.x itself is left unchanged.
        return self.x.assign(**appended)

    def get_coefficients_table(self) -> pd.DataFrame:
        """Return one row per training feature with its fitted coefficient."""
        names = pd.DataFrame(self.X.columns, columns=['variable'])
        return names.assign(coefficient=self.model.coef_)

    def get_intercept(self) -> float:
        """Return the fitted intercept term."""
        fitted = self.model
        return fitted.intercept_

    def get_r_squared(self) -> float:
        """Return the estimator's ``score`` on the test split."""
        features, target = self.x, self.y
        return self.model.score(features, target)

    def get_mean_squared_error(self) -> float:
        """Return the mean squared error between test targets and predictions."""
        actual, predicted = self.y, self.predictions
        return metrics.mean_squared_error(actual, predicted)

In [29]:
# Load the developing-countries dataset from the working directory.
data = pd.read_csv('tmp/developing_countries.csv')

# Dystopia residual expressed as a ratio of the happiness score it belongs to
# (a plain ratio; it is not multiplied by 100).
data['Residual-to-happiness ratio'] = data['Dystopia residual'].div(data['Happiness score'])

# Analysis of the Effects of Positive Indicators of Urbanization on the Dystopia Residual

In [30]:
# Build the train/test frames from the URBANIZATION_INDICATORS columns.
# NOTE(review): this call fixes no random seed, so the split (and every
# metric computed from it below) changes each time the cell is run.
train, test = create_train_test_sets(data.copy(), URBANIZATION_INDICATORS)

  df_impute = df_thresh.fillna(df_thresh.mean())


**Prediction of Dystopia Residual-to-Happiness Ratio Using Positive Indicators of Urbanization**

In [31]:
urbanization_residual_ratio = LinearRegressionModel(train.copy(), test.copy(), 'Residual-to-happiness ratio')

In [32]:
urbanization_residual_ratio.get_predictions_table()

Unnamed: 0,"Air transport, freight (million ton-km)",Employment in industry (% of total employment) (modeled ILO estimate),"Commercial bank branches (per 100,000 adults)",Urban population (% of total),"Manufacturing, value added (% of GDP)",Individuals using the Internet (% of population),Employment in services (% of total employment) (modeled ILO estimate),"Industry (including construction), value added per worker (constant 2010 US$)","Air transport, passengers carried",Predicted Residual-to-happiness ratio,Actual Residual-to-happiness ratio
151,0.546373,20.898001,32.516634,50.680000,18.002159,40.703049,49.785999,10116.577532,1.555240e+05,0.409712,0.448387
295,985.946746,17.142000,11.118123,58.090000,13.761283,24.571834,51.678001,6307.824070,1.814317e+07,0.452880,0.471342
429,985.946746,20.464001,11.977660,95.045000,13.242661,64.600000,70.711998,32044.454777,1.814317e+07,0.414114,0.357968
385,0.025671,29.014999,13.890841,53.500000,12.666669,34.253402,56.291000,23036.702831,1.756400e+04,0.369547,0.306636
455,0.665035,7.254000,4.424425,32.237000,11.017009,27.055488,25.673000,5866.001706,2.825390e+05,0.482579,0.412379
...,...,...,...,...,...,...,...,...,...,...,...
175,1056.044519,22.018000,16.894976,54.659000,20.163900,32.292442,47.192001,16114.509592,1.102529e+08,0.431048,0.271660
67,1.530950,29.895000,60.136678,73.990000,13.621081,56.656300,63.249001,14666.319798,1.063498e+06,0.377605,0.213350
456,985.946746,7.220000,13.890841,55.169177,12.666669,42.805461,25.580999,23036.702831,1.814317e+07,0.495924,0.367281
434,112.281730,30.171000,27.958746,50.650000,12.666669,46.791287,36.148998,4546.225803,2.383631e+06,0.399414,0.391912


In [33]:
urbanization_residual_ratio.get_coefficients_table()

Unnamed: 0,variable,coefficient
0,"Air transport, freight (million ton-km)",-2.057576e-06
1,Employment in industry (% of total employment)...,-0.003516312
2,"Commercial bank branches (per 100,000 adults)",2.326344e-06
3,Urban population (% of total),0.001922774
4,"Manufacturing, value added (% of GDP)",0.0001463437
5,Individuals using the Internet (% of population),-0.001036161
6,Employment in services (% of total employment)...,-0.001809296
7,"Industry (including construction), value added...",-8.607865e-07
8,"Air transport, passengers carried",1.023643e-10


In [34]:
urbanization_residual_ratio.get_r_squared()

0.16807238653812961

In [35]:
urbanization_residual_ratio.get_mean_squared_error()

0.007884241009609525

**Prediction of Dystopia Residual Values Using Positive Indicators of Urbanization**

In [36]:
urbanization_residual_value = LinearRegressionModel(train.copy(), test.copy(), 'Dystopia residual')

In [37]:
urbanization_residual_value.get_predictions_table()

Unnamed: 0,"Air transport, freight (million ton-km)",Employment in industry (% of total employment) (modeled ILO estimate),"Commercial bank branches (per 100,000 adults)",Urban population (% of total),"Manufacturing, value added (% of GDP)",Individuals using the Internet (% of population),Employment in services (% of total employment) (modeled ILO estimate),"Industry (including construction), value added per worker (constant 2010 US$)","Air transport, passengers carried",Predicted Dystopia residual,Actual Dystopia residual
151,0.546373,20.898001,32.516634,50.680000,18.002159,40.703049,49.785999,10116.577532,1.555240e+05,2.179257,2.893891
295,985.946746,17.142000,11.118123,58.090000,13.761283,24.571834,51.678001,6307.824070,1.814317e+07,2.192642,2.824280
429,985.946746,20.464001,11.977660,95.045000,13.242661,64.600000,70.711998,32044.454777,1.814317e+07,2.426974,2.321420
385,0.025671,29.014999,13.890841,53.500000,12.666669,34.253402,56.291000,23036.702831,1.756400e+04,1.922141,1.061574
455,0.665035,7.254000,4.424425,32.237000,11.017009,27.055488,25.673000,5866.001706,2.825390e+05,1.961975,1.597970
...,...,...,...,...,...,...,...,...,...,...,...
175,1056.044519,22.018000,16.894976,54.659000,20.163900,32.292442,47.192001,16114.509592,1.102529e+08,2.268750,1.429477
67,1.530950,29.895000,60.136678,73.990000,13.621081,56.656300,63.249001,14666.319798,1.063498e+06,2.375992,0.899910
456,985.946746,7.220000,13.890841,55.169177,12.666669,42.805461,25.580999,23036.702831,1.814317e+07,2.176198,1.356000
434,112.281730,30.171000,27.958746,50.650000,12.666669,46.791287,36.148998,4546.225803,2.383631e+06,1.980192,2.346380


In [38]:
urbanization_residual_value.get_r_squared()

-0.07323286765571546

In [39]:
# NOTE(review): this repeats the ratio model's predictions table (same call as
# the earlier ratio cell) inside the residual-value section; presumably a call
# on urbanization_residual_value was intended here. Confirm before changing.
urbanization_residual_ratio.get_predictions_table()

Unnamed: 0,"Air transport, freight (million ton-km)",Employment in industry (% of total employment) (modeled ILO estimate),"Commercial bank branches (per 100,000 adults)",Urban population (% of total),"Manufacturing, value added (% of GDP)",Individuals using the Internet (% of population),Employment in services (% of total employment) (modeled ILO estimate),"Industry (including construction), value added per worker (constant 2010 US$)","Air transport, passengers carried",Predicted Residual-to-happiness ratio,Actual Residual-to-happiness ratio
151,0.546373,20.898001,32.516634,50.680000,18.002159,40.703049,49.785999,10116.577532,1.555240e+05,0.409712,0.448387
295,985.946746,17.142000,11.118123,58.090000,13.761283,24.571834,51.678001,6307.824070,1.814317e+07,0.452880,0.471342
429,985.946746,20.464001,11.977660,95.045000,13.242661,64.600000,70.711998,32044.454777,1.814317e+07,0.414114,0.357968
385,0.025671,29.014999,13.890841,53.500000,12.666669,34.253402,56.291000,23036.702831,1.756400e+04,0.369547,0.306636
455,0.665035,7.254000,4.424425,32.237000,11.017009,27.055488,25.673000,5866.001706,2.825390e+05,0.482579,0.412379
...,...,...,...,...,...,...,...,...,...,...,...
175,1056.044519,22.018000,16.894976,54.659000,20.163900,32.292442,47.192001,16114.509592,1.102529e+08,0.431048,0.271660
67,1.530950,29.895000,60.136678,73.990000,13.621081,56.656300,63.249001,14666.319798,1.063498e+06,0.377605,0.213350
456,985.946746,7.220000,13.890841,55.169177,12.666669,42.805461,25.580999,23036.702831,1.814317e+07,0.495924,0.367281
434,112.281730,30.171000,27.958746,50.650000,12.666669,46.791287,36.148998,4546.225803,2.383631e+06,0.399414,0.391912


# Analysis of the Effects of Negative Indicators of Urbanization on the Dystopia Residual

In [40]:
# Rebuild train/test from the ANTI_URBANIZATION_INDICATORS columns. This rebinds
# `train` and `test`; the models created above were given copies and are unaffected.
# NOTE(review): this call fixes no random seed, so the split changes on each run.
train, test = create_train_test_sets(data.copy(), ANTI_URBANIZATION_INDICATORS)

  df_impute = df_thresh.fillna(df_thresh.mean())


**Prediction of Dystopia Residual-to-Happiness Ratio Using Negative Indicators of Urbanization**

In [41]:
neg_urbanization_residual_ratio = LinearRegressionModel(train.copy(), test.copy(), 'Residual-to-happiness ratio')

In [42]:
neg_urbanization_residual_ratio.get_r_squared()

0.21309374936335967

**Prediction of Dystopia Residual Values Using Negative Indicators of Urbanization**

In [43]:
neg_urbanization_residual_value = LinearRegressionModel(train.copy(), test.copy(), 'Dystopia residual')

In [44]:
neg_urbanization_residual_value.get_r_squared()

-0.007702100806088907