In [1]:
import pandas as pd
import numpy as np
import altair as alt

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn import metrics

In [2]:
# Execute the support scripts inside this notebook's namespace (-i), so the
# names they define become globals here.
# NOTE(review): columns.py presumably defines URBANIZATION_INDICATORS and
# ANTI_URBANIZATION_INDICATORS, which later cells rely on — confirm.
%run -i columns.py
%run -i helper_functions.py

In [3]:
def split_X_Y(df: pd.DataFrame, output_col: str) -> tuple:
    """Split a frame into indicator (input) columns and one output column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding both the indicator columns and the output column.
    output_col : str
        Name of the column to return as the dependent variable.

    Returns
    -------
    tuple
        ``(independent_vars, dependent_vars)``: a DataFrame with every column
        of ``df`` that is listed in ``URBANIZATION_INDICATORS`` or
        ``ANTI_URBANIZATION_INDICATORS``, and the ``output_col`` Series.

    Raises
    ------
    KeyError
        If ``output_col`` is not a column of ``df``.
    """
    indicator_cols = set(URBANIZATION_INDICATORS + ANTI_URBANIZATION_INDICATORS)

    # Walk the frame's own columns rather than a set intersection: iterating a
    # set of strings yields a different order from one session to the next,
    # which made the feature order (and anything derived from it) unstable.
    feature_cols = [col for col in dict.fromkeys(df.columns) if col in indicator_cols]

    return df[feature_cols], df[output_col]

In [4]:
def create_train_test_sets(df: pd.DataFrame, indicators: list, threshold: float = 0.5,
                           test_size: float = 0.2, random_state=None):
    """Select the modelling columns, impute missing values and split the rows.

    Parameters
    ----------
    df : pd.DataFrame
        Source data. Must contain 'Country', 'Country code',
        'Dystopia residual', 'Residual-to-happiness ratio' and every name in
        ``indicators``.
    indicators : list
        Names of the indicator columns to keep as candidate inputs.
    threshold : float, default 0.5
        A column is kept only if it has at least
        ``int(number_of_rows * threshold)`` non-missing values.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original unseeded behaviour; pass an int for a reproducible split.

    Returns
    -------
    tuple
        ``(train, test)`` DataFrames.

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.
    """
    columns = ['Country', 'Country code', 'Dystopia residual', 'Residual-to-happiness ratio'] + list(indicators)
    df = df[columns]

    # Drop columns that are too sparse to impute sensibly.
    df_thresh = df.dropna(axis=1, thresh=int(df.shape[0] * threshold))

    # Only the numeric columns have a mean. Without numeric_only=True pandas
    # tries to average the country name/code strings too, which is deprecated
    # in pandas 1.x and raises TypeError from pandas 2.0.
    df_impute = df_thresh.fillna(df_thresh.mean(numeric_only=True))

    train, test = train_test_split(df_impute, test_size=test_size, random_state=random_state)

    return train, test

In [5]:
class LinearRegressionModel():
    """Linear regression of one target column on a list of indicator columns.

    On construction the data is passed through ``create_train_test_sets``,
    a ``LinearRegression`` is fitted on the training split and predictions are
    made for the test split. The remaining methods report on that fitted model.
    """

    def __init__(self, df: pd.DataFrame, x: list, y: str):
        """Prepare the data, fit the model and predict on the test split.

        Parameters
        ----------
        df : pd.DataFrame
            Source data; a copy is handed to ``create_train_test_sets``.
        x : list
            Candidate input column names. Names that are no longer present
            after ``create_train_test_sets`` are ignored.
        y : str
            Name of the target column.
        """
        self.train, self.test = create_train_test_sets(df.copy(), x)
        # Train followed by test rows; the index labels are the shuffled
        # labels of the original frame, not 0..n-1.
        self.df = pd.concat([self.train.copy(), self.test.copy()])

        self.x = self._select_features(x, self.train.columns)
        self.y = y

        self.model = LinearRegression()
        self.model.fit(self.train[self.x], self.train[self.y])
        self.predictions = self.model.predict(self.test[self.x])

    @staticmethod
    def _select_features(requested, available) -> list:
        """Return the requested names that are available, in requested order.

        A set intersection would give the same names but in an order that
        changes between sessions, which reshuffles the coefficient table.
        """
        available = set(available)
        return [col for col in dict.fromkeys(requested) if col in available]

    def _first_component_frame(self, components) -> pd.DataFrame:
        """Pair the first principal component with the target and country code.

        ``components`` has one row per row of ``self.df``, in the same order.
        The target and country code are attached by position (``to_numpy``):
        assigning the Series directly would align ``self.df``'s shuffled index
        labels against the new frame's 0..n-1 index and pair each PC0 value
        with the wrong row.
        """
        results = pd.DataFrame(components, columns=['PC0'])
        results[self.y] = self.df[self.y].to_numpy()
        results['Country'] = self.df['Country code'].to_numpy()

        return results

    def get_predictions_table(self) -> pd.DataFrame:
        """Return the test inputs alongside predicted and actual target values."""
        table = pd.DataFrame(self.test[self.x].copy())
        table[f'Predicted {self.y}'] = list(self.predictions)
        table[f'Actual {self.y}'] = list(self.test[self.y])

        return table

    def get_coefficients_table(self) -> pd.DataFrame:
        """Return one row per input column with its fitted coefficient."""
        coefficients = pd.DataFrame(self.train[self.x].columns, columns=['variable'])
        coefficients['coefficient'] = self.model.coef_

        return coefficients

    def get_intercept(self) -> float:
        """Return the fitted intercept."""
        return self.model.intercept_

    def get_r_squared(self) -> float:
        """Return R^2 of the model on the training split (not the test split)."""
        return self.model.score(self.train[self.x], self.train[self.y])

    def get_mean_squared_error(self) -> float:
        """Return the mean squared error of the predictions on the test split."""
        return metrics.mean_squared_error(self.test[self.y], self.predictions)

    def get_scree_plot(self) -> alt.Chart:
        """Plot the explained-variance ratio of each principal component.

        The PCA is run on the standardised training inputs with as many
        components as there are input columns.
        """
        k = len(self.train[self.x].columns)

        scaled_values = StandardScaler().fit(self.train[self.x]).transform(self.train[self.x])
        pca = PCA(n_components=k).fit(scaled_values)

        scree_plot_data = pd.DataFrame()
        scree_plot_data['indices'] = np.arange(k) + 1
        scree_plot_data['var_explained'] = pca.explained_variance_ratio_

        return alt.Chart(scree_plot_data, title='Scree plot').mark_line().encode(
            x=alt.X('indices', title='Principal component'),
            y=alt.Y('var_explained', title='Variance explained')
        )

    def visualize(self) -> alt.Chart:
        """Scatter the target against the first principal component of the inputs.

        Uses all rows (train and test), coloured by country code, with a
        fitted regression line and its slope overlaid.
        """
        scaled_inputs = StandardScaler().fit(self.df[self.x]).transform(self.df[self.x])
        first_component = PCA(n_components=1).fit(scaled_inputs).transform(scaled_inputs)

        results = self._first_component_frame(first_component)

        chart = alt.Chart(results, title=f'PC0 vs {self.y}').mark_point().encode(
            x='PC0',
            y=self.y,
            color='Country:N',
            tooltip=['Country', 'PC0', self.y]
        )

        # Based on https://stackoverflow.com/questions/66604052/altair-extract-and-display-regression-coefficients
        regression = chart.transform_regression('PC0', self.y).mark_line(color='black').encode(
            color=alt.Color(legend=None),
        )
        parameters = chart.transform_regression('PC0', self.y, params=True).transform_calculate(
            intercept='datum.coef[0]',
            slope='datum.coef[1]',
        ).mark_text(align='left', color='black').encode(
            x=alt.value(300),
            y=alt.value(20),
            text='slope:Q',
            tooltip=[alt.Tooltip('slope:Q', title='Slope'), alt.Tooltip('intercept:Q', title='Intercept')]
        )

        return alt.layer(chart, regression, parameters).resolve_scale(
            color='independent'
        ).interactive()

In [6]:
# Read the dataset from tmp/developing_countries.csv
data = pd.read_csv('tmp/developing_countries.csv')

# Dystopia residual expressed as a fraction of each happiness score
residual = data['Dystopia residual']
happiness = data['Happiness score']
data['Residual-to-happiness ratio'] = residual / happiness

In [7]:
data['Dystopia residual'].mean()

2.030955390437771

In [8]:
data['Residual-to-happiness ratio'].mean()

0.4079498037128602

# Analysis of the Effects of Positive Indicators of Urbanization on the Dystopia Residual

**Prediction of Dystopia Residual-to-Happiness Ratio Using Positive Indicators of Urbanization**

In [9]:
# Fit the ratio target on URBANIZATION_INDICATORS.
# The train/test split is not seeded here, so the figures in the cells below
# change every time this cell is re-run.
urbanization_residual_ratio = LinearRegressionModel(data.copy(), URBANIZATION_INDICATORS, 'Residual-to-happiness ratio')

  df_impute = df_thresh.fillna(df_thresh.mean())


In [10]:
urbanization_residual_ratio.get_scree_plot()

  for col_name, dtype in df.dtypes.iteritems():


In [11]:
urbanization_residual_ratio.visualize()

  for col_name, dtype in df.dtypes.iteritems():


In [12]:
urbanization_residual_ratio.get_predictions_table()

Unnamed: 0,"Air transport, passengers carried","Air transport, freight (million ton-km)","Manufacturing, value added (% of GDP)",Employment in services (% of total employment) (modeled ILO estimate),Urban population (% of total),Individuals using the Internet (% of population),"Industry (including construction), value added per worker (constant 2010 US$)","Commercial bank branches (per 100,000 adults)",Employment in industry (% of total employment) (modeled ILO estimate),Predicted Residual-to-happiness ratio,Actual Residual-to-happiness ratio
389,7.963830e+05,3.950726,12.666669,32.080002,26.982000,21.960000,7488.187230,13.890841,16.638000,0.434440,0.366802
287,5.587130e+05,21.084889,11.022128,60.595001,47.961000,31.033346,26478.131237,14.697603,19.334000,0.395082,0.420197
438,6.025475e+06,3.996713,12.666669,71.545998,88.165000,60.000000,23036.702831,13.890841,21.195999,0.400797,0.488935
439,4.155865e+06,1.906885,12.666669,71.635002,88.183000,64.313364,23036.702831,13.890841,21.150000,0.396604,0.340850
357,1.814317e+07,985.946746,1.957707,35.145000,41.636000,13.236930,2253.499774,13.890841,5.611000,0.506386,0.566672
...,...,...,...,...,...,...,...,...,...,...,...
314,1.814317e+07,985.946746,12.666669,67.075996,55.169177,42.805461,23036.702831,13.890841,18.580000,0.390367,0.363297
279,1.814317e+07,985.946746,12.666669,40.311001,55.169177,42.805461,23036.702831,13.890841,21.596001,0.413943,0.466121
183,2.170504e+06,1.303086,12.666669,57.530998,70.278000,49.359999,62601.726135,3.997868,23.458000,0.386243,0.293286
386,1.814317e+07,985.946746,12.666669,56.387001,55.169177,42.805461,23036.702831,13.890841,28.948000,0.360644,0.359330


In [13]:
urbanization_residual_ratio.get_coefficients_table()

Unnamed: 0,variable,coefficient
0,"Air transport, passengers carried",9.448803e-11
1,"Air transport, freight (million ton-km)",-1.255787e-06
2,"Manufacturing, value added (% of GDP)",0.0001265817
3,Employment in services (% of total employment)...,-0.001362146
4,Urban population (% of total),0.001334163
5,Individuals using the Internet (% of population),-0.0009548183
6,"Industry (including construction), value added...",-3.998812e-07
7,"Commercial bank branches (per 100,000 adults)",-0.0005966896
8,Employment in industry (% of total employment)...,-0.004271077


In [14]:
urbanization_residual_ratio.get_intercept()

0.5437786353208058

In [15]:
urbanization_residual_ratio.get_r_squared()

0.28682894260842073

In [16]:
urbanization_residual_ratio.get_mean_squared_error()

0.009242035955834896

**Prediction of Dystopia Residual Values Using Positive Indicators of Urbanization**

In [17]:
# Same inputs as the ratio model, but targeting the raw 'Dystopia residual'.
# This constructor call draws its own unseeded train/test split, so its test
# rows are not the ones used by the ratio model.
urbanization_residual_value = LinearRegressionModel(data.copy(), URBANIZATION_INDICATORS, 'Dystopia residual')

  df_impute = df_thresh.fillna(df_thresh.mean())


In [18]:
urbanization_residual_value.get_scree_plot()

  for col_name, dtype in df.dtypes.iteritems():


In [19]:
urbanization_residual_value.visualize()

  for col_name, dtype in df.dtypes.iteritems():


In [20]:
urbanization_residual_value.get_predictions_table()

Unnamed: 0,"Air transport, passengers carried","Air transport, freight (million ton-km)","Manufacturing, value added (% of GDP)",Employment in services (% of total employment) (modeled ILO estimate),Urban population (% of total),Individuals using the Internet (% of population),"Industry (including construction), value added per worker (constant 2010 US$)","Commercial bank branches (per 100,000 adults)",Employment in industry (% of total employment) (modeled ILO estimate),Predicted Dystopia residual,Actual Dystopia residual
442,3.734927e+07,451.256180,14.265514,33.373001,34.510000,46.500000,4149.751287,3.796799,24.761999,1.834692,1.94180
308,9.628354e+06,175.474400,12.080755,34.132999,36.234000,15.514558,2881.172577,10.357397,23.591999,1.835183,3.18286
134,8.242114e+06,1500.147530,5.577928,21.096001,19.866000,15.366924,1606.852714,13.890841,10.873000,1.711354,2.65614
117,1.446300e+04,985.946746,13.897199,71.918999,78.566000,54.215766,26120.900466,12.570840,18.365000,2.347176,1.21305
314,1.814317e+07,985.946746,12.666669,67.075996,55.169177,42.805461,23036.702831,13.890841,18.580000,2.121434,2.33600
...,...,...,...,...,...,...,...,...,...,...,...
427,9.176360e+07,16138.874000,8.984418,72.692001,85.965000,90.600007,125144.432025,12.415009,23.518000,1.589323,2.21507
417,1.814317e+07,985.946746,12.666669,43.518002,55.169177,42.805461,23036.702831,13.890841,33.706001,1.814284,1.91700
35,1.814317e+07,985.946746,12.666669,39.377998,55.169177,42.805461,23036.702831,13.890841,20.472000,1.982474,1.66200
112,1.782666e+06,0.775320,12.504204,64.059998,56.155000,69.803100,30000.004192,33.600583,26.712000,1.961061,2.75414


In [21]:
urbanization_residual_value.get_r_squared()

0.10644631453301256

In [22]:
# NOTE(review): this re-displays the *ratio* model's predictions table (already
# shown in cell 12) inside the residual-value section. Presumably a call on
# urbanization_residual_value was intended — confirm and replace or remove.
urbanization_residual_ratio.get_predictions_table()

Unnamed: 0,"Air transport, passengers carried","Air transport, freight (million ton-km)","Manufacturing, value added (% of GDP)",Employment in services (% of total employment) (modeled ILO estimate),Urban population (% of total),Individuals using the Internet (% of population),"Industry (including construction), value added per worker (constant 2010 US$)","Commercial bank branches (per 100,000 adults)",Employment in industry (% of total employment) (modeled ILO estimate),Predicted Residual-to-happiness ratio,Actual Residual-to-happiness ratio
389,7.963830e+05,3.950726,12.666669,32.080002,26.982000,21.960000,7488.187230,13.890841,16.638000,0.434440,0.366802
287,5.587130e+05,21.084889,11.022128,60.595001,47.961000,31.033346,26478.131237,14.697603,19.334000,0.395082,0.420197
438,6.025475e+06,3.996713,12.666669,71.545998,88.165000,60.000000,23036.702831,13.890841,21.195999,0.400797,0.488935
439,4.155865e+06,1.906885,12.666669,71.635002,88.183000,64.313364,23036.702831,13.890841,21.150000,0.396604,0.340850
357,1.814317e+07,985.946746,1.957707,35.145000,41.636000,13.236930,2253.499774,13.890841,5.611000,0.506386,0.566672
...,...,...,...,...,...,...,...,...,...,...,...
314,1.814317e+07,985.946746,12.666669,67.075996,55.169177,42.805461,23036.702831,13.890841,18.580000,0.390367,0.363297
279,1.814317e+07,985.946746,12.666669,40.311001,55.169177,42.805461,23036.702831,13.890841,21.596001,0.413943,0.466121
183,2.170504e+06,1.303086,12.666669,57.530998,70.278000,49.359999,62601.726135,3.997868,23.458000,0.386243,0.293286
386,1.814317e+07,985.946746,12.666669,56.387001,55.169177,42.805461,23036.702831,13.890841,28.948000,0.360644,0.359330


# Analysis of the Effects of Negative Indicators of Urbanization on the Dystopia Residual

**Prediction of Dystopia Residual-to-Happiness Ratio Using Negative Indicators of Urbanization**

In [23]:
# Fit the ratio target on ANTI_URBANIZATION_INDICATORS.
# The train/test split is not seeded here, so results differ between runs.
neg_urbanization_residual_ratio = LinearRegressionModel(data.copy(), ANTI_URBANIZATION_INDICATORS, 'Residual-to-happiness ratio')

  df_impute = df_thresh.fillna(df_thresh.mean())


In [24]:
neg_urbanization_residual_ratio.get_predictions_table()

Unnamed: 0,"Agriculture, forestry, and fishing, value added per worker (constant 2010 US$)",Employment in agriculture (% of total employment) (modeled ILO estimate),Predicted Residual-to-happiness ratio,Actual Residual-to-happiness ratio
129,2530.771287,18.111000,0.375578,0.495106
339,13194.745110,6.710000,0.349473,0.397820
113,15009.094757,7.602000,0.351743,0.460026
422,4421.384221,15.260000,0.368983,0.335698
149,2832.415337,31.875999,0.408227,0.447910
...,...,...,...,...
345,569.989801,67.080002,0.491469,0.155592
269,4538.154879,30.360001,0.404781,0.312994
23,20761.228907,33.292000,0.413124,0.343439
161,2184.958697,28.722000,0.400697,0.385146


In [25]:
neg_urbanization_residual_ratio.get_intercept()

0.3324359778347371

In [26]:
neg_urbanization_residual_ratio.get_coefficients_table()

Unnamed: 0,variable,coefficient
0,"Agriculture, forestry, and fishing, value adde...",8.593849e-08
1,Employment in agriculture (% of total employme...,0.002370061


In [27]:
neg_urbanization_residual_ratio.get_r_squared()

0.21228592772508892

In [28]:
neg_urbanization_residual_ratio.get_mean_squared_error()

0.009965190356889631

In [29]:
neg_urbanization_residual_ratio.get_scree_plot()

  for col_name, dtype in df.dtypes.iteritems():


In [30]:
neg_urbanization_residual_ratio.visualize()

  for col_name, dtype in df.dtypes.iteritems():


**Prediction of Dystopia Residual Values Using Negative Indicators of Urbanization**

In [31]:
# Fit the raw 'Dystopia residual' target on ANTI_URBANIZATION_INDICATORS.
# The train/test split is not seeded here, so results differ between runs.
neg_urbanization_residual_value = LinearRegressionModel(data.copy(), ANTI_URBANIZATION_INDICATORS, 'Dystopia residual')

  df_impute = df_thresh.fillna(df_thresh.mean())


In [32]:
neg_urbanization_residual_value.get_r_squared()

0.015765624074273843

In [33]:
neg_urbanization_residual_value.get_scree_plot()

  for col_name, dtype in df.dtypes.iteritems():


In [34]:
neg_urbanization_residual_value.visualize()

  for col_name, dtype in df.dtypes.iteritems():
