In [1]:
import pandas as pd
import numpy as np
import altair as alt

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import metrics

In [2]:
%run -i columns.py
%run -i helper_functions.py

In [3]:
def split_X_Y(df: pd.DataFrame, output_col: str):
    """Split a frame into indicator features and a single output column.

    A column of ``df`` is treated as a feature when its name appears in
    ``URBANIZATION_INDICATORS`` or ``ANTI_URBANIZATION_INDICATORS``; every
    other column is ignored.

    Args:
        df: Frame holding the indicator columns and ``output_col``.
        output_col: Name of the column to use as the dependent variable.

    Returns:
        ``(independent_vars, dependent_vars)``: a DataFrame of the indicator
        columns, in the order they appear in ``df``, and the ``output_col``
        Series.

    Raises:
        KeyError: If ``output_col`` is not a column of ``df``.
    """
    indicator_names = set(URBANIZATION_INDICATORS + ANTI_URBANIZATION_INDICATORS)

    # Walk df.columns rather than iterating a set intersection: the iteration
    # order of a set of strings is not stable across interpreter runs, which
    # would reorder the feature columns (and therefore the coefficient table)
    # on every kernel restart.
    feature_cols = [col for col in df.columns if col in indicator_names]

    return df[feature_cols], df[output_col]

In [4]:
def create_train_test_sets(df: pd.DataFrame, indicators: list, threshold: float = 0.5,
                           test_size: float = 0.2, random_state=None):
    """Select, clean and split the data used by the regression models.

    Steps:
      1. Keep the identifier/output columns plus the requested ``indicators``.
      2. Drop every column with fewer than ``threshold * n_rows`` non-null
         values.
      3. Fill the remaining gaps in numeric columns with that column's mean.
      4. Split the rows into train and test frames.

    Args:
        df: Source frame; must contain 'Country', 'Country code',
            'Dystopia residual', 'Residual-to-happiness ratio' and every
            name in ``indicators``.
        indicators: List of indicator column names to keep as candidates.
        threshold: Minimum fraction of non-null rows a column needs in order
            to be kept.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``. The default ``None``
            gives a different split on each call; pass an int to make the
            split (and everything computed from it) reproducible.

    Returns:
        ``(train, test)`` DataFrames.

    Raises:
        KeyError: If any required column is missing from ``df``.
    """
    columns = ['Country', 'Country code', 'Dystopia residual', 'Residual-to-happiness ratio'] + indicators
    selected = df[columns]

    min_non_null = int(selected.shape[0] * threshold)
    df_thresh = selected.dropna(axis=1, thresh=min_non_null)

    # Restrict the mean to numeric columns. The frame also carries the string
    # columns 'Country' / 'Country code'; averaging those emits a
    # FutureWarning on older pandas and raises TypeError on pandas >= 2.0.
    # NOTE(review): the means are computed before the split, so test rows
    # influence the values imputed into the training rows.
    column_means = df_thresh.mean(numeric_only=True)
    df_impute = df_thresh.fillna(column_means)

    train, test = train_test_split(df_impute, test_size=test_size, random_state=random_state)

    return train, test

In [35]:
class LinearRegressionModel():
    """Linear regression fitted on a train split and evaluated on a test split.

    Feature columns are chosen by ``split_X_Y``. The model is fitted and the
    test-set predictions are computed in the constructor, so the accessor
    methods only read stored state.

    Attributes (all set in ``__init__``):
        output_label: Name of the column being predicted.
        train, test: The frames passed in by the caller.
        X, Y: Training features and training target.
        x, y: Test features and test target.
        model: The fitted ``LinearRegression``.
        predictions: ``model.predict(x)``.
    """

    def __init__(self, train: pd.DataFrame, test: pd.DataFrame, output_label: str):
        """Fit the model on ``train`` and predict ``output_label`` for ``test``."""
        self.output_label = output_label

        self.train = train
        self.test = test
        # Upper-case = training split, lower-case = test split.
        self.X, self.Y = split_X_Y(train.copy(), output_label)
        self.x, self.y = split_X_Y(test.copy(), output_label)

        self.model = LinearRegression()
        self.model.fit(self.X, self.Y)
        self.predictions = self.model.predict(self.x)

    def get_predictions_table(self) -> pd.DataFrame:
        """Return the test features with predicted and actual target columns appended."""
        table = self.x.copy()
        table[f'Predicted {self.output_label}'] = self.predictions
        table[f'Actual {self.output_label}'] = self.y

        return table

    def get_coefficients_table(self) -> pd.DataFrame:
        """Return one row per feature: its column name and fitted coefficient."""
        return pd.DataFrame({
            'variable': self.X.columns,
            'coefficient': self.model.coef_,
        })

    def get_intercept(self) -> float:
        """Return the fitted intercept term."""
        return self.model.intercept_

    def get_r_squared(self) -> float:
        """Return R² of the model evaluated on the test split."""
        return self.model.score(self.x, self.y)

    def get_mean_squared_error(self) -> float:
        """Return the mean squared error of the test-set predictions."""
        return metrics.mean_squared_error(self.y, self.predictions)

    def get_scree_plot(self) -> alt.Chart:
        """Return a line chart of variance explained by each principal component.

        The PCA is fitted on the standardized training features.
        """
        scaled_values = StandardScaler().fit_transform(self.X)

        # PCA cannot extract more components than min(n_samples, n_features);
        # asking for len(columns) fails when the split has fewer rows than
        # features.
        n_components = min(scaled_values.shape)
        pca = PCA(n_components=n_components).fit(scaled_values)

        scree_plot_data = pd.DataFrame({
            'indices': np.arange(n_components) + 1,
            'var_explained': pca.explained_variance_ratio_,
        })

        return alt.Chart(scree_plot_data, title='Scree plot').mark_line().encode(
            x=alt.X('indices', title='Principal component'),
            y=alt.Y('var_explained', title='Variance explained')
        )

    def get_pca_chart(self, k: int) -> alt.Chart:
        """Return a scatter of the first principal component against the target.

        The PCA is fitted on the standardized features of the train and test
        splits combined; each point is one row (observation).

        Args:
            k: Number of principal components to compute. Only ``PC0`` is
                drawn; the others are present in the chart's data.
        """
        inputs = pd.concat([self.X, self.x])
        outputs = pd.concat([self.Y, self.y])

        scaled_inputs = StandardScaler().fit_transform(inputs)

        # Project every observation onto the components. ``pca.components_``
        # holds one loading per *feature*, not one score per row, so it
        # cannot be paired with the target values.
        scores = PCA(n_components=k).fit_transform(scaled_inputs)

        results = pd.DataFrame(
            scores,
            columns=[f'PC{i}' for i in range(scores.shape[1])],
        )
        # Assign positionally: ``results`` has a fresh RangeIndex while
        # ``outputs`` keeps the original row labels, so index alignment would
        # mismatch or blank out the target values.
        results[self.output_label] = outputs.to_numpy()

        return alt.Chart(results).mark_point().encode(
            x='PC0',
            y=self.output_label
        ).interactive()
        

In [36]:
# Load the developing-countries dataset from the local tmp directory.
data = pd.read_csv('tmp/developing_countries.csv')

# Express each row's Dystopia residual as a fraction of its happiness score.
data['Residual-to-happiness ratio'] = data['Dystopia residual'].div(data['Happiness score'])

# Analysis of the Effects of Positive Indicators of Urbanization on the Dystopia Residual

In [37]:
# Build the train/test split restricted to the positive urbanization indicators.
train, test = create_train_test_sets(data.copy(), URBANIZATION_INDICATORS)

  df_impute = df_thresh.fillna(df_thresh.mean())


**Prediction of Dystopia Residual-to-Happiness Ratio Using Positive Indicators of Urbanization**

In [38]:
# Fit a linear model that predicts the residual-to-happiness ratio from the current train/test split.
urbanization_residual_ratio = LinearRegressionModel(train.copy(), test.copy(), 'Residual-to-happiness ratio')

In [39]:
# Scree plot: variance explained by each principal component of the training features.
urbanization_residual_ratio.get_scree_plot()

  for col_name, dtype in df.dtypes.iteritems():


In [40]:
# Chart PC0 against the output column, computing a single principal component (k=1).
urbanization_residual_ratio.get_pca_chart(1)

  for col_name, dtype in df.dtypes.iteritems():


In [41]:
# Test-set features alongside the predicted and actual ratio values.
urbanization_residual_ratio.get_predictions_table()

Unnamed: 0,Employment in services (% of total employment) (modeled ILO estimate),"Manufacturing, value added (% of GDP)","Air transport, freight (million ton-km)",Employment in industry (% of total employment) (modeled ILO estimate),Urban population (% of total),"Commercial bank branches (per 100,000 adults)",Individuals using the Internet (% of population),"Air transport, passengers carried","Industry (including construction), value added per worker (constant 2010 US$)",Predicted Residual-to-happiness ratio,Actual Residual-to-happiness ratio
359,21.113001,12.666669,985.946746,6.435000,43.816000,13.890841,1.880000,4.486000e+03,23036.702831,0.508780,0.705463
201,56.056999,15.621025,3.217000,24.483999,55.696000,29.766144,65.317025,2.427047e+06,16140.396982,0.347035,0.494863
394,26.576000,12.666669,985.946746,7.073000,55.169177,13.890841,42.805461,1.814317e+07,23036.702831,0.470230,0.190130
411,53.723000,16.598631,3493.930482,26.775999,74.134000,18.139831,58.347734,1.003665e+08,41270.763008,0.383995,0.425077
139,52.215000,17.073523,985.946746,9.938000,88.976000,13.890841,50.320120,1.814317e+07,156610.225372,0.387811,0.296286
...,...,...,...,...,...,...,...,...,...,...,...
102,64.168999,12.400894,1316.795809,19.825001,79.764000,15.908311,55.904973,3.090972e+07,23243.456126,0.407463,0.441156
38,58.535000,22.221599,1.670000,30.768000,78.134000,0.783214,74.436445,2.493100e+06,14876.356458,0.364803,0.309433
114,66.605003,12.476185,0.582750,26.416000,56.667000,32.502855,67.096192,2.057804e+06,31460.108633,0.319064,0.350367
354,54.362999,12.666669,985.946746,13.606000,55.169177,13.890841,42.805461,1.814317e+07,23036.702831,0.422734,0.437487


In [42]:
# Fitted coefficient for each feature column.
urbanization_residual_ratio.get_coefficients_table()

Unnamed: 0,variable,coefficient
0,Employment in services (% of total employment)...,-0.0009504456
1,"Manufacturing, value added (% of GDP)",-0.0002604089
2,"Air transport, freight (million ton-km)",-1.9531e-06
3,Employment in industry (% of total employment)...,-0.003227659
4,Urban population (% of total),0.001190509
5,"Commercial bank branches (per 100,000 adults)",-0.000883298
6,Individuals using the Internet (% of population),-0.001201188
7,"Air transport, passengers carried",2.395372e-10
8,"Industry (including construction), value added...",-5.905131e-07


In [43]:
# R² of the ratio model, scored on the test split.
urbanization_residual_ratio.get_r_squared()

0.21320409040445065

In [44]:
# Mean squared error of the ratio model's test-set predictions.
urbanization_residual_ratio.get_mean_squared_error()

0.010812451457616253

**Prediction of Dystopia Residual Values Using Positive Indicators of Urbanization**

In [45]:
# Fit a second model on the current train/test split, predicting the raw 'Dystopia residual' column.
urbanization_residual_value = LinearRegressionModel(train.copy(), test.copy(), 'Dystopia residual')

In [16]:
# Scree plot: variance explained by each principal component of the training features.
urbanization_residual_value.get_scree_plot()

  for col_name, dtype in df.dtypes.iteritems():


In [17]:
# Chart PC0 against the output column, computing a single principal component (k=1).
urbanization_residual_value.get_pca_chart(1)

  for col_name, dtype in df.dtypes.iteritems():


In [18]:
# Test-set features alongside the predicted and actual 'Dystopia residual' values.
urbanization_residual_value.get_predictions_table()

Unnamed: 0,Employment in services (% of total employment) (modeled ILO estimate),"Manufacturing, value added (% of GDP)","Air transport, freight (million ton-km)",Employment in industry (% of total employment) (modeled ILO estimate),Urban population (% of total),"Commercial bank branches (per 100,000 adults)",Individuals using the Internet (% of population),"Air transport, passengers carried","Industry (including construction), value added per worker (constant 2010 US$)",Predicted Dystopia residual,Actual Dystopia residual
418,21.511000,8.704258,0.023472,7.152000,22.060000,2.901283,17.834859,4.181200e+04,4742.299696,1.910112,1.427660
21,50.617001,10.284650,985.946746,15.784000,63.082000,23.098070,64.346030,1.814317e+07,23036.702831,2.125995,1.978640
193,61.425999,10.275409,37.669008,20.562000,57.191000,2.966006,70.829934,5.081632e+06,37388.077128,2.026208,2.247290
354,54.362999,12.666669,985.946746,13.606000,55.169177,13.890841,42.805461,1.814317e+07,23036.702831,2.159745,2.026000
145,46.137001,11.367037,0.844630,18.684999,54.086000,7.120122,31.447852,3.904570e+05,5020.232317,2.153496,2.309190
...,...,...,...,...,...,...,...,...,...,...,...
206,57.484001,12.666669,7.355192,25.298000,55.942000,28.226565,70.330836,2.442731e+06,15205.354928,1.959904,1.947084
451,35.173000,7.552274,81.557610,10.636000,42.976000,4.513118,27.852579,1.545730e+05,12562.920681,2.074138,1.826705
40,69.795998,6.503924,2.689022,15.152000,45.495000,20.424837,44.575740,9.904990e+05,9243.607206,2.129207,3.080390
396,45.159000,27.390641,2160.069653,23.677000,48.448000,12.375701,47.504966,6.234168e+07,16632.365708,1.995886,2.579600


In [19]:
# R² of the residual-value model, scored on the test split.
urbanization_residual_value.get_r_squared()

-0.027105666981012355

In [20]:
# NOTE(review): this re-displays the *ratio* model's predictions table inside the
# 'Dystopia residual' value section; presumably a metric of
# urbanization_residual_value (e.g. get_mean_squared_error()) was intended — confirm.
urbanization_residual_ratio.get_predictions_table()

Unnamed: 0,Employment in services (% of total employment) (modeled ILO estimate),"Manufacturing, value added (% of GDP)","Air transport, freight (million ton-km)",Employment in industry (% of total employment) (modeled ILO estimate),Urban population (% of total),"Commercial bank branches (per 100,000 adults)",Individuals using the Internet (% of population),"Air transport, passengers carried","Industry (including construction), value added per worker (constant 2010 US$)",Predicted Residual-to-happiness ratio,Actual Residual-to-happiness ratio
418,21.511000,8.704258,0.023472,7.152000,22.060000,2.901283,17.834859,4.181200e+04,4742.299696,0.499603,0.363180
21,50.617001,10.284650,985.946746,15.784000,63.082000,23.098070,64.346030,1.814317e+07,23036.702831,0.403227,0.453817
193,61.425999,10.275409,37.669008,20.562000,57.191000,2.966006,70.829934,5.081632e+06,37388.077128,0.359693,0.383824
354,54.362999,12.666669,985.946746,13.606000,55.169177,13.890841,42.805461,1.814317e+07,23036.702831,0.426580,0.437487
145,46.137001,11.367037,0.844630,18.684999,54.086000,7.120122,31.447852,3.904570e+05,5020.232317,0.451730,0.498422
...,...,...,...,...,...,...,...,...,...,...,...
206,57.484001,12.666669,7.355192,25.298000,55.942000,28.226565,70.330836,2.442731e+06,15205.354928,0.337318,0.360905
451,35.173000,7.552274,81.557610,10.636000,42.976000,4.513118,27.852579,1.545730e+05,12562.920681,0.481994,0.404676
40,69.795998,6.503924,2.689022,15.152000,45.495000,20.424837,44.575740,9.904990e+05,9243.607206,0.385800,0.517191
396,45.159000,27.390641,2160.069653,23.677000,48.448000,12.375701,47.504966,6.234168e+07,16632.365708,0.387511,0.398455


# Analysis of the Effects of Negative Indicators of Urbanization on the Dystopia Residual

In [21]:
# Rebuild the split using the ANTI_URBANIZATION_INDICATORS columns. This rebinds
# `train`/`test`; models constructed earlier were handed copies, so they are unaffected.
train, test = create_train_test_sets(data.copy(), ANTI_URBANIZATION_INDICATORS)

  df_impute = df_thresh.fillna(df_thresh.mean())


**Prediction of Dystopia Residual-to-Happiness Ratio Using Negative Indicators of Urbanization**

In [22]:
# Fit a linear model that predicts the residual-to-happiness ratio from the current train/test split.
neg_urbanization_residual_ratio = LinearRegressionModel(train.copy(), test.copy(), 'Residual-to-happiness ratio')

In [30]:
# Test-set features alongside the predicted and actual ratio values.
neg_urbanization_residual_ratio.get_predictions_table()

Unnamed: 0,"Agriculture, forestry, and fishing, value added per worker (constant 2010 US$)",Employment in agriculture (% of total employment) (modeled ILO estimate),Predicted Residual-to-happiness ratio,Actual Residual-to-happiness ratio
85,1071.264748,46.575001,0.441748,0.497049
367,19126.126853,4.818000,0.350147,0.402768
237,263.781805,68.602997,0.490379,0.582070
444,20761.228907,39.796001,0.427473,0.283559
228,20761.228907,8.281000,0.357855,0.363340
...,...,...,...,...
430,25383.026756,8.224000,0.357894,0.408157
169,1512.488951,45.558998,0.439519,0.498386
429,23018.482878,8.824000,0.359135,0.357968
423,4654.288886,15.600000,0.373450,0.347054


In [31]:
# R² of the ratio model, scored on the test split.
neg_urbanization_residual_ratio.get_r_squared()

0.23803636427672292

In [32]:
# Scree plot: variance explained by each principal component of the training features.
neg_urbanization_residual_ratio.get_scree_plot()

  for col_name, dtype in df.dtypes.iteritems():


In [25]:
# Chart PC0 against the output column, computing a single principal component (k=1).
neg_urbanization_residual_ratio.get_pca_chart(1)

  for col_name, dtype in df.dtypes.iteritems():


**Prediction of Dystopia Residual Values Using Negative Indicators of Urbanization**

In [26]:
# Fit a second model on the current train/test split, predicting the raw 'Dystopia residual' column.
neg_urbanization_residual_value = LinearRegressionModel(train.copy(), test.copy(), 'Dystopia residual')

In [27]:
# R² of the residual-value model, scored on the test split.
neg_urbanization_residual_value.get_r_squared()

-0.0019275269788932015

In [28]:
# Scree plot: variance explained by each principal component of the training features.
neg_urbanization_residual_value.get_scree_plot()

  for col_name, dtype in df.dtypes.iteritems():


In [29]:
# Chart PC0 against the output column, computing a single principal component (k=1).
neg_urbanization_residual_value.get_pca_chart(1)

  for col_name, dtype in df.dtypes.iteritems():
