In [1]:
import pandas as pd
import numpy as np
import altair as alt

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import metrics

In [2]:
# Import dataframe column names and helper functions by executing the two scripts
# inside this notebook's namespace (-i), so the names they define are usable below.
# NOTE(review): URBANIZATION_INDICATORS and ANTI_URBANIZATION_INDICATORS, used further
# down, are not defined in this notebook — presumably columns.py provides them; confirm.
%run -i columns.py
%run -i helper_functions.py

In [3]:
"""
Creates a train set and a test set from the given given dataframe instance. The name and code of the country, its
Dystopia residual value, and its Dystopia residual-to-happiness score ratio are always present in the resulting datasets. 
Any other variable to be included in the dataset must be named in the given list of indicators.

Parameters:
    df (pd.DataFrame): The dataframe of developing countries to process and split into train and test sets
    indicators (list): The list of positive or negative indicators of urbanization to include in the resulting datasets

Returns: two dataframes, a train set which contains 80% of the original dataframe's entries, and a test set containing
         the remaining entries
"""
def create_train_test_sets(df: pd.DataFrame, indicators: list) -> (pd.DataFrame, pd.DataFrame):
    columns = ['Country', 'Country code', 'Dystopia residual', 'Residual-to-happiness ratio'] + indicators
    df = df[columns] 
    
    # Discard any indicators for which more than 50% of the dataframe's entries are missing data
    threshold = 0.5
    df_thresh = df.dropna(axis=1, thresh=int(df.shape[0] * threshold))
    
    # Impute the remaining NaN values using the mean imputation method
    df_impute = df_thresh.fillna(df_thresh.mean())
    
    train, test = train_test_split(df_impute, test_size=0.2)
    
    return train, test

In [4]:
"""
A multiple linear regression model.
"""
class LinearRegressionModel():
    """
    Creates a new model using the given dataframe, in which the independent variables are in a list of variable names, x, and
    the name of the dependent variable is a string, y.
    """
    def __init__(self, df: pd.DataFrame, x: list, y: str):
        # Prepare the train and test datasets to be used by the model
        self.train, self.test = create_train_test_sets(df.copy(), x)
        self.df = pd.concat([self.train.copy(), self.test.copy()])
        
        self.x = list(set(x).intersection(self.train.columns))
        self.y = y
        
        # Train the model using the train set, then calculate the predicted outputs using the test set
        self.model = LinearRegression()
        self.model.fit(self.train[self.x], self.train[self.y])
        self.predictions = self.model.predict(self.test[self.x])
        
    """
    Returns a copy of the test dataframe which contains the predicted and actual values of the dependent variable
    for all entries.
    """
    def get_predictions_table(self) -> pd.DataFrame:
        table = pd.DataFrame(self.test[self.x].copy())
        table[f'Predicted {self.y}'] = list(self.predictions)
        table[f'Actual {self.y}'] = list(self.test[self.y])
        
        return table
    
    """
    Returns a dataframe that contains the coefficients of all independent variables.
    """
    def get_coefficients_table(self) -> pd.DataFrame:
        coefficients = pd.DataFrame(self.train[self.x].columns, columns=['variable'])
        coefficients['coefficient'] = self.model.coef_

        return coefficients
    
    """
    Returns the y-intercept of the model.
    """
    def get_intercept(self) -> float:
        return self.model.intercept_
    
    """
    Returns the R^2 value of the model.
    """
    def get_r_squared(self) -> float:
        return self.model.score(self.train[self.x], self.train[self.y])
    
    """
    Returns the mean squared error of the model's predictinos.
    """
    def get_mean_squared_error(self) -> float:
        return metrics.mean_squared_error(self.test[self.y], self.predictions)
    
    """
    Displays the R^2 value, MSE, y-intercept, and coefficients of the model.
    """
    def display_properties(self) -> pd.DataFrame:
        print("R\u00B2:", urbanization_residual_ratio.get_r_squared())
        print("MSE:", urbanization_residual_ratio.get_mean_squared_error())

        print("Y-intercept:", urbanization_residual_ratio.get_intercept())
        print("Coefficients:")
        return urbanization_residual_ratio.get_coefficients_table()
    
    """
    Performs PCA on the train dataset, and returns a scree plot.
    """
    def get_scree_plot(self) -> alt.Chart:
        # Set K to be the number of independent variables
        k = len(self.train[self.x].columns)
        
        # Scale the independent variables, and perform PCA using them
        scaled_values = StandardScaler().fit(self.train[self.x]).transform(self.train[self.x])
        pca = PCA(n_components=k).fit(scaled_values)
        
        # Produce a dataframe for the scree plot
        scree_plot_data = pd.DataFrame()
        scree_plot_data['indices'] = np.arange(k) + 1
        scree_plot_data['var_explained'] = pca.explained_variance_ratio_
        
        return alt.Chart(scree_plot_data, title='Scree plot').mark_line().encode(
            x=alt.X('indices', title='Principal component'),
            y=alt.Y('var_explained', title='Variance explained')
        )
    
    """
    Creates a scree plot and a 2D visualization of the data using PCA with a K value of 1.
    """
    def visualize(self) -> alt.Chart:
        # Scale the inputs of the dataframe, and perform PCA with K = 1
        scaled_inputs = StandardScaler().fit(self.df[self.x]).transform(self.df[self.x])
        pca = PCA(n_components=1).fit(scaled_inputs).transform(scaled_inputs)
        
        # Create a dataframe for the PCA results
        results = pd.DataFrame(pca, columns=['PC0'])
        results[self.y] = self.df[self.y]
        results['Country'] = self.df['Country code']
        
        # Produce a scatter plot of the principal component vs. the Dystopia residual-to-happiness ratio
        chart = alt.Chart(results, title=f'PC0 vs {self.y}').mark_point().encode(
            x='PC0',
            y=self.y,
            color='Country:N',
            tooltip=['Country', 'PC0', self.y]
        )
        
        # Produce a linear regression for the scatter plot
        # Based on https://stackoverflow.com/questions/66604052/altair-extract-and-display-regression-coefficients
        regression = chart.transform_regression('PC0', self.y).mark_line(color='black').encode(
            color=alt.Color(legend=None),
        )
        parameters = chart.transform_regression('PC0', self.y, params=True).transform_calculate(
            intercept='datum.coef[0]',
            slope='datum.coef[1]',
        ).mark_text(align='left', color='black').encode(
            x=alt.value(300), 
            y=alt.value(20),
            text='slope:Q',
            tooltip=[alt.Tooltip('slope:Q', title='Slope'), alt.Tooltip('intercept:Q', title='Intercept')]
        )
        
        return self.get_scree_plot() | alt.layer(chart, regression, parameters).resolve_scale(
            color='independent'
        ).interactive()
        

In [5]:
# Load the developing-countries dataset
data = pd.read_csv('tmp/developing_countries.csv')

# For every entry, store the fraction of the happiness score that the Dystopia residual accounts for
data['Residual-to-happiness ratio'] = data['Dystopia residual'].div(data['Happiness score'])

# Analysis of the Effects of Positive Indicators of Urbanization on the Dystopia Residual

**Prediction of Dystopia Residual-to-Happiness Ratio Using Positive Indicators of Urbanization**

In [6]:
# Fit a regression of the residual-to-happiness ratio on the positive urbanization indicators.
# The train/test split made inside the model is random and unseeded, so results vary between runs.
urbanization_residual_ratio = LinearRegressionModel(data.copy(), URBANIZATION_INDICATORS, 'Residual-to-happiness ratio')

  df_impute = df_thresh.fillna(df_thresh.mean())


In [7]:
urbanization_residual_ratio.display_properties()

R²: 0.2525656083377493
MSE: 0.008946377167764977
Y-intercept: 0.5251184113143699
Coefficients:


Unnamed: 0,variable,coefficient
0,"Manufacturing, value added (% of GDP)",-0.0002232389
1,"Industry (including construction), value added...",-7.646115e-07
2,Employment in services (% of total employment)...,-0.001136292
3,"Commercial bank branches (per 100,000 adults)",-0.0007215782
4,Individuals using the Internet (% of population),-0.0009020257
5,Employment in industry (% of total employment)...,-0.003491469
6,"Air transport, freight (million ton-km)",1.045672e-07
7,Urban population (% of total),0.001361245
8,"Air transport, passengers carried",2.986836e-11


In [8]:
urbanization_residual_ratio.visualize()

  for col_name, dtype in df.dtypes.iteritems():


In [9]:
urbanization_residual_ratio.get_predictions_table()

Unnamed: 0,"Manufacturing, value added (% of GDP)","Industry (including construction), value added per worker (constant 2010 US$)",Employment in services (% of total employment) (modeled ILO estimate),"Commercial bank branches (per 100,000 adults)",Individuals using the Internet (% of population),Employment in industry (% of total employment) (modeled ILO estimate),"Air transport, freight (million ton-km)",Urban population (% of total),"Air transport, passengers carried",Predicted Residual-to-happiness ratio,Actual Residual-to-happiness ratio
11,12.666669,23036.702831,59.980000,13.890841,42.805461,30.725000,985.946746,55.169177,1.814317e+07,0.356355,0.416997
260,17.141161,27825.965488,61.408001,14.100982,57.431043,25.160000,714.003211,79.285000,4.696676e+07,0.389818,0.501202
121,14.423508,20879.217532,54.148998,11.553844,48.940434,19.660000,64.037777,63.398000,5.677816e+06,0.409757,0.425008
206,12.666669,15205.354928,57.484001,28.226565,70.330836,25.298000,7.355192,55.942000,2.442731e+06,0.349435,0.360905
91,2.955106,10629.969112,15.526000,1.012576,5.000000,3.148000,985.946746,22.677000,1.814317e+07,0.513971,0.612907
...,...,...,...,...,...,...,...,...,...,...,...
203,15.628126,15550.798552,56.962002,29.718761,67.056841,24.424999,4.737729,55.810000,2.486009e+06,0.353849,0.520270
219,6.451505,12131.027639,65.425003,23.813646,78.180775,22.334999,53.387000,88.429000,2.869266e+06,0.394839,0.324416
45,12.299308,2627.795919,38.984001,3.371190,14.119012,19.233999,985.946746,46.768000,1.814317e+07,0.458051,0.515622
439,12.666669,23036.702831,71.635002,13.890841,64.313364,21.150000,1.906885,88.183000,4.155865e+06,0.401561,0.340850


**Prediction of Dystopia Residual Values Using Positive Indicators of Urbanization**

In [10]:
# Fit a regression of the Dystopia residual value on the positive urbanization indicators.
# The train/test split made inside the model is random and unseeded, so results vary between runs.
urbanization_residual_value = LinearRegressionModel(data.copy(), URBANIZATION_INDICATORS, 'Dystopia residual')

  df_impute = df_thresh.fillna(df_thresh.mean())


In [11]:
urbanization_residual_value.display_properties()

R²: 0.2525656083377493
MSE: 0.008946377167764977
Y-intercept: 0.5251184113143699
Coefficients:


Unnamed: 0,variable,coefficient
0,"Manufacturing, value added (% of GDP)",-0.0002232389
1,"Industry (including construction), value added...",-7.646115e-07
2,Employment in services (% of total employment)...,-0.001136292
3,"Commercial bank branches (per 100,000 adults)",-0.0007215782
4,Individuals using the Internet (% of population),-0.0009020257
5,Employment in industry (% of total employment)...,-0.003491469
6,"Air transport, freight (million ton-km)",1.045672e-07
7,Urban population (% of total),0.001361245
8,"Air transport, passengers carried",2.986836e-11


In [12]:
urbanization_residual_value.visualize()

  for col_name, dtype in df.dtypes.iteritems():


In [13]:
urbanization_residual_value.get_predictions_table()

Unnamed: 0,"Manufacturing, value added (% of GDP)","Industry (including construction), value added per worker (constant 2010 US$)",Employment in services (% of total employment) (modeled ILO estimate),"Commercial bank branches (per 100,000 adults)",Individuals using the Internet (% of population),Employment in industry (% of total employment) (modeled ILO estimate),"Air transport, freight (million ton-km)",Urban population (% of total),"Air transport, passengers carried",Predicted Dystopia residual,Actual Dystopia residual
272,4.088699,20335.165590,74.816002,42.970990,68.119581,17.464001,985.946746,65.806000,5.269800e+05,2.114187,2.100170
390,12.666669,23036.702831,32.223999,13.890841,42.805461,16.719999,985.946746,55.169177,1.814317e+07,1.996150,2.448000
408,14.494605,10337.645388,52.027000,13.890841,55.500000,32.931000,12.150000,68.642000,4.194174e+06,2.021802,1.890251
188,12.666669,23036.702831,67.766998,13.890841,42.805461,15.679000,985.946746,55.169177,1.814317e+07,2.160292,2.183000
326,12.666669,23036.702831,56.477001,13.890841,42.805461,18.336000,985.946746,55.169177,1.814317e+07,2.076693,2.056000
...,...,...,...,...,...,...,...,...,...,...,...
88,12.666669,1488.734888,17.934999,0.680130,4.339255,9.014000,985.946746,40.980000,1.814317e+07,2.032599,2.066005
380,12.666669,23036.702831,41.855999,13.890841,42.805461,15.001000,985.946746,55.169177,1.814317e+07,2.060064,1.750000
75,12.666669,3139.215428,6.114000,3.176675,4.866224,2.240000,985.946746,12.078000,1.814317e+07,1.860836,1.833020
155,10.669083,11179.192393,26.315001,2.533005,11.400021,6.716000,985.946746,35.793000,1.814317e+07,1.991397,1.552312


# Analysis of the Effects of Negative Indicators of Urbanization on the Dystopia Residual

**Prediction of Dystopia Residual-to-Happiness Ratio Using Negative Indicators of Urbanization**

In [14]:
# Fit a regression of the residual-to-happiness ratio on the negative urbanization indicators.
# The train/test split made inside the model is random and unseeded, so results vary between runs.
neg_urbanization_residual_ratio = LinearRegressionModel(data.copy(), ANTI_URBANIZATION_INDICATORS, 'Residual-to-happiness ratio')

  df_impute = df_thresh.fillna(df_thresh.mean())


In [15]:
neg_urbanization_residual_ratio.display_properties()

R²: 0.2525656083377493
MSE: 0.008946377167764977
Y-intercept: 0.5251184113143699
Coefficients:


Unnamed: 0,variable,coefficient
0,"Manufacturing, value added (% of GDP)",-0.0002232389
1,"Industry (including construction), value added...",-7.646115e-07
2,Employment in services (% of total employment)...,-0.001136292
3,"Commercial bank branches (per 100,000 adults)",-0.0007215782
4,Individuals using the Internet (% of population),-0.0009020257
5,Employment in industry (% of total employment)...,-0.003491469
6,"Air transport, freight (million ton-km)",1.045672e-07
7,Urban population (% of total),0.001361245
8,"Air transport, passengers carried",2.986836e-11


In [16]:
neg_urbanization_residual_ratio.visualize()

  for col_name, dtype in df.dtypes.iteritems():


In [17]:
neg_urbanization_residual_ratio.get_predictions_table()

Unnamed: 0,Employment in agriculture (% of total employment) (modeled ILO estimate),"Agriculture, forestry, and fishing, value added per worker (constant 2010 US$)",Predicted Residual-to-happiness ratio,Actual Residual-to-happiness ratio
199,57.834000,1254.921546,0.473745,0.362816
14,49.266998,1508.763485,0.453802,0.425424
263,12.992000,20761.228907,0.369760,0.430641
332,1.220000,10760.259503,0.342125,0.248195
345,67.080002,569.989801,0.495260,0.155592
...,...,...,...,...
37,9.726000,12295.557531,0.361966,0.411206
395,32.280998,2956.223951,0.414281,0.359326
294,31.315001,1852.476881,0.412007,0.398777
250,65.716003,1178.308480,0.492098,0.398136


**Prediction of Dystopia Residual Values Using Negative Indicators of Urbanization**

In [18]:
# Fit a regression of the Dystopia residual value on the negative urbanization indicators.
# The train/test split made inside the model is random and unseeded, so results vary between runs.
neg_urbanization_residual_value = LinearRegressionModel(data.copy(), ANTI_URBANIZATION_INDICATORS, 'Dystopia residual')

  df_impute = df_thresh.fillna(df_thresh.mean())


In [19]:
neg_urbanization_residual_value.display_properties()

R²: 0.2525656083377493
MSE: 0.008946377167764977
Y-intercept: 0.5251184113143699
Coefficients:


Unnamed: 0,variable,coefficient
0,"Manufacturing, value added (% of GDP)",-0.0002232389
1,"Industry (including construction), value added...",-7.646115e-07
2,Employment in services (% of total employment)...,-0.001136292
3,"Commercial bank branches (per 100,000 adults)",-0.0007215782
4,Individuals using the Internet (% of population),-0.0009020257
5,Employment in industry (% of total employment)...,-0.003491469
6,"Air transport, freight (million ton-km)",1.045672e-07
7,Urban population (% of total),0.001361245
8,"Air transport, passengers carried",2.986836e-11


In [20]:
neg_urbanization_residual_value.visualize()

  for col_name, dtype in df.dtypes.iteritems():


In [21]:
neg_urbanization_residual_value.get_predictions_table()

Unnamed: 0,Employment in agriculture (% of total employment) (modeled ILO estimate),"Agriculture, forestry, and fishing, value added per worker (constant 2010 US$)",Predicted Dystopia residual,Actual Dystopia residual
298,37.080002,5415.614538,2.020636,1.87877
314,14.344000,20761.228907,2.065756,2.33600
3,38.588001,20761.228907,2.022735,2.19600
320,28.375000,2503.639450,2.035177,2.73117
154,67.958000,540.069722,1.964326,2.15604
...,...,...,...,...
184,18.858000,20761.228907,2.057746,1.39200
438,7.259000,20761.228907,2.078328,2.97468
197,58.311001,1255.041886,1.981667,1.78555
43,42.202000,969.486305,2.010164,1.63328
