In [1]:
import pandas as pd
import numpy as np
import altair as alt

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import metrics

In [2]:
# Import dataframe column names and helper functions.
# `%run -i` executes each script inside this notebook's namespace, so every name the scripts
# define becomes available to the cells below (presumably including URBANIZATION_INDICATORS and
# ANTI_URBANIZATION_INDICATORS, which are used later but not defined in this notebook - confirm).
%run -i columns.py
%run -i helper_functions.py

In [3]:
def create_train_test_sets(
    df: pd.DataFrame,
    indicators: list,
    random_state=None,
) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """
    Creates a train set and a test set from the given dataframe instance. The name and code of the country, its
    Dystopia residual value, and its Dystopia residual-to-happiness score ratio are always selected for the
    resulting datasets. Any other variable to be included in the datasets must be named in the given list of
    indicators.

    Columns with too many missing values are discarded, and the remaining missing values are imputed with the
    column means of the train set.

    Parameters:
        df (pd.DataFrame): The dataframe of developing countries to process and split into train and test sets
        indicators (list): The list of positive or negative indicators of urbanization to include in the
                           resulting datasets
        random_state (int, optional): Seed forwarded to train_test_split. Defaults to None, which produces a
                                      different random split on every call

    Returns: two dataframes, a train set which contains 80% of the original dataframe's entries, and a test set
             containing the remaining entries
    """
    columns = ['Country', 'Country code', 'Dystopia residual', 'Residual-to-happiness ratio'] + list(indicators)
    df = df[columns]

    # Discard any column for which fewer than 50% of the dataframe's entries hold a value
    threshold = 0.5
    df_thresh = df.dropna(axis=1, thresh=int(df.shape[0] * threshold))

    # Split before imputing, so that no statistic computed from the test rows reaches the train set
    train, test = train_test_split(df_thresh, test_size=0.2, random_state=random_state)

    # Impute the remaining NaN values using the mean imputation method. The means come from the train set
    # only and are reused for the test set. numeric_only=True skips the text columns (country name and
    # code), which cannot be averaged.
    train_means = train.mean(numeric_only=True)

    return train.fillna(train_means), test.fillna(train_means)

In [4]:
class LinearRegressionModel():
    """
    A multiple linear regression model.

    On construction, the given dataframe is split into a train set and a test set, a linear regression is
    fitted on the train set, and predictions are computed for the test set.
    """

    def __init__(self, df: pd.DataFrame, x: list, y: str):
        """
        Creates a new model using the given dataframe, in which the independent variables are in a list of
        variable names, x, and the name of the dependent variable is a string, y.

        Parameters:
            df (pd.DataFrame): The dataframe to split into train and test sets; a copy is used
            x (list): The names of the independent variables
            y (str): The name of the dependent variable
        """
        # Prepare the train and test datasets to be used by the model
        self.train, self.test = create_train_test_sets(df.copy(), x)
        # The train rows followed by the test rows; used by visualize()
        self.df = pd.concat([self.train.copy(), self.test.copy()])

        # Keep only the requested variables that are still present after preprocessing. The caller's order is
        # preserved (a set intersection would reorder them arbitrarily from one run to the next).
        available = set(self.train.columns)
        self.x = list(dict.fromkeys(name for name in x if name in available))
        self.y = y

        # Train the model using the train set, then calculate the predicted outputs using the test set
        self.model = LinearRegression()
        self.model.fit(self.train[self.x], self.train[self.y])
        self.predictions = self.model.predict(self.test[self.x])

    def get_predictions_table(self) -> pd.DataFrame:
        """
        Returns a copy of the test dataframe's independent variables, extended with the predicted and actual
        values of the dependent variable for all entries.
        """
        table = pd.DataFrame(self.test[self.x].copy())
        table[f'Predicted {self.y}'] = list(self.predictions)
        table[f'Actual {self.y}'] = list(self.test[self.y])

        return table

    def get_coefficients_table(self) -> pd.DataFrame:
        """
        Returns a dataframe that contains the coefficients of all independent variables.
        """
        coefficients = pd.DataFrame(self.train[self.x].columns, columns=['variable'])
        coefficients['coefficient'] = self.model.coef_

        return coefficients

    def get_intercept(self) -> float:
        """
        Returns the y-intercept of the model.
        """
        return self.model.intercept_

    def get_r_squared(self, use_test_set: bool = False) -> float:
        """
        Returns the R^2 value of the model.

        Parameters:
            use_test_set (bool): If False (the default), R^2 is computed on the train set the model was fitted
                                 on. If True, it is computed on the held-out test set, which is the same data
                                 used by get_mean_squared_error.
        """
        dataset = self.test if use_test_set else self.train
        return self.model.score(dataset[self.x], dataset[self.y])

    def get_mean_squared_error(self) -> float:
        """
        Returns the mean squared error of the model's predictions on the test set.
        """
        return metrics.mean_squared_error(self.test[self.y], self.predictions)

    def display_properties(self) -> pd.DataFrame:
        """
        Prints the R^2 value (computed on the train set), MSE (computed on the test set) and y-intercept of the
        model, and returns the coefficients table so that the notebook displays it.
        """
        print("R\u00B2:", self.get_r_squared())
        print("MSE:", self.get_mean_squared_error())

        print("Y-intercept:", self.get_intercept())
        print("Coefficients:")
        return self.get_coefficients_table()

    def get_scree_plot(self) -> alt.Chart:
        """
        Performs PCA on the train dataset, and returns a scree plot.
        """
        features = self.train[self.x]

        # Set K to be the number of independent variables
        k = features.shape[1]

        # Scale the independent variables, and perform PCA using them
        scaled_values = StandardScaler().fit(features).transform(features)
        pca = PCA(n_components=k).fit(scaled_values)

        # Produce a dataframe for the scree plot
        scree_plot_data = pd.DataFrame()
        scree_plot_data['indices'] = np.arange(k) + 1
        scree_plot_data['var_explained'] = pca.explained_variance_ratio_

        return alt.Chart(scree_plot_data, title='Scree plot').mark_line().encode(
            x=alt.X('indices', title='Principal component'),
            y=alt.Y('var_explained', title='Variance explained')
        )

    def visualize(self) -> alt.Chart:
        """
        Creates a scree plot and a 2D visualization of the data using PCA with a K value of 1.

        The 2D visualization is a scatter plot of the single principal component against the dependent
        variable, layered with a regression line and a text mark showing the slope of that line.
        """
        # Scale the inputs of the dataframe, and perform PCA with K = 1
        scaled_inputs = StandardScaler().fit(self.df[self.x]).transform(self.df[self.x])
        pca = PCA(n_components=1).fit(scaled_inputs).transform(scaled_inputs)

        # Create a dataframe for the PCA results. The PCA output follows the row order of self.df but gets a
        # fresh 0..n-1 index, while self.df still carries the shuffled labels of the original dataframe.
        # Assigning the columns as plain lists pairs them by position; assigning them as Series would align
        # them by label and attach each component to another row's values.
        results = pd.DataFrame(pca, columns=['PC0'])
        results[self.y] = list(self.df[self.y])
        results['Country'] = list(self.df['Country code'])

        # Produce a scatter plot of the principal component vs. the dependent variable
        chart = alt.Chart(results, title=f'PC0 vs {self.y}').mark_point().encode(
            x='PC0',
            y=self.y,
            color='Country:N',
            tooltip=['Country', 'PC0', self.y]
        )

        # Produce a linear regression for the scatter plot
        # Based on https://stackoverflow.com/questions/66604052/altair-extract-and-display-regression-coefficients
        regression = chart.transform_regression('PC0', self.y).mark_line(color='black')
        parameters = chart.transform_regression('PC0', self.y, params=True).transform_calculate(
            intercept='datum.coef[0]',
            slope='datum.coef[1]',
        ).mark_text(align='left', color='black').encode(
            x=alt.value(300),
            y=alt.value(20),
            text='slope:Q',
            tooltip=[alt.Tooltip('slope:Q', title='Slope'), alt.Tooltip('intercept:Q', title='Intercept')]
        )

        return self.get_scree_plot() | alt.layer(chart, regression, parameters).resolve_scale(
            color='independent'
        ).interactive()
        

In [5]:
# Load the developing-countries dataset and derive, for each row, the share of its happiness score
# that is left unexplained: the Dystopia residual divided by the happiness score
data = pd.read_csv('tmp/developing_countries.csv').assign(
    **{'Residual-to-happiness ratio': lambda frame: frame['Dystopia residual'] / frame['Happiness score']}
)

# Analysis of the Effects of Positive Indicators of Urbanization on the Dystopia Residual

**Prediction of Dystopia Residual-to-Happiness Ratio Using Positive Indicators of Urbanization**

In [6]:
# Fit the model on the positive indicators of urbanization. The train/test split is not seeded,
# so the metrics and tables below change every time this cell is re-run.
urbanization_residual_ratio = LinearRegressionModel(data.copy(), URBANIZATION_INDICATORS, 'Residual-to-happiness ratio')

  df_impute = df_thresh.fillna(df_thresh.mean())


In [7]:
# Print R², MSE and the y-intercept, then show the coefficient of each independent variable
urbanization_residual_ratio.display_properties()

R²: 0.26955236350526945
MSE: 0.008950195827393564
Y-intercept: 0.5302881901136032
Coefficients:


Unnamed: 0,variable,coefficient
0,Individuals using the Internet (% of population),-0.001022973
1,"Commercial bank branches (per 100,000 adults)",-0.00089397
2,"Air transport, passengers carried",-5.989313e-11
3,"Air transport, freight (million ton-km)",1.315919e-06
4,"Industry (including construction), value added...",-7.876177e-07
5,Employment in industry (% of total employment)...,-0.00349005
6,"Manufacturing, value added (% of GDP)",0.0005651692
7,Urban population (% of total),0.001560715
8,Employment in services (% of total employment)...,-0.00146215


In [8]:
# Scree plot next to a scatter plot of the first principal component against the ratio
urbanization_residual_ratio.visualize()

  for col_name, dtype in df.dtypes.iteritems():


In [9]:
# Test-set rows with the model's predicted ratio next to the actual ratio
urbanization_residual_ratio.get_predictions_table()

Unnamed: 0,Individuals using the Internet (% of population),"Commercial bank branches (per 100,000 adults)","Air transport, passengers carried","Air transport, freight (million ton-km)","Industry (including construction), value added per worker (constant 2010 US$)",Employment in industry (% of total employment) (modeled ILO estimate),"Manufacturing, value added (% of GDP)",Urban population (% of total),Employment in services (% of total employment) (modeled ILO estimate),Predicted Residual-to-happiness ratio,Actual Residual-to-happiness ratio
170,29.547163,14.056147,1.195778e+08,1893.881500,6167.172991,24.289000,15.199902,33.182000,30.587000,0.408855,0.495077
146,34.666001,7.128309,3.595160e+05,0.846429,5125.335603,18.601000,11.122444,54.749000,46.699001,0.442930,0.514546
133,13.855176,13.890841,7.074779e+06,1228.738320,1478.797267,9.949000,4.404619,19.428000,21.152000,0.470886,0.542722
310,42.805461,13.890841,1.814317e+07,985.946746,23036.702831,23.642000,12.666669,55.169177,34.686001,0.416182,0.534174
281,20.774000,4.139168,5.698290e+05,3.928770,2999.410963,7.774000,9.027016,35.455000,20.304001,0.506563,0.517725
...,...,...,...,...,...,...,...,...,...,...,...
112,69.803100,33.600583,1.782666e+06,0.775320,30000.004192,26.712000,12.504204,56.155000,64.059998,0.312927,0.478232
389,21.960000,13.890841,7.963830e+05,3.950726,7488.187230,16.638000,12.666669,26.982000,32.080002,0.433762,0.366802
164,42.805461,13.890841,1.814317e+07,985.946746,23036.702831,20.502001,12.666669,55.169177,47.587002,0.408278,0.422783
30,95.878136,13.890841,5.190484e+06,390.390000,45818.016765,35.278999,18.520359,89.186000,63.674000,0.317340,0.272080


# Analysis of the Effects of Negative Indicators of Urbanization on the Dystopia Residual

**Prediction of Dystopia Residual-to-Happiness Ratio Using Negative Indicators of Urbanization**

In [14]:
# Fit the model on the negative indicators of urbanization. The train/test split is not seeded,
# so the metrics and tables below change every time this cell is re-run.
neg_urbanization_residual_ratio = LinearRegressionModel(data.copy(), ANTI_URBANIZATION_INDICATORS, 'Residual-to-happiness ratio')

  df_impute = df_thresh.fillna(df_thresh.mean())


In [15]:
# Print R², MSE and the y-intercept, then show the coefficient of each independent variable
neg_urbanization_residual_ratio.display_properties()

R²: 0.1915397631910868
MSE: 0.010496814066344897
Y-intercept: 0.33257731904030047
Coefficients:


Unnamed: 0,variable,coefficient
0,Employment in agriculture (% of total employme...,0.002220869
1,"Agriculture, forestry, and fishing, value adde...",1.001758e-07


In [16]:
# Scree plot next to a scatter plot of the first principal component against the ratio
neg_urbanization_residual_ratio.visualize()

  for col_name, dtype in df.dtypes.iteritems():


In [17]:
# Test-set rows with the model's predicted ratio next to the actual ratio
neg_urbanization_residual_ratio.get_predictions_table()

Unnamed: 0,Employment in agriculture (% of total employment) (modeled ILO estimate),"Agriculture, forestry, and fishing, value added per worker (constant 2010 US$)",Predicted Residual-to-happiness ratio,Actual Residual-to-happiness ratio
50,56.755001,20761.228907,0.460703,0.265250
297,31.000999,20761.228907,0.403506,0.421918
111,12.455000,20761.228907,0.362318,0.411482
359,72.452003,20761.228907,0.495564,0.705463
424,15.408000,4646.074911,0.367262,0.198824
...,...,...,...,...
319,28.261999,2478.681201,0.395592,0.445484
101,26.770000,20761.228907,0.394110,0.302516
184,18.858000,20761.228907,0.376538,0.312388
161,28.722000,2184.958697,0.396584,0.385146
