In [1]:
import pandas as pd
import numpy as np
import altair as alt

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import metrics

In [2]:
%run -i columns.py
%run -i helper_functions.py

In [3]:
def split_X_Y(df: pd.DataFrame, output_col: str):
    cols = set(df.columns)
    independent_cols = set(URBANIZATION_INDICATORS + ANTI_URBANIZATION_INDICATORS)
    mutual = cols.intersection(independent_cols)
    independent_vars = df[list(mutual)]

    dependent_cols = list(cols - independent_cols)
    dependent_vars = df[output_col]

    return independent_vars, dependent_vars

In [4]:
def create_train_test_sets(df: pd.DataFrame, indicators: list):
    columns = ['Country', 'Country code', 'Dystopia residual', 'Residual-to-happiness ratio'] + indicators
    df = df[columns] 
    
    threshold = 0.5
    df_thresh = df.dropna(axis=1, thresh=int(df.shape[0] * threshold))

    df_impute = df_thresh.fillna(df_thresh.mean())
    df_impute
    
    train, test = train_test_split(df_impute, test_size=0.2)
    
    return train, test

In [121]:
class LinearRegressionModel():
    def __init__(self, df: pd.DataFrame, x: list, y: str):
        self.train, self.test = create_train_test_sets(df.copy(), x)
        self.df = pd.concat([self.train.copy(), self.test.copy()])
        
        self.x = set(x).intersection(self.train.columns)
        self.y = y
        
        self.model = LinearRegression()
        self.model.fit(train[self.x], train[self.y])
        self.predictions = self.model.predict(test[self.x])
        
    def get_predictions_table(self) -> pd.DataFrame:
        table = pd.DataFrame(self.test[self.x].copy())
        table[f'Predicted {self.y}'] = list(self.predictions)
        table[f'Actual {self.y}'] = list(test[self.y])
        
        return table
    
    def get_coefficients_table(self) -> pd.DataFrame:
        coefficients = pd.DataFrame(self.train[self.x].columns, columns=['variable'])
        coefficients['coefficient'] = self.model.coef_

        return coefficients
    
    def get_intercept(self) -> float:
        return self.model.intercept_
    
    def get_r_squared(self) -> float:
        return self.model.score(train[self.x], train[self.y])
    
    def get_mean_squared_error(self) -> float:
        return metrics.mean_squared_error(test[self.y], self.predictions)
    
    def get_scree_plot(self) -> alt.Chart:
        k = len(self.train[self.x].columns)
        
        scaled_values = StandardScaler().fit(train[self.x]).transform(train[self.x])
        pca = PCA(n_components=k).fit(scaled_values)
        
        scree_plot_data = pd.DataFrame()
        scree_plot_data['indices'] = np.arange(k) + 1
        scree_plot_data['var_explained'] = pca.explained_variance_ratio_
        
        return alt.Chart(scree_plot_data, title='Scree plot').mark_line().encode(
            x=alt.X('indices', title='Principal component'),
            y=alt.Y('var_explained', title='Variance explained')
        )
    
    def get_pca_chart(self, k: int) -> alt.Chart:        
        scaled_inputs = StandardScaler().fit(self.df[self.x]).transform(self.df[self.x])
        pca = PCA(n_components=k).fit(scaled_inputs).transform(scaled_inputs)
        
        col_names = list()
        
        for i in range(k):
            col_names.append(f'PC{i}')
        
        results = pd.DataFrame(pca, columns=col_names)
            
        results[self.y] = self.df[self.y]
        
        return alt.Chart(results).mark_point().encode(
            x='PC0',
            y=self.y
        ).interactive()
        

In [122]:
data = pd.read_csv('tmp/developing_countries.csv')

# Calculate the percentage of unexplained happiness for each happiness score
data['Residual-to-happiness ratio'] = data['Dystopia residual'] / data['Happiness score']

# Analysis of the Effects of Urbanization on the Dystopia Residual

**Prediction of Dystopia Residual-to-Happiness Ratio Using Positive Indicators of Urbanization**

In [123]:
urbanization_residual_ratio = LinearRegressionModel(data.copy(), URBANIZATION_INDICATORS, 'Residual-to-happiness ratio')

  df_impute = df_thresh.fillna(df_thresh.mean())
  self.model.fit(train[self.x], train[self.y])
  self.predictions = self.model.predict(test[self.x])


In [124]:
urbanization_residual_ratio.get_scree_plot()

  k = len(self.train[self.x].columns)
  scaled_values = StandardScaler().fit(train[self.x]).transform(train[self.x])
  scaled_values = StandardScaler().fit(train[self.x]).transform(train[self.x])
  for col_name, dtype in df.dtypes.iteritems():


In [125]:
urbanization_residual_ratio.get_pca_chart(1)

  scaled_inputs = StandardScaler().fit(self.df[self.x]).transform(self.df[self.x])
  scaled_inputs = StandardScaler().fit(self.df[self.x]).transform(self.df[self.x])
  for col_name, dtype in df.dtypes.iteritems():


In [126]:
urbanization_residual_ratio.get_predictions_table()

  table = pd.DataFrame(self.test[self.x].copy())


Unnamed: 0,"Commercial bank branches (per 100,000 adults)",Employment in industry (% of total employment) (modeled ILO estimate),Urban population (% of total),Individuals using the Internet (% of population),"Air transport, freight (million ton-km)","Industry (including construction), value added per worker (constant 2010 US$)",Employment in services (% of total employment) (modeled ILO estimate),"Air transport, passengers carried","Manufacturing, value added (% of GDP)",Predicted Residual-to-happiness ratio,Actual Residual-to-happiness ratio
337,26.625477,30.124001,53.936000,63.747282,2.804500,27243.212314,47.098999,4.425717e+06,20.851546,0.441100,0.371285
299,4.977607,11.683000,47.838000,24.500000,24.801964,17634.858318,51.237000,4.613169e+06,9.428437,0.380257,0.352723
51,13.070995,22.528000,68.393000,35.562271,9.934797,7153.970835,49.624001,4.115271e+06,10.191191,0.313733,0.300975
40,20.424837,15.152000,45.495000,44.575740,2.689022,9243.607206,69.795998,9.904990e+05,6.503924,0.337491,0.301578
45,3.371190,19.233999,46.768000,14.119012,985.946746,2627.795919,38.984001,1.814317e+07,12.299308,0.476605,0.540127
...,...,...,...,...,...,...,...,...,...,...,...
148,13.890841,18.618000,55.169177,42.805461,985.946746,23036.702831,47.520000,1.814317e+07,12.666669,0.353685,0.376911
53,13.262896,21.629999,69.080000,43.832276,8.457145,7517.632186,50.182999,3.302463e+06,10.493508,0.386954,0.302516
134,13.890841,10.873000,19.866000,15.366924,1500.147530,1606.852714,21.096001,8.242114e+06,5.577928,0.375232,0.361588
286,13.890841,15.995000,55.169177,42.805461,985.946746,23036.702831,33.865002,1.814317e+07,12.666669,0.409013,0.572493


In [127]:
urbanization_residual_ratio.get_coefficients_table()

  coefficients = pd.DataFrame(self.train[self.x].columns, columns=['variable'])


Unnamed: 0,variable,coefficient
0,"Commercial bank branches (per 100,000 adults)",-0.0007078562
1,Employment in industry (% of total employment)...,-0.003266644
2,Urban population (% of total),0.001228264
3,Individuals using the Internet (% of population),-0.0008374537
4,"Air transport, freight (million ton-km)",1.686822e-07
5,"Industry (including construction), value added...",-7.914546e-07
6,Employment in services (% of total employment)...,-0.001363755
7,"Air transport, passengers carried",2.942829e-11
8,"Manufacturing, value added (% of GDP)",-0.0009068937


In [128]:
urbanization_residual_ratio.get_r_squared()

  return self.model.score(train[self.x], train[self.y])


0.2665441048582349

In [129]:
urbanization_residual_ratio.get_mean_squared_error()

0.009479992110390882

**Prediction of Dystopia Residual Values Using Positive Indicators of Urbanization**

In [131]:
urbanization_residual_value = LinearRegressionModel(data.copy(), URBANIZATION_INDICATORS, 'Dystopia residual')

  df_impute = df_thresh.fillna(df_thresh.mean())
  self.model.fit(train[self.x], train[self.y])
  self.predictions = self.model.predict(test[self.x])


In [132]:
urbanization_residual_value.get_scree_plot()

  k = len(self.train[self.x].columns)
  scaled_values = StandardScaler().fit(train[self.x]).transform(train[self.x])
  scaled_values = StandardScaler().fit(train[self.x]).transform(train[self.x])
  for col_name, dtype in df.dtypes.iteritems():


In [133]:
urbanization_residual_value.get_pca_chart(1)

  scaled_inputs = StandardScaler().fit(self.df[self.x]).transform(self.df[self.x])
  scaled_inputs = StandardScaler().fit(self.df[self.x]).transform(self.df[self.x])
  for col_name, dtype in df.dtypes.iteritems():


In [134]:
urbanization_residual_value.get_predictions_table()

  table = pd.DataFrame(self.test[self.x].copy())


Unnamed: 0,"Commercial bank branches (per 100,000 adults)",Employment in industry (% of total employment) (modeled ILO estimate),Urban population (% of total),Individuals using the Internet (% of population),"Air transport, freight (million ton-km)","Industry (including construction), value added per worker (constant 2010 US$)",Employment in services (% of total employment) (modeled ILO estimate),"Air transport, passengers carried","Manufacturing, value added (% of GDP)",Predicted Dystopia residual,Actual Dystopia residual
328,31.015019,31.447001,60.178000,73.300700,155.552044,31371.951793,57.967999,5.497163e+06,18.053830,2.121123,1.595410
219,23.813646,22.334999,88.429000,78.180775,53.387000,12131.027639,65.425003,2.869266e+06,6.451505,2.001404,1.904000
238,2.318711,7.306000,36.522000,9.800000,15.025060,2960.876852,24.268999,4.864250e+05,12.666669,1.950485,1.797723
9,5.234463,31.184999,71.459000,42.945527,21.591025,28673.186365,60.471001,6.093416e+06,32.844325,1.932472,1.697584
395,12.537120,23.681000,47.694000,39.316127,2136.728479,16038.858654,44.039001,5.644764e+07,27.503507,2.017698,2.589910
...,...,...,...,...,...,...,...,...,...,...,...
202,29.766144,24.483999,55.696000,65.317025,3.217000,16140.396982,56.056999,2.427047e+06,15.621025,1.909820,1.931290
394,13.890841,7.073000,55.169177,42.805461,985.946746,23036.702831,26.576000,1.814317e+07,12.666669,1.920720,1.587000
252,7.121695,11.218000,51.089000,15.199127,985.946746,18325.909735,33.349998,2.481580e+05,12.666669,2.026154,1.694400
64,20.400619,20.868999,86.042000,60.872540,1513.832873,24653.576195,68.948997,9.414238e+07,10.783372,2.093000,2.948910


In [135]:
urbanization_residual_value.get_r_squared()

  return self.model.score(train[self.x], train[self.y])


0.07640721681527174

In [136]:
urbanization_residual_ratio.get_predictions_table()

  table = pd.DataFrame(self.test[self.x].copy())


Unnamed: 0,"Commercial bank branches (per 100,000 adults)",Employment in industry (% of total employment) (modeled ILO estimate),Urban population (% of total),Individuals using the Internet (% of population),"Air transport, freight (million ton-km)","Industry (including construction), value added per worker (constant 2010 US$)",Employment in services (% of total employment) (modeled ILO estimate),"Air transport, passengers carried","Manufacturing, value added (% of GDP)",Predicted Residual-to-happiness ratio,Actual Residual-to-happiness ratio
337,26.625477,30.124001,53.936000,63.747282,2.804500,27243.212314,47.098999,4.425717e+06,20.851546,0.441100,0.371285
299,4.977607,11.683000,47.838000,24.500000,24.801964,17634.858318,51.237000,4.613169e+06,9.428437,0.380257,0.352723
51,13.070995,22.528000,68.393000,35.562271,9.934797,7153.970835,49.624001,4.115271e+06,10.191191,0.313733,0.300975
40,20.424837,15.152000,45.495000,44.575740,2.689022,9243.607206,69.795998,9.904990e+05,6.503924,0.337491,0.301578
45,3.371190,19.233999,46.768000,14.119012,985.946746,2627.795919,38.984001,1.814317e+07,12.299308,0.476605,0.540127
...,...,...,...,...,...,...,...,...,...,...,...
148,13.890841,18.618000,55.169177,42.805461,985.946746,23036.702831,47.520000,1.814317e+07,12.666669,0.353685,0.376911
53,13.262896,21.629999,69.080000,43.832276,8.457145,7517.632186,50.182999,3.302463e+06,10.493508,0.386954,0.302516
134,13.890841,10.873000,19.866000,15.366924,1500.147530,1606.852714,21.096001,8.242114e+06,5.577928,0.375232,0.361588
286,13.890841,15.995000,55.169177,42.805461,985.946746,23036.702831,33.865002,1.814317e+07,12.666669,0.409013,0.572493


# Analysis of the Effects of Negative Indicators of Urbanization on the Dystopia Residual

In [137]:
train, test = create_train_test_sets(data.copy(), ANTI_URBANIZATION_INDICATORS)

  df_impute = df_thresh.fillna(df_thresh.mean())


**Prediction of Dystopia Residual-to-Happiness Ratio Using Negative Indicators of Urbanization**

In [139]:
neg_urbanization_residual_ratio = LinearRegressionModel(data.copy(), ANTI_URBANIZATION_INDICATORS, 'Residual-to-happiness ratio')

  df_impute = df_thresh.fillna(df_thresh.mean())
  self.model.fit(train[self.x], train[self.y])
  self.predictions = self.model.predict(test[self.x])


In [140]:
neg_urbanization_residual_ratio.get_predictions_table()

  table = pd.DataFrame(self.test[self.x].copy())


Unnamed: 0,Employment in agriculture (% of total employment) (modeled ILO estimate),"Agriculture, forestry, and fishing, value added per worker (constant 2010 US$)",Predicted Residual-to-happiness ratio,Actual Residual-to-happiness ratio
292,70.436996,559.361361,0.349199,0.298068
64,10.182000,10928.566099,0.384476,0.289800
217,12.567000,6158.450895,0.405952,0.471702
165,4.900000,21938.835427,0.412365,0.453817
452,53.916000,20761.228907,0.363703,0.437262
...,...,...,...,...
310,41.671001,20761.228907,0.354385,0.337021
81,30.773001,1595.459139,0.491101,0.807006
283,51.689999,1595.748718,0.479258,0.582566
317,20.177999,6780.814928,0.347046,0.235477


In [141]:
neg_urbanization_residual_ratio.get_r_squared()

  return self.model.score(train[self.x], train[self.y])


0.16270763082314998

In [142]:
neg_urbanization_residual_ratio.get_scree_plot()

  k = len(self.train[self.x].columns)
  scaled_values = StandardScaler().fit(train[self.x]).transform(train[self.x])
  scaled_values = StandardScaler().fit(train[self.x]).transform(train[self.x])
  for col_name, dtype in df.dtypes.iteritems():


In [143]:
neg_urbanization_residual_ratio.get_pca_chart(1)

  scaled_inputs = StandardScaler().fit(self.df[self.x]).transform(self.df[self.x])
  scaled_inputs = StandardScaler().fit(self.df[self.x]).transform(self.df[self.x])
  for col_name, dtype in df.dtypes.iteritems():


**Prediction of Dystopia Residual Values Using Positive Indicators of Urbanization**

In [145]:
neg_urbanization_residual_value = LinearRegressionModel(data.copy(), ANTI_URBANIZATION_INDICATORS, 'Dystopia residual')

  df_impute = df_thresh.fillna(df_thresh.mean())
  self.model.fit(train[self.x], train[self.y])
  self.predictions = self.model.predict(test[self.x])


In [146]:
neg_urbanization_residual_value.get_r_squared()

  return self.model.score(train[self.x], train[self.y])


0.015964304740829993

In [147]:
neg_urbanization_residual_value.get_scree_plot()

  k = len(self.train[self.x].columns)
  scaled_values = StandardScaler().fit(train[self.x]).transform(train[self.x])
  scaled_values = StandardScaler().fit(train[self.x]).transform(train[self.x])
  for col_name, dtype in df.dtypes.iteritems():


In [148]:
neg_urbanization_residual_value.get_pca_chart(1)

  scaled_inputs = StandardScaler().fit(self.df[self.x]).transform(self.df[self.x])
  scaled_inputs = StandardScaler().fit(self.df[self.x]).transform(self.df[self.x])
  for col_name, dtype in df.dtypes.iteritems():
