# Dimensionality reduction of world happiness 2015 dataset using UMAP 

This notebook was executed on Google Colab.

## Install and import the required modules.

In [252]:
! pip install umap-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [253]:
from bokeh.io import output_notebook, show
from bokeh.models import HoverTool, PanTool, ResetTool, BoxZoomTool
from bokeh.models import Range1d
from bokeh.models import Legend, LegendItem
from bokeh.palettes import brewer
from bokeh.plotting import figure
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from umap import UMAP
output_notebook()

## Import data

In [254]:
data_2015 = pd.read_csv('Data/world_happiness_2015.csv')

In [255]:
data_2015.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [256]:
data_2015.describe()

Unnamed: 0,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
count,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0
mean,79.493671,5.375734,0.047885,0.846137,0.991046,0.630259,0.428615,0.143422,0.237296,2.098977
std,45.754363,1.14501,0.017146,0.403121,0.272369,0.247078,0.150693,0.120034,0.126685,0.55355
min,1.0,2.839,0.01848,0.0,0.0,0.0,0.0,0.0,0.0,0.32858
25%,40.25,4.526,0.037268,0.545808,0.856823,0.439185,0.32833,0.061675,0.150553,1.75941
50%,79.5,5.2325,0.04394,0.910245,1.02951,0.696705,0.435515,0.10722,0.21613,2.095415
75%,118.75,6.24375,0.0523,1.158448,1.214405,0.811013,0.549092,0.180255,0.309883,2.462415
max,158.0,7.587,0.13693,1.69042,1.40223,1.02525,0.66973,0.55191,0.79588,3.60214


## Preprocess data

Define a class to extract data from a pandas DataFrame.

In [257]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks named columns out of a pandas DataFrame.

    Args:
        feature_names: Column label(s) to keep, in the order they should
            appear in the output.
    """

    def __init__(self, feature_names):
        # Stored under the same name as the __init__ argument so that
        # scikit-learn's get_params()/set_params()/clone() can find it.
        self.feature_names = feature_names

    def fit(self, X, Y=None):
        """Nothing to learn; return self so the step can be chained."""
        return self

    def transform(self, X):
        """Return the selected columns of ``X`` as a NumPy array."""
        return X[self.feature_names].values

Define the names of the columns that hold numerical data.

In [258]:
# Numeric columns used as input features, one per line.
num_attr_names = [
    'Economy (GDP per Capita)',
    'Family',
    'Health (Life Expectancy)',
    'Freedom',
    'Trust (Government Corruption)',
    'Generosity',
    'Dystopia Residual',
]

Create a pipeline for the numerical attributes, rescaling them after selection.

In [259]:
# Two stages: select the numeric columns, then min-max rescale each of them.
num_attrs_pipeline = Pipeline(
    steps=[
        ('select_num_attrs', FeatureSelector(num_attr_names)),
        ('scaler', MinMaxScaler()),
    ]
)

Run the pipeline to prepare the data.

In [260]:
prepared_data = num_attrs_pipeline.fit_transform(data_2015)

In [261]:
prepared_data.shape

(158, 7)

In [262]:
prepared_data

array([[0.82613197, 0.96240274, 0.91824433, ..., 0.76059502, 0.37289541,
        0.66862987],
       [0.77041209, 1.        , 0.92449646, ..., 0.25629179, 0.54819822,
        0.72503024],
       [0.78411282, 0.97029731, 0.85309924, ..., 0.87617546, 0.42894657,
        0.66088906],
       ...,
       [0.39232853, 0.33866769, 0.70415021, ..., 0.34255585, 0.59279037,
        0.        ],
       [0.00905101, 0.29657759, 0.21844428, ..., 0.18231233, 0.247864  ,
        0.45957306],
       [0.12344861, 0.09980531, 0.27742502, ..., 0.19443388, 0.2095919 ,
        0.37838928]])

## Helper functions

In [263]:
# Bokeh marker names; the marker at position i is used for regions[i].
used_markers = [
    'hex_dot', 'circle', 'diamond', 'triangle', 'inverted_triangle',
    'hex', 'plus', 'square', 'star', 'circle_cross',
]
# Distinct regions of the dataset as a plain Python list.
regions = pd.unique(data_2015['Region']).tolist()


def to_marker(region):
  """Return the marker name paired with `region` by its position in `regions`."""
  region_position = regions.index(region)
  return used_markers[region_position]

In [264]:
def plot_preprocess(umap_data):
  """Attach the columns needed for plotting to a two-column UMAP embedding.

  The frame is modified in place and also returned. Country, score and region
  are taken from the global `data_2015`; pandas aligns them with `umap_data`
  on the index.

  Args:
    umap_data: DataFrame with exactly two columns (the embedding coordinates).

  Returns:
    The same DataFrame with columns 'C1', 'C2', 'Country', 'Happiness Score',
    'Happiness Color' and 'markers'.
  """
  umap_data.columns = ['C1', 'C2']
  umap_data['Country'] = data_2015['Country']
  umap_data['Happiness Score'] = data_2015['Happiness Score']

  # One colour per unit-wide score interval. The colour count is derived from
  # the same floor/ceil edges that delimit the bins, so the two always agree
  # (int() truncation over-counted by one when the maximum was a whole number).
  lower_edge = int(np.floor(data_2015['Happiness Score'].min()))
  upper_edge = int(np.ceil(data_2015['Happiness Score'].max()))
  nr_colors = upper_edge - lower_edge
  # NOTE(review): brewer['RdBu'] only offers a limited set of palette sizes
  # (3 to 11 according to the Bokeh docs); other score ranges raise KeyError.
  umap_data['Happiness Color'] = pd.cut(
      data_2015['Happiness Score'],
      bins=np.linspace(lower_edge, upper_edge, nr_colors + 1),
      labels=brewer['RdBu'][nr_colors],
      # Without this a score exactly equal to the lowest edge gets no colour.
      include_lowest=True)
  umap_data['markers'] = data_2015['Region'].map(to_marker)
  return umap_data

(The legend implementation below is rather hacky, as this was my first time using Bokeh.)

In [265]:
def plot_umap(umap_data):
  """Build a Bokeh scatter plot of a preprocessed UMAP embedding.

  Args:
    umap_data: DataFrame as returned by `plot_preprocess`; the columns 'C1',
      'C2', 'Country', 'Happiness Color' and 'markers' are read.

  Returns:
    The Bokeh figure, with a region legend on the right and a colour legend
    above the plot. The figure is not shown; pass it to `show`.
  """
  hovertool = HoverTool(tooltips=[('Country', '@Country')])
  # `width`/`height` instead of `plot_width`/`plot_height`: the latter were
  # removed in Bokeh 3, while the former are accepted by older releases too.
  fig = figure(tools=[PanTool(), BoxZoomTool(), hovertool, ResetTool()],
               width=500, height=400)
  fig.scatter('C1', 'C2', source=umap_data, fill_color='Happiness Color',
              marker='markers', size=8, alpha=0.85)

  # Hidden proxy renderer with one glyph per marker type; each legend entry
  # points at one glyph of it through `index`.
  proxy_coords = np.ones(len(used_markers))
  region_proxy = fig.scatter(x=proxy_coords, y=proxy_coords, marker=used_markers)
  region_proxy.visible = False
  # zip() pairs each region with its marker and stops at the shorter list, so
  # having fewer regions than markers no longer raises IndexError.
  region_items = [
      LegendItem(label=region, renderers=[region_proxy], index=i)
      for i, (region, _marker) in enumerate(zip(regions, used_markers))
  ]
  fig.add_layout(Legend(items=region_items), 'right')

  # Second hidden proxy carrying the two extreme palette colours.
  color_proxy = fig.scatter(x=[0, 0], y=[0, 0], marker=["asterisk", "asterisk"],
                            color=['#b2182b', '#2166ac'])
  color_proxy.visible = False
  color_items = [
      LegendItem(label="red = happy", renderers=[color_proxy], index=0),
      LegendItem(label="blue = unhappy", renderers=[color_proxy], index=1),
  ]
  fig.add_layout(Legend(items=color_items), 'above')

  # Fix the ranges to the data (plus a margin of 1) so that the hidden proxy
  # glyphs do not influence the visible area.
  fig.x_range = Range1d(umap_data['C1'].min() - 1, umap_data['C1'].max() + 1)
  fig.y_range = Range1d(umap_data['C2'].min() - 1, umap_data['C2'].max() + 1)
  return fig

## UMAP

We apply the UMAP algorithm to our dataset with the default settings, fixing only the random state for reproducibility.

In [266]:
# Default UMAP settings; only the seed is fixed.
umap_data = plot_preprocess(
    pd.DataFrame(UMAP(random_state=42).fit_transform(prepared_data))
)

The countries are projected onto a zig-zag pattern.
The top left contains a cluster of very happy countries, mostly consisting of the first world countries. These are connected to moderately happy countries in the center, with many Latin American countries. To the right of those there are many unhappy Eastern European countries and at the bottom there is a clear cluster of very unhappy Sub-Saharan African countries. We can vaguely see a 2-simplex in the first world, Latin-American and Eastern European countries. We will track how this structure evolves when changing the hyperparameters. In this simplex, I ignore the presence of many Asian countries to keep the analysis brief.

In [267]:
fig = plot_umap(umap_data)
show(fig)

## Hyperparameter Exploration

We explore the two hyperparameters mentioned in the lecture: the number of neighbors (`n_neighbors`) and the minimum distance (`min_dist`). First, we change the number of neighbors.

In [268]:
# Larger neighborhood: n_neighbors=50, same seed as before.
umap_data2 = plot_preprocess(
    pd.DataFrame(
        UMAP(n_neighbors=50, random_state=42).fit_transform(prepared_data)
    )
)

Increasing the number of neighbors makes the projection lose its zig-zag shape.
Now the happy countries are at the top right and the unhappy countries on the bottom left. This is logical, since increasing the number of neighbors makes the algorithm focus more on global structure and less on local structure. The important global structure in this dataset seems to map nicely to the happiness score.

In [269]:
fig2 = plot_umap(umap_data2)
show(fig2)

In [270]:
# Smaller neighborhood: n_neighbors=5, same seed as before.
umap_data3 = plot_preprocess(
    pd.DataFrame(
        UMAP(n_neighbors=5, random_state=42).fit_transform(prepared_data)
    )
)

Decreasing the number of neighbors also makes the projection lose its zig-zag shape.
The projected data now lies on a crescent shape with the first world countries on one end and the Sub-Saharan African countries on the other end. The simplex between first world, Eastern Europe and Latin America still seems to be present. But first world and Sub-Saharan Africa are closer together now. This seems logical, since decreasing the number of neighbors makes the algorithm focus more on local structure and less on global structure.

In [271]:
fig3 = plot_umap(umap_data3)
show(fig3)

Next, we change the minimum distance hyperparameter.

In [272]:
# Smaller minimum distance: min_dist=0.01, same seed as before.
umap_data4 = plot_preprocess(
    pd.DataFrame(
        UMAP(min_dist=0.01, random_state=42).fit_transform(prepared_data)
    )
)

Decreasing the value of the minimum distance seems to have little effect.

In [273]:
fig4 = plot_umap(umap_data4)
show(fig4)

In [274]:
# Larger minimum distance: min_dist=0.5, same seed as before.
umap_data5 = plot_preprocess(
    pd.DataFrame(
        UMAP(min_dist=0.5, random_state=42).fit_transform(prepared_data)
    )
)

Increasing the value of the minimum distance causes the data to become much more spread out. The simplex between first world, Latin America and Eastern Europe is no longer present. These clusters are now present top to bottom, with Sub-Saharan countries to the right.

In [275]:
fig5 = plot_umap(umap_data5)
show(fig5)

We can conclude that the default values for UMAP seem to be sane choices.