# Notebook 1: Data Clean Up and Pre-Selection

In [52]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import os
import tqdm
import glob
import pandas as pd

## World Bank Development Data

Reading in and inspecting the data (available from https://databank.worldbank.org/source/world-development-indicators).

In [53]:
# Load the World Bank indicator export and preview its first rows.
wb_csv_path = "data/world_bank_data.csv"
wb_data = pd.read_csv(wb_csv_path)
wb_data.head(5)

Unnamed: 0,Country Name,Country Code,Series Name,Series Code,2019 [YR2019]
0,Australia,AUS,Access to electricity (% of population),EG.ELC.ACCS.ZS,100.0
1,Australia,AUS,Adjusted savings: carbon dioxide damage (% of ...,NY.ADJ.DCO2.GN.ZS,1.11294140526482
2,Australia,AUS,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,10.9366
3,Australia,AUS,Adolescents out of school (% of lower secondar...,SE.SEC.UNER.LO.ZS,1.97146999835968
4,Australia,AUS,"Adolescents out of school, female (% of female...",SE.SEC.UNER.LO.FE.ZS,2.19272994995117


We drop the last five rows, which contain only NaN values and metadata, as well as the `Series Code` and `Country Code` columns, which are not needed for our analysis.

In [54]:
# Shape before the clean-up, for comparison with the shape printed below.
print(wb_data.shape)

# Remove rows without a Series Name, then the two code columns.
wb_data = (
    wb_data
    .dropna(subset=["Series Name"])
    .drop(columns=["Series Code", "Country Code"])
)

print(wb_data.shape)

(383843, 5)
(383838, 3)


Reformatting the data such that the development indicators, e.g. *Access to electricity (% of population)*, each get their own column.

In [55]:
# Long -> wide: one row per country, one column per development indicator
# (Series Name).
wb_indexed = wb_data.set_index(["Country Name", "Series Name"])
wb_data = wb_indexed.unstack(level="Series Name")

# Unstacking leaves two column levels; drop the outer one so that only the
# indicator names remain as column labels.
wb_data.columns = wb_data.columns.droplevel(level=0)

# Keep the country names as a regular column as well, for later reformatting.
wb_data["Country Name"] = wb_data.index

## World Happiness Report 2019

Reading in the World Happiness Report (WHR) Data (available from https://www.kaggle.com/unsdsn/world-happiness).
Since we do not plan to analyse our data over time, we focus on the 2019 Report.
We choose the year 2019 over the year 2020 because we are not interested in the effect of COVID-19 on happiness, but in more general correlations. Furthermore, due to COVID-19, the 2020 WHR has a lower sample size.

In [58]:
# Load the 2019 World Happiness Report and collect its country names.
whr_data = pd.read_csv("data/world_happiness_report_2019.csv")
whr_countries = [name for name in whr_data["Country (region)"]]

# Country names as they appear in the (pivoted) World Bank data.
wb_countries = wb_data["Country Name"]

Country Name
Afghanistan                                    Afghanistan
Africa Eastern and Southern    Africa Eastern and Southern
Africa Western and Central      Africa Western and Central
Albania                                            Albania
Algeria                                            Algeria
                                          ...             
West Bank and Gaza                      West Bank and Gaza
World                                                World
Yemen, Rep.                                    Yemen, Rep.
Zambia                                              Zambia
Zimbabwe                                          Zimbabwe
Name: Country Name, Length: 266, dtype: object


In [None]:
# Country names that appear in both data sets with exactly the same spelling.
intersect_countries = set(wb_countries).intersection(whr_countries)

# Symmetric difference: names that appear in only one of the two sources.
non_intersection = set(wb_countries) ^ set(whr_countries)

# WHR names without an exact World Bank match. The WHR names are held in
# `whr_countries`; no `happy_country_list` is defined in this notebook.
happy_non_intersection = set(whr_countries).intersection(non_intersection)

# Display the unmatched WHR names (key=str keeps the sort safe if a name is missing).
sorted(happy_non_intersection, key=str)

Countries that are not contained in the World Bank data:
- Taiwan :(
- Cyprus

In [None]:
# list of tuples (happiness report name, world bank name)
# NOTE(review): the "Bosnia and Herzegovina " key ends with a space; presumably
# that is how the name is spelled in the WHR file -- confirm against the data.
happy_country_list_renamed = [
("Swaziland", "Eswatini"),
("Macedonia", "North Macedonia"),
("Kyrgyzstan", "Kyrgyz Republic"),
("Yemen", "Yemen, Rep."),
("Gambia", "Gambia, The"),
("Ivory Coast", "Cote d'Ivoire"),
("Venezuela", "Venezuela, RB"),
("South Korea", "Korea, Rep."),
("Laos", "Lao PDR"),
("Hong Kong", "Hong Kong SAR, China"),
("Congo (Kinshasa)", "Congo, Dem. Rep."),
("Palestinian Territories", "West Bank and Gaza"),
("Congo (Brazzaville)", "Congo, Rep."),
("Iran", "Iran, Islamic Rep."),
("Egypt", "Egypt, Arab Rep."),
("Russia", "Russian Federation"),
("Slovakia", "Slovak Republic"),
("Bosnia and Herzegovina ", "Bosnia and Herzegovina"),
("Syria", "Syrian Arab Republic")]

# NOTE(review): `happiness` is not defined anywhere earlier in this notebook; it
# is assumed to be the WHR frame loaded as `whr_data`. Working on a copy leaves
# `whr_data` as loaded and lets this cell be re-run safely.
happiness = whr_data.copy()

# Rename the WHR spellings to the World Bank spellings in one vectorized pass
# (exact, whole-value matches only).
happiness["Country (region)"] = happiness["Country (region)"].replace(
    dict(happy_country_list_renamed)
)


In [None]:
# Remove the WHR entries listed above as missing from the World Bank data.
# na=True makes rows with a missing name count as matches, so they are dropped
# as well (the same outcome as comparing the match result with False).
# NOTE(review): this is a substring match, so any other name containing
# "Cyprus" (e.g. a "Northern Cyprus" entry, if present) is removed too -- confirm
# that this is intended.
unmatched_mask = happiness["Country (region)"].str.contains("Taiwan|Cyprus", na=True)
happiness = happiness[~unmatched_mask]

# NOTE(review): `df_pivot` is not defined earlier in this notebook; it is assumed
# to be the pivoted World Bank frame `wb_data`. Building it from `wb_data` makes
# the cell re-runnable. Only countries that also appear in the WHR are kept.
df_pivot = wb_data[wb_data["Country Name"].isin(happiness["Country (region)"])].copy()
df_pivot.index = df_pivot["Country Name"]

In [None]:
# Maximum number of ".." entries a column may contain before it is dropped.
# NOTE(review): ".." is presumably the World Bank placeholder for a missing
# value -- confirm against the raw export.
threshold = 10

# Count the ".." entries of every column in a single vectorized pass.
placeholder_counts = (df_pivot == "..").sum()

# Column labels whose placeholder count exceeds the threshold, in column order.
delete_columns = placeholder_counts[placeholder_counts > threshold].index.tolist()

In [None]:
# Drop the sparsely populated indicator columns. errors="ignore" keeps the cell
# re-runnable: on a second run the columns are already gone, which would
# otherwise raise a KeyError.
df_pivot = df_pivot.drop(columns=delete_columns, errors="ignore")

# Number of columns that remain after the drop.
print(len(df_pivot.columns))