In [2]:
import pandas as pd
import numpy as np

from helper.ISO3166 import ISO3166 as country_names

# Cleaning country-level data.

This notebook imports, cleans, and merges country-level data. 
It created the "country_data_all_new.csv" dataset.

9 databases were used:

1. Digital Intensity Index (dii)
2. Contribution of ICT in GDP (ict_percent) (services)
3. Percent of ICT personnel in total employment (ict_employment)
4. Percentage change of value added by ICT sector at current prices (ict_value)
5. Share of R&D personnel and researchers in government (rd_government)
6. Gross domestic expenditure on R&D by government (% of GDP) (gov_gerd)
7. Gross domestic expenditure on R&D by business (% of GDP) (buss_gerd)
8. Venture capital investment
8. GDP
9. Population
10. DESI index (Digital Economy and Society Intex)

In [9]:
# open data

dii_raw = pd.read_csv('../data/eurostat/isoc_e_diin2__custom_5232024_linear.csv')
ict_percent_raw = pd.read_csv('../data/eurostat/isoc_bde15ag__custom_5727250_linear.csv')
ict_employment_raw = pd.read_csv('../data/eurostat/isoc_bde15ap__custom_5731837_linear.csv')
ict_value_raw = pd.read_csv('../data/eurostat/isoc_bde15av__custom_5733176_linear.csv')
gov_personnel_raw = pd.read_csv('../data/eurostat/rd_p_perslf__custom_5739722_linear.csv')
gov_gerd_raw = pd.read_csv('../data/eurostat/rd_e_gerdtot__custom_5740344_linear.csv')
buss_gerd_raw = pd.read_csv('../data/eurostat/rd_e_gerdtot__custom_5740395_linear.csv')
venture_raw = pd.read_csv('../data/eurostat/RIO_VENTURE-data.csv')
gdp_raw = pd.read_csv('../data/eurostat/gdp.csv')
population_raw = pd.read_csv('../data/eurostat/tps00001_linear.csv')
desi_raw = pd.read_csv('../data/eurostat/DESI.csv')



In [10]:
# list of ternder countries

ted_countries = ["Austria", "Belgium", "Bulgaria", "Croatia", "Czechia", "Denmark", "Estonia", "Finland", 
                "France", "Germany", "Greece", "Hungary", "Ireland", "Italy", "Latvia", "Lithuania", "Luxembourg",
                "Netherlands", "Norway", "Poland", "Portugal", "Romania", "Slovakia", "Slovenia", 
                "Spain", "Sweden", "Switzerland", "United Kingdom"]

ted_years = [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]


def clean_eurostat(df, indicator_name=None, indicator_column_name=None, indicator_names_dictionary=None, country_names_dictionary=country_names,
                   geography="geo"):
    """
    Clean eurostat data
    :param df: pandas dataframe
    :param indicator_name: name of indicator
    :param indicator_column_name: name of column with indicators
    :param indicator_names_dictionary: dictionary with indicator names
    :param country_names_dictionary: dictionary with country names
    :return: cleaned pandas dataframe
    """

    # lowercase all column names
    df.columns = map(str.lower, df.columns)

    # rename values of indicators by dictionary indicator_names_dictionary, if given, otherwise just rename indicator column

    if indicator_names_dictionary:
        df["indicator"] = df[indicator_column_name].map(indicator_names_dictionary)
    else:
        df["indicator"] = indicator_name
    
    # rename values of column geo by dictionary country_names and save as new column country, lowercase
    df["country"] = df[geography].map(country_names_dictionary).str.title()
    df.loc[df[geography] == "UK", "country"] = "United Kingdom"
    df.loc[df[geography] == "EL", "country"] = "Greece"

    # keep only rows with country in ted_countries

    df = df[df["country"].isin(ted_countries)]
    
    # keep only rows with year in ted_years

    df = df[df["time_period"].isin(ted_years)]

    # rename columns

    df = df.rename(columns={"obs_value": "value", "time_period": "year"})

    # keep only necessary columns
    df = df[["country", "year", "indicator", "value"]]

    return df




In [11]:
# indicator dictionaries

# indicator name
indicator_dict_dii = {"E_DI2_HI": "dii_v2_high", "E_DI2_LO": "dii_v2_low", "E_DI2_VHI": "dii_v2_very_high", "E_DI2_VLO": "dii_v2_very_low",
                  "E_DI3_HI": "dii_v3_high", "E_DI3_LO": "dii_v3_low", "E_DI3_VHI": "dii_v3_very_high", "E_DI3_VLO": "dii_v3_very_low",
                  "E_DI4_HI": "dii_v4_high", "E_DI4_LO": "dii_v4_low", "E_DI4_VHI": "dii_v4_very_high", "E_DI4_VLO": "dii_v4_very_low",
                  "E_DI_HI" : "dii_v1_high", "E_DI_LO" : "dii_v1_low", "E_DI_VHI": "dii_v1_very_high", "E_DI_VLO": "dii_v1_very_low"}

venture_dict = {"PC_GDP": "Percentage of GDP", "NR_COMP" : "Number of companies", "MIO_EUR" : "Million EUR"}


In [12]:
# loading data

dii = clean_eurostat(dii_raw, indicator_column_name="indic_is", indicator_names_dictionary=indicator_dict_dii)
ict_percent = clean_eurostat(ict_percent_raw, indicator_name="ict_percent")
ict_employment = clean_eurostat(ict_employment_raw, indicator_name="ict_employment")
ict_value = clean_eurostat(ict_value_raw, indicator_name="ict_value_added")
gov_personnel = clean_eurostat(gov_personnel_raw, indicator_name="gov_personnel")
gov_gerd = clean_eurostat(gov_gerd_raw, indicator_name="gov_gerd")
buss_gerd = clean_eurostat(buss_gerd_raw, indicator_name="buss_gerd")
venture = clean_eurostat(venture_raw, indicator_column_name="unit", indicator_names_dictionary=venture_dict)
gdp = clean_eurostat(gdp_raw, indicator_name="gdp")
population = clean_eurostat(population_raw, indicator_name="population")



## Data checks

In [7]:
# Printing missibg values

def print_missing_values(df, column):

    """
    Print missing values for each value in a column
    :param df: pandas
    :param column: column name
    :return: None
    """
    print(f'\nMissing values in {column} column:')

    # adding missing rows by stacking and unstacking
    df = df.pivot_table(index=["country", "indicator"], columns="year", values="value", dropna=False)
    df = df.reset_index().melt(id_vars=["country", "indicator"], var_name="year", value_name="value")
  
    for value in df[df["value"].isnull()][column].unique():
        print(f"There are {df[df[column] == value]['value'].isnull().sum()} missing values for {value}")
    
    if column == "country":
        print("\nCountries in TED dataset but not in dataset:")
        print(set(ted_countries) - set(df["country"].unique()))
    
    # if no missing values, print "no missing values"
    if df["value"].isnull().sum() == 0:
        print("No missing values")


In [8]:
# Recode dii values. In 2018, data was collected with v1 and v2; I am keeping v1.
# Then recoding indicators to have the same name for all years.

dii_selection = dii[~((dii["year"] == 2018) & (~dii["indicator"].str.startswith("dii_v2")))]

dii_selection.loc[:, "indicator"] = dii_selection["indicator"].str.replace("dii_v1", "dii")
dii_selection.loc[:, "indicator"] = dii_selection["indicator"].str.replace("dii_v2", "dii")
dii_selection.loc[:, "indicator"] = dii_selection["indicator"].str.replace("dii_v3", "dii")
dii_selection.loc[:, "indicator"] = dii_selection["indicator"].str.replace("dii_v4", "dii")

# DII missing values

print("Indicator name:  Digital Innovation Index (DII)")

print_missing_values(dii_selection, "country")
print_missing_values(dii_selection, "year")
print_missing_values(dii_selection, "indicator")

dii_wide = dii_selection.pivot_table(index=["country", "indicator"], columns="year", values="value", dropna=False)

dii_wide["dii_mean"] = round(dii_wide.mean(axis=1),1)
dii_wide["dii_sd"] = round(dii_wide.std(axis=1),1)

# select only indicator == dii_very_high

dii_wide[dii_wide.index.get_level_values("indicator") == "dii_very_high"]

Indicator name:  Digital Innovation Index (DII)

Missing values in country column:
There are 2 missing values for Greece
There are 8 missing values for United Kingdom

Countries in TED dataset but not in dataset:
{'Switzerland'}

Missing values in year column:
There are 2 missing values for 2020
There are 4 missing values for 2021
There are 4 missing values for 2022

Missing values in indicator column:
There are 3 missing values for dii_high
There are 3 missing values for dii_very_high
There are 2 missing values for dii_low
There are 2 missing values for dii_very_low


Unnamed: 0_level_0,year,2015,2016,2017,2018,2019,2020,2021,2022,dii_mean,dii_sd
country,indicator,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Austria,dii_very_high,3.0,1.8,3.7,0.7,7.6,1.5,6.0,4.5,3.6,2.2
Belgium,dii_very_high,5.9,3.5,6.4,1.3,8.6,2.8,2.3,9.1,5.0,2.8
Bulgaria,dii_very_high,1.2,0.7,1.2,0.0,1.2,0.1,0.8,1.6,0.8,0.5
Croatia,dii_very_high,2.2,2.6,2.1,0.2,4.0,1.3,3.8,5.4,2.7,1.5
Czechia,dii_very_high,3.1,2.7,3.6,0.5,6.7,0.7,3.6,4.3,3.2,1.9
Denmark,dii_very_high,10.8,12.0,9.8,2.6,12.8,5.5,10.0,13.2,9.6,3.5
Estonia,dii_very_high,3.2,2.4,2.6,0.6,3.9,1.3,2.3,4.3,2.6,1.2
Finland,dii_very_high,7.1,10.3,1.5,4.5,9.1,5.2,10.5,10.0,7.3,3.1
France,dii_very_high,1.8,1.0,2.0,0.6,2.8,0.9,0.7,2.6,1.6,0.8
Germany,dii_very_high,1.9,0.4,4.4,0.1,4.8,0.6,4.3,4.6,2.6,2.0


In [9]:
print("Indicator name:  Contribution of ICT in GDP (% of GDP) ")

print_missing_values(ict_percent, "country")
print_missing_values(ict_percent, "year")
print_missing_values(ict_percent, "indicator")

ict_pct_wide = ict_percent.pivot_table(index="country", columns="year", values="value", dropna=False)
ict_pct_wide["ict_pct_mean"] = round(ict_pct_wide.mean(axis=1),1)
ict_pct_wide["ict_pct_sd"] = round(ict_pct_wide.std(axis=1), 2)
ict_pct_wide

Indicator name:  Contribution of ICT in GDP (% of GDP) 

Missing values in country column:
There are 7 missing values for Ireland
There are 2 missing values for United Kingdom
There are 1 missing values for Estonia

Countries in TED dataset but not in dataset:
{'Switzerland'}

Missing values in year column:
There are 1 missing values for 2014
There are 1 missing values for 2015
There are 1 missing values for 2016
There are 1 missing values for 2017
There are 1 missing values for 2018
There are 2 missing values for 2019
There are 3 missing values for 2020

Missing values in indicator column:
There are 10 missing values for ict_percent


year,2014,2015,2016,2017,2018,2019,2020,ict_pct_mean,ict_pct_sd
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Austria,2.89,2.94,3.02,3.03,3.07,3.16,3.19,3.0,0.1
Belgium,3.58,3.6,3.57,3.54,3.75,3.98,4.11,3.7,0.21
Bulgaria,4.66,4.78,5.11,5.43,5.76,6.42,7.16,5.6,0.84
Croatia,3.65,3.72,3.74,3.9,4.0,4.37,4.87,4.0,0.41
Czechia,3.84,3.86,3.87,4.0,4.18,4.38,4.66,4.1,0.29
Denmark,4.2,4.26,4.34,4.22,4.44,4.32,4.44,4.3,0.09
Estonia,4.02,4.09,4.25,4.47,4.67,5.22,,4.5,0.41
Finland,4.06,4.3,4.35,4.49,4.48,4.66,4.86,4.5,0.24
France,3.58,3.63,3.72,4.09,3.88,3.93,4.28,3.9,0.23
Germany,3.78,3.8,3.7,3.78,3.96,3.98,4.08,3.9,0.13


In [10]:
print("Indicator name:  Percent of ICT personnel in total employment")

print_missing_values(ict_employment, "country")
print_missing_values(ict_employment, "year")
print_missing_values(ict_employment, "indicator")

# ict_employment to wide format

ict_employment_wide = ict_employment.pivot_table(index="country", columns="year", values="value", dropna=False)
ict_employment_wide["ict_employment_mean"] = round(ict_employment_wide.mean(axis=1),1)
ict_employment_wide["ict_employment_sd"] = round(ict_employment_wide.std(axis=1), 2)
ict_employment_wide

Indicator name:  Percent of ICT personnel in total employment

Missing values in country column:
There are 7 missing values for Ireland
There are 7 missing values for Netherlands
There are 6 missing values for United Kingdom
There are 1 missing values for Estonia

Countries in TED dataset but not in dataset:
set()

Missing values in year column:
There are 3 missing values for 2014
There are 2 missing values for 2015
There are 3 missing values for 2016
There are 3 missing values for 2017
There are 3 missing values for 2018
There are 3 missing values for 2019
There are 4 missing values for 2020

Missing values in indicator column:
There are 21 missing values for ict_employment


year,2014,2015,2016,2017,2018,2019,2020,ict_employment_mean,ict_employment_sd
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Austria,2.19,2.19,2.26,2.32,2.33,2.39,2.51,2.3,0.11
Belgium,2.48,2.46,2.51,2.55,2.72,2.82,2.81,2.6,0.15
Bulgaria,2.04,2.21,2.43,2.55,2.68,2.87,3.07,2.6,0.33
Croatia,2.0,2.08,2.13,2.24,2.31,2.56,2.72,2.3,0.24
Czechia,2.35,2.39,2.48,2.6,2.69,2.81,2.91,2.6,0.2
Denmark,3.78,3.81,3.25,3.28,3.36,3.44,3.51,3.5,0.21
Estonia,2.77,2.82,2.92,3.29,3.51,3.97,,3.2,0.43
Finland,3.03,3.06,3.14,3.28,3.34,3.43,3.6,3.3,0.19
France,2.75,2.7,2.73,2.85,2.76,2.78,2.97,2.8,0.08
Germany,2.29,2.35,2.44,2.52,2.83,2.81,2.82,2.6,0.22


In [11]:
# Percentage change of value added by ICT sector at current prices

print("Indicator name:  Percentage change of value added by ICT sector at current prices")

print_missing_values(ict_value, "country")
print_missing_values(ict_value, "year")
print_missing_values(ict_value, "indicator")

ict_value_wide = ict_value.pivot_table(index="country", columns="year", values="value", dropna=False)
ict_value_wide["ict_value_mean"] = round(ict_value_wide.mean(axis=1),1)
ict_value_wide["ict_value_sd"] = round(ict_value_wide.std(axis=1), 2)
ict_value_wide

Indicator name:  Percentage change of value added by ICT sector at current prices

Missing values in country column:
There are 1 missing values for Finland
There are 1 missing values for Slovakia
There are 2 missing values for United Kingdom
There are 1 missing values for Estonia

Countries in TED dataset but not in dataset:
{'Switzerland', 'Ireland'}

Missing values in year column:
There are 2 missing values for 2014
There are 1 missing values for 2019
There are 2 missing values for 2020

Missing values in indicator column:
There are 5 missing values for ict_value_added


year,2014,2015,2016,2017,2018,2019,2020,ict_value_mean,ict_value_sd
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Austria,4.3,1.77,2.61,0.43,1.29,2.84,1.24,2.1,1.19
Belgium,-6.09,0.39,-0.62,-0.99,6.01,6.22,3.19,1.2,4.04
Bulgaria,3.58,2.48,7.04,6.26,6.09,11.44,11.56,6.9,3.25
Croatia,-1.08,2.04,0.48,4.27,2.47,9.39,11.3,4.1,4.25
Czechia,-1.5,0.6,0.33,3.34,4.37,4.79,6.44,2.6,2.65
Denmark,-3.38,1.29,2.1,-2.78,5.03,-2.58,2.68,0.3,3.01
Estonia,6.09,1.5,4.02,5.17,4.52,11.84,,5.5,3.16
Finland,,5.91,1.09,3.25,-0.19,4.1,4.15,3.1,2.04
France,-0.28,1.36,2.44,10.05,-5.21,1.44,8.86,2.7,4.88
Germany,1.77,0.41,-2.6,2.02,4.96,0.33,2.64,1.4,2.17


In [12]:
print("Indicator name:  Share of R&D personnel and researchers in government as a share of total employment")

print_missing_values(gov_personnel, "country")
print_missing_values(gov_personnel, "year")
print_missing_values(gov_personnel, "indicator")

gov_personnel_wide = gov_personnel.pivot_table(index="country", columns="year", values="value", dropna=False)
gov_personnel_wide["gov_personnel_mean"] = round(gov_personnel_wide.mean(axis=1),1)
gov_personnel_wide["gov_personnel_sd"] = round(gov_personnel_wide.std(axis=1), 2)
gov_personnel_wide

Indicator name:  Share of R&D personnel and researchers in government as a share of total employment

Missing values in country column:
There are 4 missing values for Austria
There are 3 missing values for Greece
There are 4 missing values for Sweden
There are 3 missing values for France
There are 3 missing values for Ireland
There are 3 missing values for Luxembourg
There are 3 missing values for Switzerland
There are 2 missing values for United Kingdom
There are 1 missing values for Belgium

Countries in TED dataset but not in dataset:
set()

Missing values in year column:
There are 3 missing values for 2014
There are 1 missing values for 2015
There are 6 missing values for 2016
There are 6 missing values for 2018
There are 2 missing values for 2019
There are 8 missing values for 2020

Missing values in indicator column:
There are 26 missing values for gov_personnel


year,2014,2015,2016,2017,2018,2019,2020,gov_personnel_mean,gov_personnel_sd
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Austria,,0.1529,,0.2317,,0.2434,,0.2,0.04
Belgium,0.1431,0.147,0.1491,0.1538,0.1655,0.1693,,0.2,0.02
Bulgaria,0.2763,0.2664,0.2705,0.2621,0.2695,0.2715,0.2758,0.3,0.01
Croatia,0.2394,0.2314,0.2271,0.2374,0.2432,0.2595,0.2736,0.2,0.02
Czechia,0.3107,0.3212,0.3179,0.3419,0.3531,0.3615,0.365,0.3,0.02
Denmark,0.1066,0.115,0.107,0.1241,0.1257,0.1255,0.1266,0.1,0.01
Estonia,0.1659,0.1674,0.1457,0.1461,0.1224,0.1217,0.13,0.1,0.02
Finland,0.2848,0.2504,0.2264,0.2401,0.2386,0.2335,0.2387,0.2,0.02
France,0.1756,,0.1777,0.1757,0.1769,,,0.2,0.01
Germany,0.2976,0.2995,0.2989,0.3071,0.3186,0.3263,0.3385,0.3,0.02


In [13]:
print("Indicator name:  Gross domestic expenditure on R&D by government (% of GDP) (gov_gerd)")

print_missing_values(gov_gerd, "country")
print_missing_values(gov_gerd, "year")
print_missing_values(gov_gerd, "indicator")

gov_gerd_wide = gov_gerd.pivot_table(index="country", columns="year", values="value", dropna=False)
gov_gerd_wide["gov_gerd_mean"] = round(gov_gerd_wide.mean(axis=1),1)
gov_gerd_wide["gov_gerd_sd"] = round(gov_gerd_wide.std(axis=1), 2)
gov_gerd_wide

Indicator name:  Gross domestic expenditure on R&D by government (% of GDP) (gov_gerd)

Missing values in country column:
There are 4 missing values for Switzerland
There are 2 missing values for United Kingdom

Countries in TED dataset but not in dataset:
set()

Missing values in year column:
There are 1 missing values for 2016
There are 1 missing values for 2018
There are 2 missing values for 2020
There are 2 missing values for 2021

Missing values in indicator column:
There are 6 missing values for gov_gerd


year,2014,2015,2016,2017,2018,2019,2020,2021,gov_gerd_mean,gov_gerd_sd
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Austria,0.14,0.14,0.22,0.22,0.22,0.23,0.24,0.24,0.2,0.04
Belgium,0.21,0.22,0.24,0.25,0.26,0.28,0.28,0.28,0.3,0.03
Bulgaria,0.2,0.2,0.16,0.17,0.17,0.21,0.22,0.21,0.2,0.02
Croatia,0.2,0.2,0.18,0.19,0.19,0.2,0.25,0.26,0.2,0.03
Czechia,0.37,0.39,0.3,0.3,0.31,0.31,0.34,0.33,0.3,0.03
Denmark,0.07,0.07,0.07,0.09,0.09,0.09,0.1,0.1,0.1,0.01
Estonia,0.16,0.16,0.14,0.15,0.16,0.17,0.17,0.16,0.2,0.02
Finland,0.27,0.23,0.22,0.23,0.23,0.23,0.22,0.22,0.2,0.02
France,0.29,0.29,0.28,0.27,0.27,0.27,0.27,0.27,0.3,0.01
Germany,0.42,0.41,0.41,0.41,0.42,0.43,0.46,0.47,0.4,0.02


In [14]:
print("Indicator name:  Gross domestic expenditure on R&D by business (% of GDP) (buss_gerd)")

print_missing_values(buss_gerd, "country")
print_missing_values(buss_gerd, "year")
print_missing_values(buss_gerd, "indicator")

buss_gerd_wide = buss_gerd.pivot_table(index="country", columns="year", values="value", dropna=False)
buss_gerd_wide["gov_gerd_mean"] = round(buss_gerd_wide.mean(axis=1),1)
buss_gerd_wide["gov_gerd_sd"] = round(buss_gerd_wide.std(axis=1), 2)
buss_gerd_wide

Indicator name:  Gross domestic expenditure on R&D by business (% of GDP) (buss_gerd)

Missing values in country column:
There are 5 missing values for Switzerland
There are 2 missing values for United Kingdom

Countries in TED dataset but not in dataset:
set()

Missing values in year column:
There are 1 missing values for 2014
There are 1 missing values for 2016
There are 1 missing values for 2018
There are 2 missing values for 2020
There are 2 missing values for 2021

Missing values in indicator column:
There are 7 missing values for buss_gerd


year,2014,2015,2016,2017,2018,2019,2020,2021,gov_gerd_mean,gov_gerd_sd
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Austria,2.2,2.18,2.19,2.14,2.16,2.2,2.23,2.22,2.2,0.03
Belgium,1.66,1.7,1.73,1.87,2.05,2.33,2.48,2.42,2.0,0.32
Bulgaria,0.52,0.7,0.56,0.52,0.54,0.56,0.57,0.51,0.6,0.06
Croatia,0.37,0.42,0.39,0.41,0.46,0.53,0.6,0.58,0.5,0.08
Czechia,1.08,1.04,1.02,1.11,1.18,1.19,1.21,1.25,1.1,0.08
Denmark,1.86,1.94,2.01,1.86,1.87,1.84,1.82,1.75,1.9,0.07
Estonia,0.62,0.68,0.64,0.6,0.6,0.87,0.96,0.98,0.7,0.15
Finland,2.13,1.91,1.79,1.78,1.81,1.84,1.95,2.06,1.9,0.12
France,1.45,1.44,1.45,1.44,1.44,1.44,1.52,1.45,1.5,0.03
Germany,1.95,2.01,2.0,2.11,2.14,2.18,2.09,2.09,2.1,0.07


In [15]:
print("Indicator name:  Venutre capital investment as a share of GDP")

# select rows where indicator == Percentage of GDP

venture = venture[venture["indicator"] == "Percentage of GDP"]

# rename "Percentage of GDP" to "venture"

venture.loc[:, "indicator"] = venture["indicator"].str.replace("Percentage of GDP", "venture")

print_missing_values(venture, "country")
print_missing_values(venture, "year")
print_missing_values(venture, "indicator")

venture_wide = venture.pivot_table(index="country", columns="year", values="value", dropna=False)
venture_wide["venture_mean"] = round(venture_wide.mean(axis=1),1)
venture_wide["venture_sd"] = round(venture_wide.std(axis=1), 2)
venture_wide

Indicator name:  Venutre capital investment as a share of GDP

Missing values in country column:

Countries in TED dataset but not in dataset:
{'Latvia', 'Estonia', 'Slovenia', 'Switzerland', 'Norway', 'Lithuania', 'Slovakia', 'Croatia'}
No missing values

Missing values in year column:
No missing values

Missing values in indicator column:
No missing values


year,2014,2015,venture_mean,venture_sd
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Austria,0.008,0.008,0.0,0.0
Belgium,0.032,0.015,0.0,0.02
Bulgaria,0.003,0.002,0.0,0.0
Czechia,0.004,0.002,0.0,0.0
Denmark,0.075,0.109,0.1,0.02
Finland,0.049,0.047,0.0,0.03
France,0.035,0.034,0.0,0.02
Germany,0.021,0.025,0.0,0.01
Greece,0.0,0.0,0.0,0.0
Hungary,0.029,0.022,0.0,0.02


In [16]:
print("Indicator name:  GDP per capita")

print_missing_values(gdp, "country")
print_missing_values(gdp, "year")
print_missing_values(gdp, "indicator")

gdp_wide = gdp.pivot_table(index="country", columns="year", values="value", dropna=False)
gdp_wide["gdp_mean"] = round(gdp_wide.mean(axis=1),1)
gdp_wide["gdp_sd"] = round(gdp_wide.std(axis=1), 2)
gdp_wide

Indicator name:  GDP per capita

Missing values in country column:
There are 3 missing values for United Kingdom

Countries in TED dataset but not in dataset:
set()

Missing values in year column:
There are 1 missing values for 2020.0
There are 1 missing values for 2021.0
There are 1 missing values for 2022.0

Missing values in indicator column:
There are 3 missing values for gdp


year,2014.0,2015.0,2016.0,2017.0,2018.0,2019.0,2020.0,2021.0,2022.0,gdp_mean,gdp_sd
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Austria,36130.0,36140.0,36390.0,36980.0,37690.0,38090.0,35480.0,36950.0,38360.0,36912.2,920.53
Belgium,33870.0,34360.0,34620.0,35050.0,35510.0,36110.0,34010.0,35950.0,36740.0,35135.6,946.95
Bulgaria,5470.0,5700.0,5910.0,6120.0,6330.0,6630.0,6410.0,6950.0,7250.0,6307.8,545.67
Croatia,10460.0,10790.0,11270.0,11770.0,12220.0,12710.0,11680.0,13500.0,14540.0,12104.4,1232.87
Czechia,15480.0,16290.0,16670.0,17490.0,17990.0,18460.0,17400.0,18020.0,18470.0,17363.3,968.87
Denmark,44890.0,45630.0,46720.0,47740.0,48450.0,48970.0,47890.0,50010.0,51460.0,47973.3,1948.1
Estonia,12960.0,13230.0,13620.0,14410.0,14920.0,15410.0,15280.0,16490.0,16250.0,14730.0,1198.44
Finland,34390.0,34460.0,35330.0,36380.0,36740.0,37150.0,36220.0,37250.0,37920.0,36204.4,1171.33
France,31320.0,31540.0,31770.0,32360.0,32800.0,33250.0,30550.0,32530.0,33230.0,32150.0,866.21
Germany,33920.0,34130.0,34610.0,35410.0,35650.0,35950.0,34590.0,35480.0,35860.0,35066.7,720.08


In [17]:
print("Indicator name:  Population")

print_missing_values(population, "country")
print_missing_values(population, "year")
print_missing_values(population, "indicator")

population_wide = population.pivot_table(index="country", columns="year", values="value", dropna=False)
population_wide["population_mean"] = round(population_wide.mean(axis=1),1)
population_wide["population_sd"] = round(population_wide.std(axis=1), 2)
population_wide

Indicator name:  Population

Missing values in country column:
There are 2 missing values for United Kingdom

Countries in TED dataset but not in dataset:
set()

Missing values in year column:
There are 1 missing values for 2021
There are 1 missing values for 2022

Missing values in indicator column:
There are 2 missing values for population


year,2014,2015,2016,2017,2018,2019,2020,2021,2022,population_mean,population_sd
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Austria,8507786.0,8584926.0,8700471.0,8772865.0,8822267.0,8858775.0,8901064.0,8932664.0,8978929.0,8784416.3,150511.02
Belgium,11180840.0,11237274.0,11311117.0,11351727.0,11398589.0,11455519.0,11522440.0,11554767.0,11631136.0,11404823.2,141415.28
Bulgaria,7245677.0,7202198.0,7153784.0,7101859.0,7050034.0,7000039.0,6951482.0,6916548.0,6838937.0,7051173.1,128867.17
Croatia,4246809.0,4225316.0,4190669.0,4154213.0,4105493.0,4076246.0,4058165.0,4036355.0,3879074.0,4108037.8,106998.97
Czechia,10512419.0,10538275.0,10553843.0,10578820.0,10610055.0,10649800.0,10693939.0,10494836.0,10516707.0,10572077.1,63635.37
Denmark,5627235.0,5659715.0,5707251.0,5748769.0,5781190.0,5806081.0,5822763.0,5840045.0,5873420.0,5762941.0,79090.2
Estonia,1315819.0,1314870.0,1315944.0,1315635.0,1319133.0,1324820.0,1328976.0,1330068.0,1331796.0,1321895.7,6599.86
Finland,5451270.0,5471753.0,5487308.0,5503297.0,5513130.0,5517919.0,5525292.0,5533793.0,5548241.0,5505778.1,29147.74
France,66165980.0,66458153.0,66638391.0,66809816.0,67026224.0,67290471.0,67485531.0,67656682.0,67842582.0,67041536.7,537749.22
Germany,80767463.0,81197537.0,82175684.0,82521653.0,82792351.0,83019213.0,83166711.0,83155031.0,83237124.0,82448085.2,853194.0


In [35]:
# DESI

desi.head()

Unnamed: 0,country,year,indicator,value
400,Estonia,2017,desi_total,0.413375
401,Portugal,2017,desi_total,0.35479
1621,Germany,2019,desi_total,0.383493
1622,Spain,2019,desi_total,0.47037
2384,Czechia,2020,desi_total,0.395435


In [34]:
# from desi_raw, select rows where indicator == desi_total using .loc

desi_total = desi_raw.loc[desi_raw["indicator"] == "desi_total", :]

desi = clean_eurostat(desi_total, indicator_name="desi_total", geography="ref_area")

print("Indicator name:  DESI")

print_missing_values(desi, "country")
print_missing_values(desi, "year")
print_missing_values(desi, "indicator")

desi_wide = desi.pivot_table(index="country", columns="year", values="value", dropna=False)
desi_wide["desi_mean"] = round(desi_wide.mean(axis=1),1)
desi_wide["desi_sd"] = round(desi_wide.std(axis=1), 2)
desi_wide

Indicator name:  DESI

Missing values in country column:

Countries in TED dataset but not in dataset:
{'Switzerland', 'Norway', 'United Kingdom'}
No missing values

Missing values in year column:
No missing values

Missing values in indicator column:
No missing values


year,2017,2018,2019,2020,2021,2022,desi_mean,desi_sd
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Austria,0.363744,0.384273,0.412194,0.436219,0.505236,0.546757,0.4,0.07
Belgium,0.35725,0.380361,0.400032,0.442394,0.467098,0.503074,0.4,0.05
Bulgaria,0.23902,0.2579,0.280446,0.298247,0.326471,0.376799,0.3,0.05
Croatia,0.303731,0.321546,0.350614,0.370064,0.430657,0.475462,0.4,0.06
Czechia,0.318329,0.341937,0.371864,0.395435,0.4337,0.491435,0.4,0.06
Denmark,0.46479,0.486914,0.520512,0.55972,0.652503,0.693338,0.6,0.09
Estonia,0.413375,0.439832,0.465732,0.490535,0.531537,0.565123,0.5,0.05
Finland,0.478507,0.503723,0.541421,0.58426,0.631635,0.695976,0.6,0.08
France,0.338437,0.359343,0.394645,0.425334,0.459249,0.533291,0.4,0.07
Germany,0.334378,0.353,0.383493,0.42064,0.470728,0.52883,0.4,0.07


In [76]:
# missing data

uk_2020_gdp = 36340.86
uk_2021_gdp = 35958.09
uk_2022_gdp = 35575.32

# concat a row to gdp

uk_gdp = pd.DataFrame({"country": ["United Kingdom", "United Kingdom", "United Kingdom"], 
                       "indicator": ["gdp", "gdp", "gdp"],
                       "year": [2020, 2021, 2022], 
                       "value": [uk_2020_gdp, uk_2021_gdp, uk_2022_gdp]})

gdp = pd.concat([gdp, uk_gdp], axis=0)

In [77]:
# missing data population 

uk_2021_pop = 67281039
uk_2022_pop = 67508936

# append a row to population

uk_pop = pd.DataFrame({"country": ["United Kingdom", "United Kingdom"],
                        "indicator": ["population", "population"],
                        "year": [2021, 2022],
                        "value": [uk_2021_pop, uk_2022_pop]})   

population = pd.concat([population, uk_pop], axis=0).reset_index()

In [38]:
# desi missing data for uk


uk_desi = pd.DataFrame({"country": np.repeat("United Kingdom", 5),
                        "year": [2017, 2018, 2019, 2020, 2021],
                        "indicator": ["desi_total", "desi_total", "desi_total", "desi_total", "desi_total"],
                        "value": [None, 0.535, 0.566, 0.604, None]})

desi = pd.concat([desi, uk_desi], axis=0).reset_index()

# Party positions

In [43]:
# read data

legislator = pd.read_csv("../data/eurostat/MPDataset_MPDS2022a.csv")

# from legislator select only: country, date, partyabbrev, absseat, totseats, per303, per401, per411
# per303 = governmental and administrative efficiency
# per401 = free market economy
# per411 = Technology and Infrastructure: Positive

legislator = legislator[["countryname", "date", "partyabbrev", "absseat", "totseats", "per303", "per401", "per411"]]

In [44]:
legislator.head()

Unnamed: 0,countryname,date,partyabbrev,absseat,totseats,per303,per401,per411
0,Sweden,194409,SKP,15.0,230.0,0.0,0.0,0.0
1,Sweden,194409,SAP,115.0,230.0,0.0,0.0,0.0
2,Sweden,194409,FP,26.0,230.0,1.6,6.4,0.0
3,Sweden,194409,,39.0,230.0,1.8,17.5,0.0
4,Sweden,194409,,35.0,230.0,0.0,9.524,0.0


In [45]:
# rename Czech Republic to Czechia

legislator.loc[legislator["countryname"] == "Czech Republic", "countryname"] = "Czechia"

# turn column date into datetime - yyyymm

legislator["date"] = pd.to_datetime(legislator["date"], format="%Y%m")
legislator["year"] = legislator["date"].dt.year

# party position relative to seats
legislator["position_efficiency"] = legislator["absseat"] / legislator["totseats"] * legislator["per303"]
legislator["position_market"] = legislator["absseat"] / legislator["totseats"] * legislator["per401"]
legislator["position_economy"] = legislator["absseat"] / legislator["totseats"] * legislator["per411"]

# parliament position approximation

# group legislator by country and year
# sum position_efficiency, position_market, position_economy

legislator = (legislator.groupby(["countryname", "year"])
                        .agg({"position_efficiency": "sum", "position_market": "sum", "position_economy": "sum"})
                        .reset_index())

In [46]:
legislator.head()

Unnamed: 0,countryname,year,position_efficiency,position_market,position_economy
0,Albania,1991,0.0,1.5772,2.36898
1,Albania,1992,0.569429,1.559743,2.923186
2,Albania,1996,2.221671,2.246671,17.23145
3,Albania,1997,3.618323,1.540355,1.410974
4,Albania,2001,2.419857,0.756464,0.607593


In [47]:
legislator_long = legislator.melt(id_vars=["countryname", "year"], var_name="indicator", value_name="value")
legislator_wide = legislator_long.pivot_table(index=["countryname", "indicator"], columns="year", values="value", dropna=False).reset_index()
legislator_wide["2022"] = np.nan
legislator_long = legislator_wide.melt(id_vars=["countryname", "indicator"], var_name="year", value_name="value")

# group by country and indicator and forward fill

legislator_long["value_filled"] = legislator_long.groupby(["countryname", "indicator"])["value"].ffill()


In [48]:
legislator_long.head()

Unnamed: 0,countryname,indicator,year,value,value_filled
0,Albania,position_economy,1920,,
1,Albania,position_efficiency,1920,,
2,Albania,position_market,1920,,
3,Armenia,position_economy,1920,,
4,Armenia,position_efficiency,1920,,


In [49]:
# keep only countries in ted_countries

legislator_long = legislator_long[legislator_long["countryname"].isin(ted_countries)]

# keep only years >= 2013

legislator_long["year"] = legislator_long["year"].astype(int)
legislator_long = legislator_long[legislator_long["year"] >= 2013]

# drop columns value
# rename countryname to country

legislator_long = legislator_long.drop(columns="value")
legislator_long = legislator_long.rename(columns={"countryname": "country", "value_filled": "value"})

In [50]:
legislator_long.head()

Unnamed: 0,country,indicator,year,value
13449,Austria,position_economy,2013,5.260071
13450,Austria,position_efficiency,2013,3.600044
13451,Austria,position_market,2013,1.052333
13458,Belgium,position_economy,2013,3.758473
13459,Belgium,position_efficiency,2013,10.246413


In [51]:
legislator_final = legislator_long.pivot_table(index=["country", "year"], columns="indicator", values="value", dropna=False).reset_index()
legislator_final.head()

indicator,country,year,position_economy,position_efficiency,position_market
0,Austria,2013,5.260071,3.600044,1.052333
1,Austria,2014,5.260071,3.600044,1.052333
2,Austria,2015,5.260071,3.600044,1.052333
3,Austria,2016,5.260071,3.600044,1.052333
4,Austria,2017,6.870432,6.361617,1.84041


# Innovation procurement

In [52]:
# read data/eurostat/procurement

procurement = pd.read_csv("../data/eurostat/procurement.csv")
procurement.head()

Unnamed: 0,Country,Total score,S-score,Cluster
0,Finland,666,28,Strong performer
1,Austria,512,17,Good performer
2,Netherlands,455,13,Good performer
3,Belgium,424,11,Good performer
4,Sweden,409,10,Good performer


In [53]:
# add columns to procurement named as in ted_years

procurement["2013"] = procurement["Cluster"]
procurement["2014"] = procurement["Cluster"]
procurement["2015"] = procurement["Cluster"]
procurement["2016"] = procurement["Cluster"]
procurement["2017"] = procurement["Cluster"]
procurement["2018"] = procurement["Cluster"]
procurement["2019"] = procurement["Cluster"]
procurement["2020"] = procurement["Cluster"]
procurement["2021"] = procurement["Cluster"]
procurement["2022"] = procurement["Cluster"]

# Dopt columns, edit data

procurement = procurement.drop(["Total score", "S-score", "Cluster"], axis=1)
#procurement["indicator"] = "inno_procurement"
procurement.columns = procurement.columns.str.lower()

# to long format
procurement = procurement.melt(id_vars=["country"], var_name="year", value_name="inno_procurement")

In [54]:
procurement.head()

Unnamed: 0,country,year,inno_procurement
0,Finland,2013,Strong performer
1,Austria,2013,Good performer
2,Netherlands,2013,Good performer
3,Belgium,2013,Good performer
4,Sweden,2013,Good performer


# Geography

In [56]:
# read data/eurostat/geo

geo = pd.read_csv("../data/eurostat/geo.csv")
geo.head()

Unnamed: 0,Country,Geo
0,Austria,West
1,Belgium,West
2,Bulgaria,East
3,Croatia,South
4,Cyprus,South


In [57]:
# add columns to procurement named as in ted_years

geo["2013"] = geo["Geo"]
geo["2014"] = geo["Geo"]
geo["2015"] = geo["Geo"]
geo["2016"] = geo["Geo"]
geo["2017"] = geo["Geo"]
geo["2018"] = geo["Geo"]
geo["2019"] = geo["Geo"]
geo["2020"] = geo["Geo"]
geo["2021"] = geo["Geo"]
geo["2022"] = geo["Geo"]

# Dopt columns, edit data

geo = geo.drop(["Geo"], axis=1)
geo.columns = geo.columns.str.lower()

# to long format

geo = geo.melt(id_vars="country" , var_name="year", value_name="geo")

In [58]:
geo.head()

Unnamed: 0,country,year,geo
0,Austria,2013,West
1,Belgium,2013,West
2,Bulgaria,2013,East
3,Croatia,2013,South
4,Cyprus,2013,South


# OECD data

In [15]:
oecd = pd.read_csv("../data/eurostat/GOV_OECD.csv")
oecd.head()

Unnamed: 0,COU,Country,IND,Indicator,YEAR,Year,Value,Flag Codes,Flags
0,AUT,Austria,EMPGG_TOT,Employment in general goverrnment as a percent...,2013,2013,16.78,,
1,AUT,Austria,EMPGG_TOT,Employment in general goverrnment as a percent...,2014,2014,16.78,,
2,AUT,Austria,EMPGG_TOT,Employment in general goverrnment as a percent...,2015,2015,16.94,,
3,AUT,Austria,EMPGG_TOT,Employment in general goverrnment as a percent...,2016,2016,16.96,,
4,AUT,Austria,EMPGG_TOT,Employment in general goverrnment as a percent...,2017,2017,16.88,,


In [16]:
# lower case column names

oecd = oecd.drop(columns="YEAR")

oecd.columns = oecd.columns.str.lower()

# keep only counties in ted_countries

oecd = oecd[oecd["country"].isin(ted_countries)]

# if indicator column contains "employment in general government", recode as pct_employment_gov
# if indicator column contains "government procurement as share", recode as pct_gov_procurement

oecd["indicator"] = np.where(oecd["indicator"].str.contains("Employment in general goverrnment"), "pct_employment_gov",
                                np.where(oecd["indicator"].str.contains("General government procurement as share"), "pct_gov_procurement", np.nan))

# keep only country, indicator, year, value

oecd = oecd[["country", "indicator", "year", "value"]]

print_missing_values(oecd, "indicator")
print_missing_values(oecd, "country")
print_missing_values(oecd, "year")

year
2013    48
2014    48
2015    48
2016    48
2017    48
2018    48
2019    45
2020    23
Name: indicator, dtype: int64
year
2013    0.134831
2014    0.134831
2015    0.134831
2016    0.134831
2017    0.134831
2018    0.134831
2019    0.126404
2020    0.064607
Name: indicator, dtype: float64
year
2013    48
2014    48
2015    48
2016    48
2017    48
2018    48
2019    45
2020    23
Name: country, dtype: int64
year
2013    0.134831
2014    0.134831
2015    0.134831
2016    0.134831
2017    0.134831
2018    0.134831
2019    0.126404
2020    0.064607
Name: country, dtype: float64
year
2013    48
2014    48
2015    48
2016    48
2017    48
2018    48
2019    45
2020    23
Name: year, dtype: int64
year
2013    0.134831
2014    0.134831
2015    0.134831
2016    0.134831
2017    0.134831
2018    0.134831
2019    0.126404
2020    0.064607
Name: year, dtype: float64


In [17]:
oecd.head()

Unnamed: 0,country,indicator,year,value
0,Austria,pct_employment_gov,2013,16.78
1,Austria,pct_employment_gov,2014,16.78
2,Austria,pct_employment_gov,2015,16.94
3,Austria,pct_employment_gov,2016,16.96
4,Austria,pct_employment_gov,2017,16.88


In [62]:
oecd_final = oecd.pivot_table(index=["country", "year"], columns="indicator", values="value", dropna=False).reset_index()
oecd_final.head()

indicator,country,year,pct_employment_gov,pct_gov_procurement
0,Austria,2013,16.78,25.87
1,Austria,2014,16.78,25.23
2,Austria,2015,16.94,25.87
3,Austria,2016,16.96,26.4
4,Austria,2017,16.88,26.89


In [None]:
# missing

# Merging all country variables

In [78]:
# add df as new rows

df_eurostat = pd.concat([dii_selection, ict_percent, ict_employment, ict_value, gov_personnel, gov_gerd, buss_gerd, venture, gdp, population, desi]).reset_index()
df_eurostat = df_eurostat.drop(columns=["level_0", "index"])


In [79]:
df_eurostat.head()

Unnamed: 0,country,year,indicator,value
0,Austria,2018.0,dii_high,8.2
1,Austria,2020.0,dii_high,13.2
2,Belgium,2018.0,dii_high,16.2
3,Belgium,2020.0,dii_high,24.9
4,Bulgaria,2018.0,dii_high,4.1


In [80]:
# pivot df_eurostat to wide

df_eurostat_wide = df_eurostat.pivot_table(index=["country", "year"], columns="indicator", values="value", dropna=False).reset_index()

# year column to int

df_eurostat_wide["year"] = df_eurostat_wide["year"].astype(int)

df_eurostat_wide.head()

indicator,country,year,buss_gerd,desi_total,dii_high,dii_low,dii_very_high,dii_very_low,gdp,gov_gerd,gov_personnel,ict_employment,ict_percent,ict_value_added,population,venture
0,Austria,2014,2.2,,,,,,36130.0,0.14,,2.19,2.89,4.3,8507786.0,0.008
1,Austria,2015,2.18,,24.1,45.1,3.0,27.9,36140.0,0.14,0.1529,2.19,2.94,1.77,8584926.0,0.008
2,Austria,2016,2.19,,19.7,46.0,1.8,32.6,36390.0,0.22,,2.26,3.02,2.61,8700471.0,
3,Austria,2017,2.14,0.363744,22.0,36.7,3.7,37.6,36980.0,0.22,0.2317,2.32,3.03,0.43,8772865.0,
4,Austria,2018,2.16,0.384273,8.2,37.2,0.7,53.9,37690.0,0.22,,2.33,3.07,1.29,8822267.0,


In [81]:
# year column to int

procurement["year"] = procurement["year"].astype(int)
geo["year"] = geo["year"].astype(int)
legislator_final["year"] = legislator_final["year"].astype(int)
oecd_final["year"] = oecd_final["year"].astype(int)

In [67]:
procurement.head()

Unnamed: 0,country,year,inno_procurement
0,Finland,2013,Strong performer
1,Austria,2013,Good performer
2,Netherlands,2013,Good performer
3,Belgium,2013,Good performer
4,Sweden,2013,Good performer


In [68]:
geo.head()

Unnamed: 0,country,year,geo
0,Austria,2013,West
1,Belgium,2013,West
2,Bulgaria,2013,East
3,Croatia,2013,South
4,Cyprus,2013,South


In [69]:
legislator_final.head()

indicator,country,year,position_economy,position_efficiency,position_market
0,Austria,2013,5.260071,3.600044,1.052333
1,Austria,2014,5.260071,3.600044,1.052333
2,Austria,2015,5.260071,3.600044,1.052333
3,Austria,2016,5.260071,3.600044,1.052333
4,Austria,2017,6.870432,6.361617,1.84041


In [70]:
oecd_final.head()

indicator,country,year,pct_employment_gov,pct_gov_procurement
0,Austria,2013,16.78,25.87
1,Austria,2014,16.78,25.23
2,Austria,2015,16.94,25.87
3,Austria,2016,16.96,26.4
4,Austria,2017,16.88,26.89


In [82]:
# join dataframes on country and year

df = df_eurostat_wide.merge(procurement, on=["country", "year"], how="left")
df = df.merge(geo, on=["country", "year"], how="left")
df = df.merge(legislator_final, on=["country", "year"], how="left")
df = df.merge(oecd_final, on=["country", "year"], how="left")


In [83]:
df.head()

Unnamed: 0,country,year,buss_gerd,desi_total,dii_high,dii_low,dii_very_high,dii_very_low,gdp,gov_gerd,...,ict_value_added,population,venture,inno_procurement,geo,position_economy,position_efficiency,position_market,pct_employment_gov,pct_gov_procurement
0,Austria,2014,2.2,,,,,,36130.0,0.14,...,4.3,8507786.0,0.008,Good performer,West,5.260071,3.600044,1.052333,16.78,25.23
1,Austria,2015,2.18,,24.1,45.1,3.0,27.9,36140.0,0.14,...,1.77,8584926.0,0.008,Good performer,West,5.260071,3.600044,1.052333,16.94,25.87
2,Austria,2016,2.19,,19.7,46.0,1.8,32.6,36390.0,0.22,...,2.61,8700471.0,,Good performer,West,5.260071,3.600044,1.052333,16.96,26.4
3,Austria,2017,2.14,0.363744,22.0,36.7,3.7,37.6,36980.0,0.22,...,0.43,8772865.0,,Good performer,West,6.870432,6.361617,1.84041,16.88,26.89
4,Austria,2018,2.16,0.384273,8.2,37.2,0.7,53.9,37690.0,0.22,...,1.29,8822267.0,,Good performer,West,6.870432,6.361617,1.84041,16.75,27.1


In [84]:
# show only rows for austria

df[df["country"] == "Austria"]

Unnamed: 0,country,year,buss_gerd,desi_total,dii_high,dii_low,dii_very_high,dii_very_low,gdp,gov_gerd,...,ict_value_added,population,venture,inno_procurement,geo,position_economy,position_efficiency,position_market,pct_employment_gov,pct_gov_procurement
0,Austria,2014,2.2,,,,,,36130.0,0.14,...,4.3,8507786.0,0.008,Good performer,West,5.260071,3.600044,1.052333,16.78,25.23
1,Austria,2015,2.18,,24.1,45.1,3.0,27.9,36140.0,0.14,...,1.77,8584926.0,0.008,Good performer,West,5.260071,3.600044,1.052333,16.94,25.87
2,Austria,2016,2.19,,19.7,46.0,1.8,32.6,36390.0,0.22,...,2.61,8700471.0,,Good performer,West,5.260071,3.600044,1.052333,16.96,26.4
3,Austria,2017,2.14,0.363744,22.0,36.7,3.7,37.6,36980.0,0.22,...,0.43,8772865.0,,Good performer,West,6.870432,6.361617,1.84041,16.88,26.89
4,Austria,2018,2.16,0.384273,8.2,37.2,0.7,53.9,37690.0,0.22,...,1.29,8822267.0,,Good performer,West,6.870432,6.361617,1.84041,16.75,27.1
5,Austria,2019,2.2,0.412194,25.4,36.6,7.6,30.3,38090.0,0.23,...,2.84,8858775.0,,Good performer,West,5.313279,2.965557,0.453749,16.67,27.59
6,Austria,2020,2.23,0.436219,13.2,49.5,1.5,35.7,35480.0,0.24,...,1.24,8901064.0,,Good performer,West,5.313279,2.965557,0.453749,,25.12
7,Austria,2021,2.22,0.505236,22.5,36.9,6.0,34.6,36950.0,0.24,...,,8932664.0,,Good performer,West,5.313279,2.965557,0.453749,,
8,Austria,2022,,0.546757,28.7,35.0,4.5,31.8,38360.0,,...,,8978929.0,,Good performer,West,5.313279,2.965557,0.453749,,


In [85]:
df.to_csv("../data/country_data_all_new.csv", index=False)

In [31]:
# read in data

df = pd.read_csv("../data/country_data_all_new.csv")
df.head()

Unnamed: 0,country,year,buss_gerd,desi_total,dii_high,dii_low,dii_very_high,dii_very_low,gdp,gov_gerd,...,ict_value_added,population,venture,inno_procurement,geo,position_economy,position_efficiency,position_market,pct_employment_gov,pct_gov_procurement
0,Austria,2014,2.2,,,,,,36130.0,0.14,...,4.3,8507786.0,0.008,Good performer,West,5.260071,3.600044,1.052333,16.78,25.23
1,Austria,2015,2.18,,24.1,45.1,3.0,27.9,36140.0,0.14,...,1.77,8584926.0,0.008,Good performer,West,5.260071,3.600044,1.052333,16.94,25.87
2,Austria,2016,2.19,,19.7,46.0,1.8,32.6,36390.0,0.22,...,2.61,8700471.0,,Good performer,West,5.260071,3.600044,1.052333,16.96,26.4
3,Austria,2017,2.14,0.363744,22.0,36.7,3.7,37.6,36980.0,0.22,...,0.43,8772865.0,,Good performer,West,6.870432,6.361617,1.84041,16.88,26.89
4,Austria,2018,2.16,0.384273,8.2,37.2,0.7,53.9,37690.0,0.22,...,1.29,8822267.0,,Good performer,West,6.870432,6.361617,1.84041,16.75,27.1


In [28]:
from tableone import TableOne

df_descriptives = df.copy()
df_descriptives["dii"] = df_descriptives["dii_high"] + df_descriptives["dii_very_high"]
df_descriptives = df_descriptives.drop(columns=["dii_high", "dii_low", "dii_very_high", "dii_very_low", "geo", "inno_procurement"])

# devide gdp by 10000 and population by 1000000

df_descriptives["gdp"] = df_descriptives["gdp"] / 10000
df_descriptives["population"] = df_descriptives["population"] / 1000000

table_descriptives = TableOne(df_descriptives, groupby="country")

table_descriptives.to_csv("../outputs/country_data_descriptives_tableone.csv")

  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  return function_base._ureduce(a,
  return function_base._ureduce(a,
  return f.format(np.nanmean(x.values), self._std(x))
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


In [35]:
# copy only country, inno_procurment and geo columns to df_categorical

df_categorical = df[["country", "inno_procurement", "geo"]]

# keep only unique rows

df_categorical = df_categorical.drop_duplicates()

df_categorical.to_csv("../outputs/country_data_categorical.csv", index=False)


In [7]:
from helper.missing_analysis import missing_values_table

missing_values_table(df)

Your selected dataframe has 23 columns.
There are 14 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
venture,212,84.1
pct_employment_gov,111,44.0
desi_total,99,39.3
pct_gov_procurement,85,33.7
gov_personnel,82,32.5
ict_employment,77,30.6
ict_value_added,75,29.8
ict_percent,73,29.0
dii_high,39,15.5
dii_very_high,39,15.5
