In [1]:
import pandas as pd

# Dataset Merging

Here we aggregate several datasets from the OECD database into a single dataframe.

In [2]:
# Public finance data is the base frame that the other datasets get merged onto
base_df = pd.read_csv(filepath_or_buffer='data/public_finance.csv')

## Merge Population Data

In [3]:
# Load the population data and give its key/value columns the names used for merging
pop_df = pd.read_csv('data/population.csv').rename(
    columns={
        'LOCATION': 'iso3',
        'Time': 'year',
        'Value': 'total_population',
    }
)

In [4]:
# Left-join the population figures onto the base frame by country code and year
base_df = pd.merge(
    base_df,
    pop_df.loc[:, ['iso3', 'year', 'total_population']],
    how='left',
    on=['iso3', 'year'],
)

## Pivot and Merge Nonhealth Predictors

In [5]:
# Load the long-format nonhealth predictors (one row per country, year and variable)
nonhealth_df = pd.read_csv('data/nonhealth_predictors.csv')

# Reshape to wide format: one row per (COU, Year), one column per Variable.
# The first reset_index() turns COU and Year back into ordinary columns; the
# second one additionally inserts the 0..n-1 row numbers as a leading 'index'
# column, which a later cell removes again.
pivot_nonhealth_df = (
    nonhealth_df
    .pivot(index=['COU', 'Year'], columns='Variable', values='Value')
    .reset_index()
    .reset_index()
)

# Clear the 'Variable' label that pivot leaves on the column axis
pivot_nonhealth_df.columns.name = None

In [6]:
# Map the raw key and variable labels to short snake_case column names
rename_dict = {
    # merge keys
    'COU': 'iso3',
    'Year': 'year',
    # predictor variables
    'Alcohol consumption': 'alcohol_consume',
    'Fruits supply': 'fruit_supply',
    'Obese population, measured': 'obese_pop_measured',
    'Overweight population, measured': 'overweight_pop_measured',
    'Sugar supply': 'sugar_supply',
    'Tobacco consumption': 'tobacco_consumption',
    'Total calories supply': 'total_calories_supply',
    'Total fat supply': 'total_fat_supply',
    'Total protein supply': 'total_protein_supply',
    'Vaping population: e-cigarettes use, adults (aged 15+)': 'vape_pop_15_over',
    'Vegetables supply': 'veggie_supply',
}

# Apply the mapping; columns not listed in the dictionary keep their current name
pivot_nonhealth_df = pivot_nonhealth_df.rename(columns=rename_dict)

In [7]:
# Drop the leftover 'index' column created by the second reset_index() above.
# Dropping by name (rather than by position 0) keeps this cell safe to re-run:
# a second run is a no-op instead of silently removing the next column in line
# (which would be the 'iso3' merge key).
pivot_nonhealth_df = pivot_nonhealth_df.drop(columns=['index'], errors='ignore')

In [8]:
# Left-join every pivoted nonhealth predictor onto the base frame by country code and year
base_df = pd.merge(base_df, pivot_nonhealth_df, how='left', on=['iso3', 'year'])

## Merge Life Expectancy Data

In [9]:
# Load the life expectancy data and give its key/value columns the names used for merging
lifeexp_df = pd.read_csv('data/life_expectancy.csv').rename(
    columns={
        'COU': 'iso3',
        'Year': 'year',
        'Value': 'life_expectancy',
    }
)

In [10]:
# Left-join life expectancy onto the base frame by country code and year
base_df = pd.merge(
    base_df,
    lifeexp_df.loc[:, ['iso3', 'year', 'life_expectancy']],
    how='left',
    on=['iso3', 'year'],
)

## Merge Health Expenditures Data

In [11]:
# Load the health expenditure data and give its key/value columns the names used for merging
healthexp_df = pd.read_csv('data/health_expenditures.csv').rename(
    columns={
        'LOCATION': 'iso3',
        'Year': 'year',
        'Value': 'health_exp_pct_gdp',
    }
)

In [12]:
# Left-join health expenditure onto the base frame by country code and year
base_df = pd.merge(
    base_df,
    healthexp_df.loc[:, ['iso3', 'year', 'health_exp_pct_gdp']],
    how='left',
    on=['iso3', 'year'],
)

## Merge GDP Per Capita Data

In [13]:
# Load the GDP per capita data and give its key/value columns the names used for merging
gdp_df = pd.read_csv('data/gdp_per_capita.csv').rename(
    columns={
        'COU': 'iso3',
        'Year': 'year',
        'Value': 'gdp_per_capita',
    }
)

In [14]:
# Left-join GDP per capita onto the base frame by country code and year
base_df = pd.merge(
    base_df,
    gdp_df.loc[:, ['iso3', 'year', 'gdp_per_capita']],
    how='left',
    on=['iso3', 'year'],
)

### Save Fully Merged Dataframe

In [15]:
# Preview the first five rows of the fully merged frame
base_df.head(n=5)

Unnamed: 0,iso3,year,expitem1,expitem10,expitem11,expitem2,expitem3,expitem4,expitem5,expitem6,...,sugar_supply,tobacco_consumption,total_calories_supply,total_fat_supply,total_protein_supply,vape_pop_15_over,veggie_supply,life_expectancy,health_exp_pct_gdp,gdp_per_capita
0,AUS,1945,,,,,,,,,...,,,,,,,,,,
1,AUS,1946,,,,,,,,,...,,,,,,,,,,
2,AUS,1947,,,,,,,,,...,,,,,,,,,,
3,AUS,1948,,,,,,,,,...,,,,,,,,,,
4,AUS,1949,,,,,,,,,...,,,,,,,,,,


In [16]:
# Persist the fully merged frame to CSV, leaving out the row index
base_df.to_csv(path_or_buf='oecd_data.csv', index=False)

# Data Cleaning

Now we can start cleaning with the fully merged dataframe.

In [17]:
# Reload the merged dataset from disk as the starting point for cleaning
df = pd.read_csv(filepath_or_buffer='oecd_data.csv')

## Filter Rows

In [18]:
# Keep only observations from 1990 through 2023 (both bounds included)
df = df[df['year'].between(1990, 2023)]

## Drop Columns

In [19]:
# Columns that are not carried forward into the cleaned dataset
columns_to_remove = [
    # expenditure items 8-11
    'expitem8', 'expitem9', 'expitem10', 'expitem11',
    # revenue items 1-10
    'revitem1', 'revitem2', 'revitem3', 'revitem4', 'revitem5',
    'revitem6', 'revitem7', 'revitem8', 'revitem9', 'revitem10',
    # remaining public finance indicators
    'exch', 'gap', 'nlgq', 'nlgqa', 'ypg', 'ypga', 'yrg', 'yrga', 'ggflm',
]

# Drop them along the column axis; a missing name raises KeyError
df = df.drop(columns_to_remove, axis='columns')

## Rename Columns

In [20]:
# Descriptive names for the expenditure item columns that are kept
column_name_mapping = dict(
    expitem1='education_exp',
    expitem2='public_health_exp',
    expitem3='wages_exp',
    expitem4='pensions_exp',
    expitem5='sickness_disability_exp',
    expitem6='unemployment_exp',
    expitem7='family_children_exp',
)

# Apply the mapping along the column axis
df = df.rename(column_name_mapping, axis='columns')

## Map Country Names to ISO3

In [21]:
# List the distinct country codes in the data, to see which ones need a full country name
unique_iso3_values = df.loc[:, 'iso3'].unique()
print(unique_iso3_values)

['AUS' 'AUT' 'BEL' 'BGR' 'CAN' 'CHE' 'CHL' 'COL' 'CRI' 'CZE' 'DEU' 'DNK'
 'ESP' 'EST' 'FIN' 'FRA' 'GBR' 'GRC' 'HUN' 'IRL' 'ISL' 'ISR' 'ITA' 'JPN'
 'KOR' 'LTU' 'LUX' 'LVA' 'MEX' 'NLD' 'NOR' 'NZL' 'POL' 'PRT' 'SVK' 'SVN'
 'SWE' 'TUR' 'USA']


In [22]:
# Lookup from ISO3 country code to full country name (one entry per code printed above)
country_code_to_name = {
    'AUS': 'Australia', 'AUT': 'Austria', 'BEL': 'Belgium',
    'BGR': 'Bulgaria', 'CAN': 'Canada', 'CHE': 'Switzerland',
    'CHL': 'Chile', 'COL': 'Colombia', 'CRI': 'Costa Rica',
    'CZE': 'Czech Republic', 'DEU': 'Germany', 'DNK': 'Denmark',
    'ESP': 'Spain', 'EST': 'Estonia', 'FIN': 'Finland',
    'FRA': 'France', 'GBR': 'United Kingdom', 'GRC': 'Greece',
    'HUN': 'Hungary', 'IRL': 'Ireland', 'ISL': 'Iceland',
    'ISR': 'Israel', 'ITA': 'Italy', 'JPN': 'Japan',
    'KOR': 'South Korea', 'LTU': 'Lithuania', 'LUX': 'Luxembourg',
    'LVA': 'Latvia', 'MEX': 'Mexico', 'NLD': 'Netherlands',
    'NOR': 'Norway', 'NZL': 'New Zealand', 'POL': 'Poland',
    'PRT': 'Portugal', 'SVK': 'Slovakia', 'SVN': 'Slovenia',
    'SWE': 'Sweden', 'TUR': 'Turkey', 'USA': 'United States',
}

# Add a 'country' column with the full name for each iso3 code;
# codes missing from the lookup become NaN
df = df.assign(country=df['iso3'].map(country_code_to_name))

In [23]:
#### CONTINENT AND REGION
# Load the geographical lookup and rename its columns to the names used in df
geo_df = pd.read_csv('data/geographical.csv').rename(
    columns={
        'Country Name': 'country',
        'Continent': 'continent',
        'Geographical Region': 'region',
    }
)

# Left-join continent and region onto df by country name
df = df.merge(
    geo_df.loc[:, ['country', 'continent', 'region']],
    how='left',
    on='country',
)

In [24]:
# Define a function to move a column to a specific position
def move_column(df, column_name, position):
    """Move ``column_name`` to integer ``position`` in ``df``, in place.

    The column is removed and re-inserted, so ``position`` is interpreted
    against the columns that remain once ``column_name`` has been taken out.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to reorder; it is modified in place and nothing is returned.
    column_name : label
        Existing column to move.
    position : int
        Target location, ``0 <= position <= len(df.columns) - 1``.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``df``.
    IndexError
        If ``position`` is outside the valid range. The check runs before the
        column is removed, so a bad position does not leave ``df`` without
        the column.
    """
    if column_name not in df.columns:
        raise KeyError(column_name)

    # Once the column is popped there is one column fewer, so the last slot
    # that insert() can accept is len(df.columns) - 1 as counted right now.
    last_valid = len(df.columns) - 1
    if not 0 <= position <= last_valid:
        raise IndexError(
            f"position {position} is out of bounds for moving '{column_name}' "
            f"(valid range: 0..{last_valid})"
        )

    column = df.pop(column_name)
    df.insert(position, column_name, column)

# Reorder columns one move at a time. Each target position is looked up
# immediately before its move, so earlier moves are already reflected.

# country goes directly in front of iso3
move_column(df, column_name="country", position=df.columns.get_loc("iso3"))

# region goes directly behind iso3 ...
move_column(df, column_name="region", position=df.columns.get_loc("iso3") + 1)

# ... and continent is then placed between iso3 and region
move_column(df, column_name="continent", position=df.columns.get_loc("iso3") + 1)

# gdp_per_capita goes directly behind gdp
move_column(df, column_name="gdp_per_capita", position=df.columns.get_loc("gdp") + 1)

# health_exp_pct_gdp goes directly behind year
move_column(df, column_name="health_exp_pct_gdp", position=df.columns.get_loc("year") + 1)

# Data Transformation

In [25]:
# Expenditure columns that the (currently disabled) ratio calculations below iterate over
expense_columns = [
    'education_exp',
    'public_health_exp',
    'wages_exp',
    'pensions_exp',
    'sickness_disability_exp',
    'unemployment_exp',
    'family_children_exp',
]

# Disabled: share of GDP for each expenditure column, stored with a "_pct_gdp" suffix
# for col in expense_columns:
#     new_col_name = col + '_pct_gdp'
#     df[new_col_name] = (df[col] / df['gdp'] * 100).round(2)

# Disabled: per-capita value for each expenditure column, stored with a "_per_capita" suffix
# NOTE(review): the "* 100" looks carried over from the percentage version above;
# confirm whether it is intended before enabling.
# for col in expense_columns:
#     new_col_name = col + '_per_capita'
#     df[new_col_name] = (df[col] / df['total_population'] * 100).round(2)

In [26]:
# # Get the index of the "family_children_exp" column
# family_children_index = df.columns.get_loc("family_children_exp")

# # Get the newly generated columns with "_per_capita" suffix
# per_capita_columns = [col + '_per_capita' for col in expense_columns]

# # Move the per capita columns after the "family_children_exp" column
# for col in per_capita_columns:
#     df.insert(family_children_index + 1, col, df.pop(col))

# MISSING DATA AND IMPUTATION

# Save CSV

In [27]:
# Write the cleaned frame to CSV without the row index.
# NOTE: this is the same path the merged frame was saved to earlier in the
# notebook, so that file is overwritten with the cleaned data.
df.to_csv(path_or_buf='oecd_data.csv', index=False)