# Imputation and Transformation

In [1]:
# Import libraries
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer, KNNImputer
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset from 'oecd_data.csv' into `df`, the frame the following cells work on.
df = pd.read_csv(filepath_or_buffer='oecd_data.csv')

## Missingness and Imputation

Here, I impute with kNN imputation, by using one-hot encoding of countries

### Delete Countries with a lot of missing data

Some of them have a ton of missing public expenditure data.

In [3]:
# Countries to exclude from `df`.
countries_to_remove = [
    'Turkey',
    'New Zealand',
    'Mexico',
    'Japan',
    'Costa Rica',
    'Colombia',
    'Chile',
    'Canada',
]

# Keep only the rows whose 'country' value is not on the exclusion list.
df = df.loc[~df['country'].isin(countries_to_remove)]

## Remove raw public finance columns

In [4]:
# Raw public finance columns that are dropped from `df`.
columns_to_remove = [
    'public_health_exp',
    'wages_exp',
    'pensions_exp',
    'sickness_disability_exp',
    'unemployment_exp',
    'family_children_exp',
    'subsidies_exp',
    'public_investment_exp',
    'personal_income_tax',
    'property_income',
    'social_security_contrib',
    'corporate_income_tax',
    'environmental_tax',
    'other_consumption_tax',
    'immovable_property_tax',
    'other_property_tax',
    'sales_goods_services',
    'other_nonproperty_tax',
    'education_exp',
    'other_primary_exp',
    'property_income_exp',
]

# `drop` returns a new frame; rebinding `df` leaves the listed columns out from here on.
df = df.drop(columns_to_remove, axis='columns')

# Second name for the same object (an alias, not a copy).
df_to_impute = df

In [5]:
# Count the rows and the columns that contain at least one missing value.
n_rows_with_missingness = df.isna().any(axis='columns').sum()
n_cols_with_missingness = df.isna().any(axis='index').sum()

# Number of missing values in each column.
col_missingness = df_to_impute.isna().sum(axis='index')

In [6]:
# Report the missingness counts computed in the previous cell.
for label, count in (
    ('# rows with missingness:', n_rows_with_missingness),
    ('# columns with missingness:', n_cols_with_missingness),
):
    print(label, count)

# Per-column breakdown, separated from the totals by a blank line.
print(f'\ncolumns with missingness:\n{col_missingness}')

# rows with missingness: 949
# columns with missingness: 45

columns with missingness:
country                                           0
region                                            0
health_exp_pct_gdp                               61
continent                                         0
iso3                                              0
year                                              0
gdp_per_capita                                   20
gdp                                              21
gdp_volume_market_prices                         23
cpi                                             582
exchange_rate_usd                                16
total_economy_output_gap                         75
gov_net_lending_gdp_percentage                   47
gov_net_lending_adj_gdp_percentage               80
current_disbursements_gen_gov                    47
cyclically_adj_current_disbursements_gen_gov     75
current_receipts_gen_gov                         47
cyclically_adj_current_receip

In [7]:
# Columns whose missing values are filled by the imputation cell.
# 'vape_pop_15_over' is currently left out of the list.
variables_to_impute = [
    'life_expectancy', 'alcohol_consume', 'fruit_supply',
    'obese_pop_measured', 'overweight_pop_measured', 'sugar_supply',
    'tobacco_consumption', 'total_calories_supply', 'total_fat_supply',
    'total_protein_supply', 'veggie_supply',
]

In [8]:
# Impute the selected variables separately for each country.
# Rows are sorted by country and year first so that interpolation and the
# forward/backward fills follow each country's rows in year order.
df_to_impute = df.sort_values(by=['country', 'year'])

# TODO (placeholder): consider more robust imputation methods.
# Within each country, every variable is filled in three steps:
#   1. linear interpolation between observed values,
#   2. forward fill (LOCF) for anything still missing,
#   3. backward fill (NOCB) for gaps before the first observation.
# A single grouped transform returns a new, index-aligned frame. It replaces
# the per-country loop that wrote into filtered copies and used chained
# `fillna(method=..., inplace=True)` calls, which are deprecated and do not
# write back under pandas Copy-on-Write.
imputed_values = (
    df_to_impute
    .groupby('country', sort=False)[variables_to_impute]
    .transform(lambda values: values.interpolate(method='linear').ffill().bfill())
)
df_to_impute[variables_to_impute] = imputed_values

# Copy the filled values back into `df`; `update` aligns on the index, so
# `df` keeps its original row order.
df.update(df_to_impute)

# IGNORE

In [9]:
# Write the cleaned frame to 'oecd_data_cleaned.csv', leaving out the index column.
df.to_csv(path_or_buf='oecd_data_cleaned.csv', index=False)