# Imputation and Transformation

In [1]:
# Import libraries
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer, KNNImputer
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the OECD dataset from a CSV located relative to the working directory.
# NOTE: later cells write the processed frame back to this same path.
df = pd.read_csv('oecd_data.csv')

## Missingness and Imputation

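The code below imputes missing values one country at a time: linear interpolation along the year-sorted series, followed by forward fill (LOCF) and backward fill (NOCB). This is a placeholder; kNN imputation using a one-hot encoding of countries is not implemented yet (the encoding step below is commented out).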

In [3]:
# Disabled: encode the categorical country code (iso3) using one-hot encoding.
# The per-country imputation in this notebook does not use `df_dummy`.
# df_dummy = pd.get_dummies(df, columns=['iso3'], drop_first=True)

In [4]:
# Columns excluded from the imputation frame: CPI plus the expenditure
# breakdowns. `life_expectancy` is not in this list, so it stays in
# `df_to_impute` alongside the remaining columns.
columns_to_drop = [
    'cpi',
    'education_exp',
    'public_health_exp',
    'wages_exp',
    'pensions_exp',
    'sickness_disability_exp',
    'unemployment_exp',
    'family_children_exp',
]
df_to_impute = df.drop(labels=columns_to_drop, axis='columns')

In [5]:
# Tally missing values.
# NOTE(review): the two summary counts are computed over the full `df`, while
# the per-column breakdown is computed over `df_to_impute` — confirm that
# mixing the two frames is intended.
full_na_mask = df.isna()
n_rows_with_missingness = full_na_mask.any(axis='columns').sum()
n_cols_with_missingness = full_na_mask.any(axis='index').sum()

# Number of missing entries in each column of the imputation frame.
col_missingness = df_to_impute.isna().sum(axis='index')

In [6]:
# Report the missingness summary: the two totals first, then the per-column counts.
for label, count in (
    ('# rows with missingness:', n_rows_with_missingness),
    ('# columns with missingness:', n_cols_with_missingness),
):
    print(label, count)
print(f'\ncolumns with missingness:\n{col_missingness}')

# rows with missingness: 1241
# columns with missingness: 24

columns with missingness:
country                       0
region                        0
health_exp_pct_gdp          100
continent                     0
iso3                          0
year                          0
gdp_per_capita               20
gdp                          22
gdpv                         24
total_population              0
alcohol_consume              30
fruit_supply                 73
obese_pop_measured         1083
overweight_pop_measured    1083
sugar_supply                 73
tobacco_consumption         563
total_calories_supply        73
total_fat_supply             73
total_protein_supply         73
vape_pop_15_over           1134
veggie_supply                73
life_expectancy              15
dtype: int64


In [7]:
# Columns whose gaps are filled by the imputation step.
# NOTE(review): the printed missingness summary also lists gaps in
# health_exp_pct_gdp, gdp_per_capita, gdp and gdpv; they are not in this list,
# so they are left as-is — confirm that is intended.
variables_to_impute = [
    'life_expectancy',
    'alcohol_consume', 'fruit_supply',
    'obese_pop_measured', 'overweight_pop_measured',
    'sugar_supply', 'tobacco_consumption',
    'total_calories_supply', 'total_fat_supply', 'total_protein_supply',
    'vape_pop_15_over', 'veggie_supply',
]

In [8]:
# Order rows chronologically within each country so that interpolation and the
# forward/backward fills run along the time axis.
df_to_impute = df_to_impute.sort_values(by=['country', 'year'])


def _fill_gaps_within_country(values):
    """Fill missing entries in one country's time-ordered data.

    Parameters
    ----------
    values : pandas.Series or pandas.DataFrame
        Data for a single country, already sorted by year.

    Returns
    -------
    Same type and index as ``values``. Gaps are linearly interpolated
    (observations are treated as equally spaced), then anything still missing
    is filled forward (LOCF) and finally backward (NOCB), which covers gaps at
    the start of the series. A column with no observations at all for the
    country stays missing.
    """
    return values.interpolate(method='linear').ffill().bfill()


# PLACEHOLDER: must consider more robust imputation methods
# Impute every country independently in a single grouped pass. This avoids
# assigning into a filtered slice and calling in-place fillna on a column
# selection, neither of which is guaranteed to modify the frame (and the
# resulting warnings are hidden by the notebook-wide warnings filter).
df_to_impute[variables_to_impute] = (
    df_to_impute
    .groupby('country', sort=False)[variables_to_impute]
    .transform(_fill_gaps_within_country)
)

# Copy the imputed values back into the full frame. DataFrame.update aligns on
# the row index and only overwrites with non-missing values.
df.update(df_to_impute)

# NOTE: this writes to the same path the data was loaded from, replacing the
# original file contents with the imputed data.
df.to_csv('oecd_data.csv', index=False)

# IGNORE

In [9]:
# Save CSV
# Writes the full frame `df` to 'oecd_data.csv' without the index column.
# NOTE(review): the imputation cell earlier in the notebook already ends with
# this same call, so this cell repeats the write.
df.to_csv('oecd_data.csv', index=False)