In [None]:
import pandas as pd
import os

In [None]:
# Collect the path of every indicator CSV in the raw-data folder. The list is
# sorted because os.listdir() returns entries in arbitrary order, which would
# otherwise let the column order of the merged frame change between runs.
raw_dir = '../data/raw/'
csv_paths = sorted(raw_dir + file for file in os.listdir(raw_dir) if file.endswith('.csv'))
if not csv_paths:
    # Fail here with a clear message instead of leaving df as None for later cells.
    raise FileNotFoundError(f'no indicator CSV files found in {raw_dir}')

df = None     # merged table, outer-joined on country and year, one column per indicator
flag = False  # becomes True once df holds the first indicator

for path in csv_paths:

    # Column labels are read from row index 2 of the file (header=2); the
    # 'Unnamed: 67' column and the two code columns are dropped.
    indicator_df = pd.read_csv(path, header=2).drop(['Unnamed: 67', 'Country Code', 'Indicator Code'], axis=1)

    # Wide -> long: every remaining column becomes a 'Year' row, and the value
    # column is named after the first entry of 'Indicator Name'.
    indicator_name = indicator_df['Indicator Name'][0]
    indicator_df = pd.melt(
        indicator_df,
        id_vars=['Country Name', 'Indicator Name'],
        value_name=indicator_name,
        var_name='Year',
    ).drop('Indicator Name', axis=1)

    if flag:
        # Outer join so country/year pairs missing from one indicator are kept.
        df = pd.merge(left=df, right=indicator_df, on=['Country Name', 'Year'], how='outer')
    else:
        df = indicator_df
        flag = True

In [None]:
# Preview the first rows of the merged indicator table.
df.head()  #type: ignore

In [None]:
# Columns kept for the analysis: the two key columns plus a fixed set of indicators.
feature_names = [
    'Year',
    'Educational attainment, at least completed lower secondary, population 25+, total (%) (cumulative)',
    'Educational attainment, Doctoral or equivalent, population 25+, total (%) (cumulative)',
    'Inflation, consumer prices (annual %)',
    'GDP (current US$)',
    'Cause of death, by injury (% of total)',
    "Educational attainment, at least Master's or equivalent, population 25+, total (%) (cumulative)",
    'Educational attainment, at least completed short-cycle tertiary, population 25+, total (%) (cumulative)',
    'Population, total',
    'Unemployment, total (% of total labor force) (modeled ILO estimate)',
    'Hospital beds (per 1,000 people)',
    'Life expectancy at birth, total (years)',
    'Physicians (per 1,000 people)',
    'Country Name',
    'Gini index',
]

# Also keep every column whose name contains 'Poverty', in the order it appears in df.
feature_names += [col for col in df.columns if 'Poverty' in col]  #type:ignore

# Restrict the table to the selected columns.
df = df[feature_names] #type:ignore

In [None]:
# Preview the table again now that it is restricted to the selected feature columns.
df.head()

In [None]:
# Cast the key columns: country becomes a categorical, year a nullable integer.
df = df.astype({
    'Country Name': 'category',
    'Year': 'Int64',
})

In [None]:
from copy import deepcopy  # still needed: interpolate() further down calls deepcopy

# Independent copy of the rows that have an observed Gini index.
gini = df[df['Gini index'].notna()].copy()

# Fraction of missing values in each column (0-1), largest first.
(gini.isna().sum() / len(gini)).sort_values(ascending=False)

In [None]:
# Number of rows that have an observed Gini index.
len(gini)

In [None]:
def interpolate(radius, target=None, source=None):
    """Fill missing indicator values with the mean of nearby years of the same country.

    For every column other than 'Country Name' and 'Year', each missing cell of
    ``target`` is replaced by the mean of that column over the rows of
    ``source`` that belong to the same country and whose year is at most
    ``radius`` away. The mean skips missing values; if the window holds no
    observed value the cell stays NaN. Fill values are always read from
    ``source``, never from already-filled cells, so fills do not cascade.

    Parameters
    ----------
    radius : int
        Maximum absolute distance in years between the row being filled and
        the rows averaged.
    target : pandas.DataFrame, optional
        Frame whose gaps are filled. Defaults to the module-level ``gini``.
    source : pandas.DataFrame, optional
        Frame the window means are taken from; must contain 'Country Name',
        'Year' and every column being filled. Defaults to the module-level
        ``df``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``target`` with the gaps filled; ``target`` is not modified.

    Prints ``radius: <radius>`` once the frame has been processed.
    """
    # Resolve the defaults at call time so the notebook-level frames are used
    # exactly as before when the function is called with only a radius.
    if target is None:
        target = gini
    if source is None:
        source = df

    new_df = target.copy()
    source_country = source['Country Name']
    source_year = source['Year']

    for col in new_df.columns:
        if col in ('Country Name', 'Year'):
            continue

        missing = new_df[col].isna()
        if not missing.any():
            continue

        # One window mean per missing cell, in row order of the missing cells.
        fills = []
        for country, year in zip(new_df.loc[missing, 'Country Name'], new_df.loc[missing, 'Year']):
            window = (source_country == country) & (abs(source_year - year) <= radius)
            fills.append(source.loc[window, col].mean())

        # Write only into the cells that were missing, so an observed value in
        # another row with the same country and year is never overwritten.
        new_df.loc[missing, col] = fills

    print(f'radius: {radius}')
    return new_df

In [None]:
# Per-radius summary, stored column-wise: one list per reported quantity.
interpol_meta = {name: [] for name in ('radius', 'size', 'countries', 'yearFrom', 'yearTo')}

for rad in range(1,6):
    # Impute with this radius, then keep only the rows without any missing value.
    rad_df = interpolate(rad).dropna()
    #rad_df.to_csv(f'../data/interim/radius{rad}.csv')

    row = {
        'radius': rad,
        'size': len(rad_df),
        'countries': rad_df['Country Name'].nunique(),
        'yearFrom': rad_df['Year'].min(),
        'yearTo': rad_df['Year'].max(),
    }
    for name, value in row.items():
        interpol_meta[name].append(value)

In [None]:
# Show the per-radius summary as a table: one row per radius tried.
pd.DataFrame(interpol_meta)

In [None]:
# Sample carried forward: impute with a 3-year radius, then keep complete rows only.
sample = interpolate(radius=3)
sample = sample.dropna()

sample.head(n=5)

In [None]:
# Distinct country names that remain in the sample after imputation and dropna.
sample['Country Name'].unique().tolist() #list of countries kept

In [None]:
# Derived feature: GDP divided by total population, row by row.
sample['GDP per capita'] = sample['GDP (current US$)'].div(sample['Population, total'])

# Persist the sample (index included) for the next stage.
output_path = '../data/interim/radius3_selected_features.csv'
sample.to_csv(output_path)