In [1]:
import pandas as pd
import numpy as np
from countryinfo import CountryInfo
import pycountry

KeyboardInterrupt: 

In [None]:

def add_alpha_codes_from_ISO(df, col):
    """
    adds a column of alpha3 codes to a dataframe with country name in column 'col'
    """
    input_countries = df[col]
    countries = []
    for input_country in input_countries:
        try:
            country = pycountry.countries.get(alpha_3=input_country)
            alpha2 = country.alpha_2
        except:
            alpha2 = "unk_" + str(input_country)
        countries.append(alpha2)
    df["alpha2"] = countries
    return df



In [None]:

df_input = pd.read_csv("../data/country_data_master.csv")
# df_input = pd.read_csv("../data/country_data_master.csv", index_col="alpha3")


In [None]:
iso_num_col = "ISO Code"
col = "Country"
alpha3_col = "alpha3"
alpha2_col = "alpha2"
df_original = df_input



df = add_alpha_codes_from_ISO(df_original, alpha3_col)

# filename = "scratch_new_country_data"
# df.to_csv(filename + ".csv")


Cell below appends the region, subregion, and bordering countries to the country. This is for future use for interpolation.

In [None]:
input_countries = df[alpha2_col]
bordering_countries = []
region = []
subregion = []
for input_country in input_countries:
    try:
        country = CountryInfo(input_country)
    except:
        print("couldn't find country")

    try:
        country_bordering_countries = ",".join(str(x) for x in country.borders())
    except:
        country_bordering_countries = "unk_" + str(input_country)
    
    try:
        country_region = country.region()
    except:
        country_region = "unk_" + str(input_country)

    try:
        country_subregion = country.subregion()
    except:
        country_subregion = "unk_" + str(input_country)
    


    
    bordering_countries.append(country_bordering_countries)
    region.append(country_region)
    subregion.append(country_subregion)
df["borders"] = bordering_countries
df["region"] = region
df["subregion"] = subregion

# write csv to ("../data/Country Data before Interp.csv")
df.to_csv("../data/Country Data before Interp.csv")




Loop over all the variables to interpolate, find a NaN, and then use the three staged interpolation process:
1. Take an average of all the bordering countries (if those countries have a value that could be used)
2. Take an average of the countries in that sub region 
3. Take an average of the countries in the region (continent)

In [None]:
# alternative entry point given exisiting data
df_borders = pd.read_csv("../data/processed/country_borders.csv", index_col="alpha3")

df = pd.read_csv("../data/processed/WHO_JMP_alpha_Codes_processed.csv", index_col="alpha3")

df_countries = pd.read_csv("../data/country_data_master_interpolated.csv", index_col="alpha3")

# keep only the countries country_data_master_interpolated.csv
# drop all columns, only keep the index
df_countries = df_countries.drop(df_countries.columns, axis=1)

# merge df with df countries, only keep those countries in master
df = df.merge(df_countries, how="right", left_index=True, right_index=True)


In [None]:
df_borders

Unnamed: 0_level_0,Entity,alpha2,region,subregion,borders
alpha3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ABW,Aruba,AW,Americas,Caribbean,
AFG,Afghanistan,AF,Asia,Southern Asia,"IRN,PAK,TKM,UZB,TJK,CHN"
AGO,Angola,AO,Africa,Middle Africa,"COG,COD,ZMB,NAM"
AIA,Anguilla,AI,Americas,Caribbean,
ALB,Albania,AL,Europe,Southern Europe,"MNE,GRC,MKD"
...,...,...,...,...,...
WSM,Samoa,WS,Oceania,Polynesia,
YEM,Yemen,YE,Asia,Western Asia,"OMN,SAU"
ZAF,South Africa,ZA,Africa,Southern Africa,"BWA,LSO,MOZ,NAM,SWZ,ZWE"
ZMB,Zambia,ZM,Africa,Eastern Africa,"AGO,BWA,COD,MWI,MOZ,NAM,TZA,ZWE"


In [None]:
list_of_vars = ['PBO',
       'Terrain Ruggedness', 'Urban %', 'Urban Agg %', 'RoadQuality', 'Km',
       'Urb %', 'Average household size (number of members)',
       'National At Least Basic', 'National Limit (more than 30 mins)',
       'National Unimproved', 'National Surface Water',
       'Nat Accesible On Premises', 'Nat Piped', 'Nat NonPiped','Average Weight','percent_insufficient_activity']

list_of_vars = ['% urban', 'TOTALUnimproved', 'RURALUnimproved', 'URBANUnimproved',       
       'TOTALPiped', 'TOTALNon-piped', 'RURALPiped', 'RURALNon-piped', 'URBANPiped', 'URBANNon-piped']


alpha2_col = "alpha2"
alpha3_col = "alpha3"

df_input = df.merge(df_borders, how="left", on=alpha3_col)
# set index to alpha3

df_output = df_input

df_interp_track = pd.DataFrame(df_input.index)
#set index to alpha3
df_interp_track = df_interp_track.set_index(alpha3_col)
df_interp_track[list_of_vars]="none"

for variable in list_of_vars:
    # convert column to float
    df_input[variable] = df_input[variable].astype(float)

    ## for loop in the future
    #find location of nan values
    nan_locations = df_input[variable].isna()
    countries_requiring_interp = df_input[nan_locations].index

    #next for loop goes here 
    for country in countries_requiring_interp:
        #see if the borders can be used to interpolate
        borders = df_input.loc[country, "borders"]
        borders = str(borders)
        border_list = borders.split(",")
        try:
            avg_var_border = (df_input.loc[border_list, variable].mean())
        except:
            avg_var_border = np.nan
        # see if the subregion can be used to interpolate
        # find other countries in the subregion
        subregion_list = df_input[df_input["subregion"]==df_input.loc[country, "subregion"]].index
        avg_var_subregion = (df_input.loc[subregion_list, variable].mean())
        # assume the region CAN be used to interpolate
        region_list = df_input[df_input["region"]==df_input.loc[country, "region"]].index
        avg_var_region = (df_input.loc[region_list, variable].mean())
        # try border, then subregion, then region
        if ~np.isnan(avg_var_border):
            df_output.loc[country, variable] = avg_var_border
            df_interp_track.loc[country, variable] = "border interpolated"
        elif ~np.isnan(avg_var_subregion):
            df_output.loc[country, variable] = avg_var_subregion
            df_interp_track.loc[country, variable] = "subregion interpolated"
        else:
            df_output.loc[country, variable] = avg_var_region
            df_interp_track.loc[country, variable] = "region interpolated"



In [None]:
df_input.head(30)

Unnamed: 0_level_0,Unnamed: 0,Country,% urban,TOTALUnimproved,RURALUnimproved,URBANUnimproved,TOTALPiped,TOTALNon-piped,RURALPiped,RURALNon-piped,URBANPiped,URBANNon-piped,Entity,alpha2,region,subregion,borders
alpha3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
ABW,,,65.470588,2.222222,7.714286,2.0,87.777778,9.888889,64.857143,25.571429,80.9,16.9,Aruba,AW,Americas,Caribbean,
AFG,0.0,Afghanistan,27.0,10.0,14.0,0.0,23.0,60.0,17.0,61.0,42.0,58.0,Afghanistan,AF,Asia,Southern Asia,"IRN,PAK,TKM,UZB,TJK,CHN"
AGO,4.0,Angola,68.0,19.0,23.0,18.0,43.0,24.0,8.0,28.0,59.0,22.0,Angola,AO,Africa,Middle Africa,"COG,COD,ZMB,NAM"
AIA,5.0,Anguilla,100.0,3.0,7.714286,3.0,100.0,0.0,64.857143,25.571429,100.0,0.0,Anguilla,AI,Americas,Caribbean,
ALB,1.0,Albania,64.0,3.0,4.0,3.0,81.0,16.0,76.0,21.0,84.0,14.0,Albania,AL,Europe,Southern Europe,"MNE,GRC,MKD"
AND,3.0,Andorra,88.0,0.0,0.0,0.0,100.0,0.0,100.0,0.0,100.0,0.0,Andorra,AD,Europe,Southern Europe,"FRA,ESP"
ARE,200.0,United Arab Emirates,88.0,0.0,0.0,0.0,100.0,0.0,100.0,0.0,100.0,0.0,United Arab Emirates,AE,Asia,Western Asia,"OMN,SAU"
ARG,7.0,Argentina,92.0,0.0,3.0,0.0,96.0,2.0,82.0,11.0,100.0,0.0,Argentina,AR,Americas,South America,"BOL,BRA,CHL,PRY,URY"
ARM,8.0,Armenia,64.0,0.0,0.0,0.0,100.0,0.0,100.0,0.0,100.0,0.0,Armenia,AM,Asia,Western Asia,"AZE,GEO,IRN,TUR"
ASM,,,36.625,0.375,0.0,0.0,95.875,3.375,95.8,3.4,98.0,2.0,American Samoa,AS,Oceania,Polynesia,


Save dataframes as CSV

In [None]:
# df_input[variable].isna()


filename = "WHOinterpTrack3"
df_interp_track.to_csv(filename + ".csv")
filename = "WHOinterp3"
df_output.to_csv(filename + ".csv")
