# Preparation of other predictor variables

@author: Caroline Gasten

This script prepares four yearly variables that could not be obtained from the WPS DataCube. Instead, they were downloaded from their original sources (see the thesis for a more detailed explanation) and stored in a directory for raw predictor variables. In the following, each variable is prepared, and all of them are then stored together in one dataframe.

## Settings

In [None]:
#import packages
import pandas as pd
import os
import numpy as np

In [None]:
#required paths
#TODO: replace both placeholders with the real directories before running the notebook;
#"." (the current working directory) only keeps the cell syntactically valid
path_input = "."   #path to raw predictor variables
path_output = "."  #path to store prepared predictor variables

In [None]:
#countries of wider study area for RF Model, as three-letter codes
countries = "KEN ETH UGA SSD SOM SDN".split()
#full country names used to select rows from the WPP tables
#(this list has five entries; there is no long name matching 'SDN')
countries_long = [
    "Kenya",
    "Ethiopia",
    "Uganda",
    "South Sudan",
    "Somalia",
]

## sanitationaccess

In [None]:
#read the JMP file of each country and keep only the columns needed for
#'at least basic' sanitation access
jmp_columns = ["Country", "Year", "Setting", "At least basic (improved and not shared)"]


def _read_jmp(filename):
    """Load one JMP csv file from path_input and return the selected columns."""
    table = pd.read_csv(os.path.join(path_input, filename), delimiter=';', header=[3])
    return table.loc[:, jmp_columns]


df_san_eth = _read_jmp("JMP_2021_ETH_Ethiopia.csv")
df_san_ken = _read_jmp("JMP_2021_KEN_Kenya.csv")
df_san_sdn = _read_jmp("JMP_2021_SDN_Sudan.csv")
df_san_som = _read_jmp("JMP_2021_SOM_Somalia.csv")
df_san_ssd = _read_jmp("JMP_2021_SSD_South_Sudan.csv")
df_san_uga = _read_jmp("JMP_2021_UGA_Uganda.csv")

In [None]:
#fill data before 2011 for South Sudan with values from Sudan
#rows are matched on (Year, Setting) instead of by position, so the fill stays
#correct even if the two files differ in row order or number of rows
san_col = "At least basic (improved and not shared)"
ssd_pre2011 = df_san_ssd["Year"].isin(range(2011))
sdn_pre2011 = df_san_sdn["Year"].isin(range(2011))
#Sudan values addressable by their (Year, Setting) key
sdn_values = df_san_sdn.loc[sdn_pre2011].set_index(["Year", "Setting"])[san_col]
#keys of the South Sudan rows to overwrite, in the row order of df_san_ssd
ssd_keys = pd.MultiIndex.from_frame(df_san_ssd.loc[ssd_pre2011, ["Year", "Setting"]])
#.to_numpy() passes plain values, so no second index-based alignment takes place;
#keys missing in the Sudan table yield NaN
df_san_ssd.loc[ssd_pre2011, san_col] = sdn_values.reindex(ssd_keys).to_numpy()

In [None]:
#stack the per-country tables into one dataframe
#(df_san_sdn is not part of the list; original row labels are kept)
frames_san = [df_san_eth, df_san_ken, df_san_uga, df_san_ssd, df_san_som]
df_san = pd.concat(frames_san)

In [None]:
#keep only the rows with Setting "National", then drop the Setting column
#and give the value column the short name "sanitationaccess"
national_rows = df_san[df_san["Setting"] == "National"]
without_setting = national_rows.drop(columns="Setting")
df_san_ss = without_setting.rename(
    columns={"At least basic (improved and not shared)": "sanitationaccess"}
)

In [None]:
#lag variable by 4 years (as done by Kuzma et al. (2020)):
#a value reported for year Y is stored under year Y+4
df_san_ss["Year"] = df_san_ss["Year"].add(4)

In [None]:
#initialize dataframe with yearly variables with "sanitationaccess" as first variable
#(plain assignment: df_yearlyvars refers to the same object as df_san_ss, no copy is made)
df_yearlyvars = df_san_ss

## sex_ratio_25-64, sex_ratio_65+

In [None]:
#open WPP dataset on sex ratio per selected age group
#(semicolon-separated file; the first 16 lines are skipped)
sexratio_file = os.path.join(path_input, 'WPP2022_POP_F04_SEX_RATIO_SELECT_AGE_GROUPS.csv')
df_sexratio = pd.read_csv(sexratio_file, sep=';', skiprows=16)

In [None]:
#retrieve data on sex ratios for age groups 25-64 and 65+ for the countries of
#interest and the years 2004 to 2021 (both bounds included)
sexratio_region_col = 'Region, subregion, country or area *'
sexratio_in_area = df_sexratio[sexratio_region_col].isin(countries_long)
sexratio_in_period = df_sexratio["Year"].between(2004, 2021)
df_sexratio_ss = df_sexratio.loc[
    sexratio_in_area & sexratio_in_period,
    [sexratio_region_col, "Year", "25-64", "65+"],
]
#use the column names under which the variables are stored
df_sexratio_ss = df_sexratio_ss.rename(
    columns={
        sexratio_region_col: "Country",
        "25-64": "sex_ratio_25-64",
        "65+": "sex_ratio_65+",
    }
)

In [None]:
#add the sex ratio columns to the dataframe with yearly variables,
#matching rows on Country and Year (rows and index of df_yearlyvars are kept)
sexratio_by_key = df_sexratio_ss.set_index(["Country", "Year"])
df_yearlyvars = df_yearlyvars.join(sexratio_by_key, on=["Country", "Year"])

## male_pct_65+

In [None]:
#open WPP dataset on population percentage per age group
#(semicolon-separated file; the first 16 lines are skipped)
malepct_file = os.path.join(path_input, 'WPP2022_POP_F06_2_POPULATION_PERCENTAGE_SELECT_AGE_GROUPS_MALE.csv')
df_malepct = pd.read_csv(malepct_file, sep=';', skiprows=16)

In [None]:
#retrieve data on percentage of male population which is 65+ for the countries
#of interest and the years 2004 to 2021 (both bounds included)
malepct_region_col = 'Region, subregion, country or area *'
malepct_in_area = df_malepct[malepct_region_col].isin(countries_long)
malepct_in_period = df_malepct["Year"].between(2004, 2021)
df_malepct_ss = df_malepct.loc[
    malepct_in_area & malepct_in_period,
    [malepct_region_col, "Year", "65+"],
]
#use the column names under which the variable is stored
df_malepct_ss = df_malepct_ss.rename(
    columns={malepct_region_col: "Country", "65+": "male_pct_65+"}
)

In [None]:
#add the male 65+ percentage column to the dataframe with yearly variables,
#matching rows on Country and Year (rows and index of df_yearlyvars are kept)
malepct_by_key = df_malepct_ss.set_index(["Country", "Year"])
df_yearlyvars = df_yearlyvars.join(malepct_by_key, on=["Country", "Year"])

In [None]:
#write the dataframe with yearly variables (including its index) to the output directory
output_file = os.path.join(path_output, "yearly_pred_vars.csv")
df_yearlyvars.to_csv(output_file)