In [1]:
# Python 3 notebook

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition
import matplotlib.pyplot as plt
from matplotlib import cm
import pandas as pd
from sklearn.manifold import TSNE
%matplotlib inline

In [2]:
# ############ initialize Sets for each region (as defined by USAID / DHS) to help with data cleaning ############
subsaharan_africa_set = {"AO", "BJ", "BU", "BF", "CG", "CD", "CM", "CF", "CI", "ET", "GA", "GH", "GM", "GN", "GY", "KE",
                         "KM", "LS", "LB", "MD", "MW", "ML", "MR", "MZ", "MA", "NM", "NI", "NG", "RW", "SN", "ZA", "SL",
                         "SZ", "TD", "TG", "TZ", "UG", "ZM", "ZW"}
north_africa_west_asia_europe_set = {"AL", "MB", "AZ", "AM", "EG", "JO", "YE"}
south_se_asia_set = {"AF", "BD", "IA", "ID", "KH", "LA", "NP", "PK", "TL", "MM", "PH"}
central_asia_set = {"KY", "TJ"}
latin_carib_set = {"CO", "HT", "HN", "PE", "DR", "BO"}

region_set = {"Sub-Saharan Africa", "North Africa/West Asia/Europe", "Central Asia", "South & Southeast Asia",
              "Latin America & Caribbean"}

In [3]:
# ############ pre-process cleaned_merged_data.csv (feature data) ############
# ############ and hi.csv (outcome data) #####################################
df = pd.read_csv('data/top20features_data.csv')
hi = pd.read_csv('data/hi.csv')

# parse DHSID to get country code for each entry ---------------------------
df['Country_Code'] = df['DHSID'].str.replace('\d+', '')
hi['Country_Code'] = hi['DHSID'].str.replace('\d+', '')

# create dictionaries of country_codes and region names to be used for data cleaning ---------------------------
country_code_set1 = set(df.Country_Code)
country_code_set2 = set(hi.Country_Code)
country_code_set = country_code_set1.union(country_code_set2)
country_code_dict = dict((j, i) for i, j in enumerate(country_code_set))
region_dict = dict((j, i) for i, j in enumerate(region_set))

# replace DHS country codes with BD for Bangladesh (fixing error in collected data) ---------------------------
df.loc[df["Country_Code"] == "DHS", "Country_Code"] = "BD"
hi.loc[hi["Country_Code"] == "DHS", "Country_Code"] = "BD"

# replace 'R' and 'U' with 0 and 1 ---------------------------
df.loc[df["URBAN_RURA_y"] == "R", "URBAN_RURA_y"] = 0
df.loc[df["URBAN_RURA_y"] == "U", "URBAN_RURA_y"] = 1

# Add country and region codes to dataset ---------------------------
df['target_region'] = ""
df['target_country'] = ""
hi['target_region'] = ""
hi['target_country'] = ""

# fill in feature data (df) -------------------------------------------
for i in np.arange(df.shape[0]):
    if df.loc[i, 'Country_Code'] in subsaharan_africa_set:
        df.loc[i, 'target_region'] = region_dict["Sub-Saharan Africa"]
    elif df.loc[i, 'Country_Code'] in north_africa_west_asia_europe_set:
        df.loc[i, 'target_region'] = region_dict["North Africa/West Asia/Europe"]
    elif df.loc[i, 'Country_Code'] in central_asia_set:
        df.loc[i, 'target_region'] = region_dict["Central Asia"]
    elif df.loc[i, 'Country_Code'] in south_se_asia_set:
        df.loc[i, 'target_region'] = region_dict["South & Southeast Asia"]
    elif df.loc[i, 'Country_Code'] in latin_carib_set:
        df.loc[i, 'target_region'] = region_dict["Latin America & Caribbean"]
    else:
        df.loc[i, 'target_region'] = None
        
for i in np.arange(df.shape[0]):
    df.loc[i, 'target_country'] = country_code_dict[df.loc[i, 'Country_Code']]
        
# fill in outcome data (hi) -------------------------------------------      
for i in np.arange(hi.shape[0]):
    if hi.loc[i, 'Country_Code'] in subsaharan_africa_set:
        hi.loc[i, 'target_region'] = region_dict["Sub-Saharan Africa"]
    elif hi.loc[i, 'Country_Code'] in north_africa_west_asia_europe_set:
        hi.loc[i, 'target_region'] = region_dict["North Africa/West Asia/Europe"]
    elif hi.loc[i, 'Country_Code'] in central_asia_set:
        hi.loc[i, 'target_region'] = region_dict["Central Asia"]
    elif hi.loc[i, 'Country_Code'] in south_se_asia_set:
        hi.loc[i, 'target_region'] = region_dict["South & Southeast Asia"]
    elif hi.loc[i, 'Country_Code'] in latin_carib_set:
        hi.loc[i, 'target_region'] = region_dict["Latin America & Caribbean"]
    else:
        hi.loc[i, 'target_region'] = None

for i in np.arange(hi.shape[0]):
    hi.loc[i, 'target_country'] = country_code_dict[hi.loc[i, 'Country_Code']]


  df['Country_Code'] = df['DHSID'].str.replace('\d+', '')
  hi['Country_Code'] = hi['DHSID'].str.replace('\d+', '')


In [4]:
# save cleaned dataset -------------------------------------------
df.to_csv('data/cleaned_top20features_data.csv')
hi.to_csv('data/cleaned_hi.csv')