### Establish GeoJSON Data for Use with Leaflet

- Query SQL database for all data measured with Age as dimension (for children under 5 years)
- Retrieve coordinates of each country in data for use in GeoJSON
- Create DataFrames for each Anthropometric Indicator
    - Overweight
    - Underweight
    - Stunting
    - Wasting
    - Wasting (Severe)
- Convert Pandas DataFrames to GeoJSON data using GeoPandas library
- Write GeoJSON data to file and embed in ./static/js/define_raw_geojson_data.js

In [1]:
## Import relevant modules. GeoPandas is used to convert DataFrame data into 
## GeoJSON data for use in web visualization
import pandas as pd
import numpy as np
import sqlite3
import requests
import pprint

import sys

import geopandas as gpd
from shapely.geometry import Point
import pprint
import geojson

#### Retrieve Geographic Coordinates for Unique Countries

In [2]:
## Define a function which takes a 3-letter ISO country code and returns its geographic
## coordinates by retrieving them from an API endpoint
def get_country_coordinates(iso_code, timeout=10):
    """Look up the coordinates of a country by its ISO code.

    Args:
        iso_code: 3-letter ISO country code, interpolated into the API URL.
        timeout: Seconds to wait for the API before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The ``latlng`` list of the first entry in the API response
        ([latitude, longitude]), or None when the request fails, the status
        code is not 200, or the payload lacks the expected structure.
    """
    # Establish API endpoint for coordinate retrieval based on country ISO code
    url = f"https://restcountries.com/v3.1/alpha/{iso_code}"

    # Send request to formatted API URL to GET coordinates based on ISO code
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        ## Network failures follow the same "None on error" contract as bad statuses
        return None

    if response.status_code != 200:
        ## Return None if there's an erroneous response
        return None

    try:
        ## latlng is a list which contains [latitude, longitude]
        return response.json()[0]['latlng']
    except (ValueError, LookupError, TypeError):
        ## Body was not JSON, or did not have the [ {"latlng": ...} ] shape
        return None

In [3]:
## Debugging switches, kept together so repeat runs can skip certain code snippets

## When True, print a status line for every country code as its coordinates are added
verbose_retrieval_status = False
## When True, save freshly retrieved coordinates to ./data/coords.csv
write_coords_to_file = False
## When True, read coordinates from ./data/coords.csv instead of retrieving them
skip_coords_retrieval = True

In [4]:
## Load coordinates from existing CSV depending on skip flag,
## otherwise call get_country_coordinates for each country code
if skip_coords_retrieval:
    coords_df = pd.read_csv("./data/coords.csv")
else:
    ## If retrieving coordinates, first query clean_data table
    ## to get unique countries/codes
    conn = sqlite3.connect("./data/malnutrition_data.db")
    query = "SELECT * FROM clean_data"

    ## Release the database handle as soon as the table is in memory
    try:
        data_df = pd.read_sql_query(query, conn)
    finally:
        conn.close()

    ## Select all data where the measured dimension is Age (months)
    filtered_data_df = data_df.query("Dimension == 'Age (months)'")

    unique_countries = filtered_data_df["Country"].unique()
    unique_codes = filtered_data_df["Country ISO-3 Code"].unique()

    ## Remove Chile (CHL) from the list because it's missing an anthropometric indicator.
    ## Its position is looked up by code rather than hard-coded, so the removal keeps
    ## working if the row order of the table changes (nothing is removed if CHL is absent).
    ## NOTE(review): like the original, this assumes the two arrays are position-aligned.
    chile_positions = np.flatnonzero(np.asarray(unique_codes) == "CHL")
    unique_countries = np.delete(unique_countries, chile_positions)
    unique_codes = np.delete(unique_codes, chile_positions)

    lats = []
    longs = []
    ## Populate lats and longs lists with coordinates from get_country_coordinates()
    ## to subsequently place into DataFrame
    if not verbose_retrieval_status:
        print("populating coordinates lists using get_country_coordinates()...")
    for code in unique_codes:
        temp_coords = get_country_coordinates(code)
        if temp_coords is None:
            ## Fail with the offending code instead of an opaque
            ## "'NoneType' object is not subscriptable" error
            raise RuntimeError(f"could not retrieve coordinates for {code}")
        lats.append(temp_coords[0])
        longs.append(temp_coords[1])
        if verbose_retrieval_status:
            print(f"added coords for {code}...")

    ## Assemble coords_df for later use when creating GeoJSON
    coords_df = pd.DataFrame({
        'country': unique_countries,
        'code': unique_codes,
        'lat': lats,
        'lon': longs,
    })

    pprint.pp(coords_df)
    ## Write coordinates to file if flag is set
    if write_coords_to_file:
        coords_df.to_csv("./data/coords.csv", index=False)

#### Query Database and Filter Relevant Data

In [5]:
## Establish connection with SQL database and query all data from clean_data table
conn = sqlite3.connect("./data/malnutrition_data.db")
query = "SELECT * FROM clean_data"

## Close the connection once the table has been read (or the read has failed)
## so the database file handle is not leaked
try:
    data_df = pd.read_sql_query(query, conn)
finally:
    conn.close()

## Keep only the rows where the measured dimension is Age (months)
filtered_data_df = data_df.query("Dimension == 'Age (months)'")

In [6]:
## Retrieve unique country and country code values
unique_countries = filtered_data_df["Country"].unique()
unique_codes = filtered_data_df["Country ISO-3 Code"].unique()

## Remove Chile (CHL) from list due to missing data.
## The position is found by code instead of a hard-coded index, so the removal
## stays correct if the row order changes (nothing is removed if CHL is absent).
## NOTE(review): like the original, this assumes the two arrays are position-aligned.
chile_positions = np.flatnonzero(np.asarray(unique_codes) == "CHL")
unique_countries = np.delete(unique_countries, chile_positions)
unique_codes = np.delete(unique_codes, chile_positions)

## Define empty list of DataFrames which will contain anthropometric indicators for each country
## List will be populated with DataFrames, one for each unique country
per_country_dfs = []

#### Enumerate Anthropometric Indicators for each Country (averaged across years)

In [7]:
## Define empty dictionary mapping country code -> DataFrame of averaged indicators
country_year_indicators_lookup_dict = {}

## For each country: grab its rows once, append them to per_country_dfs[], and
## populate country_year_indicators_lookup_dict{} with a DataFrame containing the
## mean "Prevalence Estimate %" per "Anthropometric Indicator" over all of its rows
##
## (Original author's note: weights are maintained because researchers had
## 1 measurement per indicator, per year)
for country_code in unique_codes:
    current_country_df = filtered_data_df.query("`Country ISO-3 Code` == @country_code")
    per_country_dfs.append(current_country_df)

    ## A code that matches no rows gets no lookup entry
    if current_country_df.empty:
        continue

    ## The average does not depend on the year, so it is computed once per country
    ## rather than once per year
    country_year_indicators_lookup_dict[country_code] = (
        current_country_df
        .groupby("Anthropometric Indicator")["Prevalence Estimate %"]
        .mean()
        .reset_index()
    )

In [8]:
## The 5 anthropometric indicators that become columns of prevalence_df
indicators = ["Overweight", "Stunting", "Underweight", "Wasting", "Wasting Severe"]

## One (initially empty) list of prevalence values per indicator.
## This per-indicator bookkeeping is what made it possible to find out that
## CHL (Chile) was missing an indicator
prevalence_lists_by_indicator = {name: [] for name in indicators}

## prevalence_df starts with only the country codes; indicator columns are added later
prevalence_df = pd.DataFrame()
prevalence_df["code"] = unique_codes

In [9]:
### DEBUGGING BLOCK

## Walk every country's averaged-indicator table and print the code of any country
## that does not have exactly one row for an indicator; values that are found are
## collected in prevalence_lists_by_indicator{}
for country_code, averages_df in country_year_indicators_lookup_dict.items():
    indicator_column = averages_df["Anthropometric Indicator"]
    for indicator in indicators:
        matches = averages_df.loc[indicator_column == indicator, "Prevalence Estimate %"].to_list()
        if len(matches) != 1:
            print(country_code)
            continue
        prevalence_lists_by_indicator[indicator].append(matches[0])

### END DEBUGGING BLOCK

#### Create Lists of Prevalence Values and Populate DataFrame

In [10]:
## Give every indicator a fresh empty list in prevalence_lists_by_indicator{}
prevalence_lists_by_indicator.update({name: [] for name in indicators})

## Append each country's average prevalence per indicator, visiting the countries
## in the order they are stored in country_year_indicators_lookup_dict{}
for averages_df in country_year_indicators_lookup_dict.values():
    for indicator in indicators:
        is_indicator = averages_df["Anthropometric Indicator"] == indicator
        matching = averages_df.loc[is_indicator, "Prevalence Estimate %"]
        ## Raises IndexError if the country has no row for this indicator
        prevalence_lists_by_indicator[indicator].append(matching.to_list()[0])

In [11]:
## Add one column per indicator to prevalence_df, so every row ends up holding
## a country code followed by its 5 indicator averages
for column_name in indicators:
    prevalence_df[column_name] = prevalence_lists_by_indicator[column_name]

##### Print coordinate and prevalence DataFrames for reference

In [13]:
## Sanity check: pretty-print the coordinate DataFrame, then the prevalence DataFrame
for frame in (coords_df, prevalence_df):
    pprint.pp(frame)

                            country code        lat         lon
0                       Afghanistan  AFG  33.000000   65.000000
1                           Albania  ALB  41.000000   20.000000
2                           Algeria  DZA  28.000000    3.000000
3                            Angola  AGO -12.500000   18.500000
4                         Argentina  ARG -34.000000  -64.000000
..                              ...  ...        ...         ...
146                        Viet Nam  VNM  16.166667  107.833333
147                           Yemen  YEM  15.000000   48.000000
148                          Zambia  ZMB -15.000000   30.000000
149                        Zimbabwe  ZWE -20.000000   30.000000
150  occupied Palestinian territory  PSE  31.900000   35.200000

[151 rows x 4 columns]
    code  Overweight   Stunting  Underweight    Wasting  Wasting Severe
0    AFG    4.984954  42.385391    22.937286   7.276614        2.770802
1    ALB   21.083658  21.440019     5.303072   7.021891        3

#### Create Pandas DataFrame for each Anthropometric Indicator

In [14]:
## Create a DataFrame for each layer, containing country, code, coordinates, and prevalence
## for each indicator
def _build_layer_df(indicator):
    """Return a copy of coords_df with a "prevalence" column for *indicator*.

    Values are matched on the "code" column rather than on row position, so
    each country gets its own value even if coords_df (e.g. loaded from CSV)
    and prevalence_df are ordered differently. Codes in coords_df that have
    no match in prevalence_df receive NaN.
    """
    indicator_values = prevalence_df[["code", indicator]].rename(
        columns={indicator: "prevalence"}
    )
    return coords_df.merge(indicator_values, on="code", how="left")


overweight_layer_df = _build_layer_df("Overweight")
print("\noverweight layer DataFrame:")
pprint.pp(overweight_layer_df)

stunting_layer_df = _build_layer_df("Stunting")
print("\nstunting layer DataFrame:")
pprint.pp(stunting_layer_df)

underweight_layer_df = _build_layer_df("Underweight")
print("\nunderweight layer DataFrame:")
pprint.pp(underweight_layer_df)

wasting_layer_df = _build_layer_df("Wasting")
print("\nwasting layer DataFrame:")
pprint.pp(wasting_layer_df)

wasting_severe_layer_df = _build_layer_df("Wasting Severe")
print("\nwasting severe layer DataFrame:")
pprint.pp(wasting_severe_layer_df)



overweight layer DataFrame:
                            country code        lat         lon  prevalence
0                       Afghanistan  AFG  33.000000   65.000000    4.984954
1                           Albania  ALB  41.000000   20.000000   21.083658
2                           Algeria  DZA  28.000000    3.000000   13.257227
3                            Angola  AGO -12.500000   18.500000    2.827835
4                         Argentina  ARG -34.000000  -64.000000   10.927755
..                              ...  ...        ...         ...         ...
146                        Viet Nam  VNM  16.166667  107.833333    3.383203
147                           Yemen  YEM  15.000000   48.000000    4.246249
148                          Zambia  ZMB -15.000000   30.000000    7.511327
149                        Zimbabwe  ZWE -20.000000   30.000000    6.979456
150  occupied Palestinian territory  PSE  31.900000   35.200000    7.626907

[151 rows x 5 columns]

stunting layer DataFrame:
        

#### Convert DataFrames to GeoJSON data using GeoPandas library

In [15]:
## Build one shapely Point per row from the lon/lat columns of overweight_layer_df,
## for use as the geometry in GeoPandas
##
## (all 5 layer DataFrames start from the same coords_df, so this one list of
## points is reused for all 5 indicators)
longitudes = overweight_layer_df['lon']
latitudes = overweight_layer_df['lat']
geom = [Point((lon, lat)) for lon, lat in zip(longitudes, latitudes)]

In [16]:
## Pair each anthropometric indicator's layer DataFrame (i.e., layer in Leaflet)
## with the GeoJSON file it is written to in the data/ directory
layer_exports = [
    (overweight_layer_df, "./data/overweight_gdf.geojson"),
    (stunting_layer_df, "./data/stunting_gdf.geojson"),
    (underweight_layer_df, "./data/underweight_gdf.geojson"),
    (wasting_layer_df, "./data/wasting_gdf.geojson"),
    (wasting_severe_layer_df, "./data/wasting_severe_gdf.geojson"),
]

## Using GeoPandas, turn each layer into a GeoDataFrame with the shared point
## geometry and write it out as a GeoJSON-formatted file right away
exported_gdfs = []
for layer_df, output_path in layer_exports:
    layer_gdf = gpd.GeoDataFrame(layer_df, geometry=geom)
    layer_gdf.to_file(output_path, driver='GeoJSON')
    exported_gdfs.append(layer_gdf)

## Expose each GeoDataFrame under its own name
(
    overweight_gdf,
    stunting_gdf,
    underweight_gdf,
    wasting_gdf,
    wasting_severe_gdf,
) = exported_gdfs