## Key Results

Takes in results of Monte Carlo simulations and generates key results used in the paper.

In [1]:
import pathlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import weightedstats as ws
from tabulate import tabulate

In [2]:
# Read country-level results from Monte Carlo sims
country_medians = pd.read_csv("../results/median_results.csv")
country_means = pd.read_csv("../results/mean_results.csv")
country_5th_percentiles = pd.read_csv("../results/5th_percentile_results.csv")
country_95th_percentiles = pd.read_csv("../results/95th_percentile_results.csv")

# Read district-level results from Monte Carlo sims
district_medians = pd.read_csv("../results/districts_median_results.csv")
district_means = pd.read_csv("../results/districts_mean_results.csv")
district_5th_percentiles = pd.read_csv("../results/districts_5th_percentile_results.csv")
district_95th_percentiles = pd.read_csv("../results/districts_95th_percentile_results.csv")


#### Data for Choropleths
Export country level and district level results for visualisations in Datawrapper

In [5]:
# Get ISOCODE and percent_without water and print without truncating
# pd.set_option('display.max_rows', None)
country_medians_simple = country_medians[["ISOCODE", "Entity", "percent_without_water"]]
country_medians_simple.to_csv("../results/country_medians_simple.csv", index=False)

venezuela_medians = district_medians[district_medians["Entity"] == "Venezuela"]
venezuela_medians_simple = venezuela_medians[["shapeName", "percent_without_water"]]
country_medians_simple.to_csv("../results/venezulea_means_simple.csv", index=False)

#### Key Median Results

In [6]:
# Make a single table with the 10 countries with the highest and lowest percentage of people without access to water
top10 = country_medians_simple.nlargest(10, "percent_without_water")
bottom10 = country_medians_simple.nsmallest(10, "percent_without_water")
top_bottom = pd.concat([top10, bottom10])
top_bottom.to_csv("../results/top_bottom_10_countries.csv", index=False)

In [7]:
# Calculate global population without water
global_median_percentage_without_water = country_medians["country_pop_without_water"].sum() / country_medians["country_pop_raw"].sum()
global_median_percentage_with_water = country_medians["country_pop_with_water"].sum() / country_medians["country_pop_raw"].sum()
global_median_percentage_piped_with_water = country_medians["population_piped_with_access"].sum() / country_medians["country_pop_raw"].sum()

# Print values
print(f"Global median percentage without water: {global_median_percentage_without_water}")
print(f"Global median percentage with water: {global_median_percentage_with_water}")
print(f"Global median percentage piped with water: {global_median_percentage_piped_with_water}")



Global median percentage without water: 0.2913699818564235
Global median percentage with water: 0.7084980986429009
Global median percentage piped with water: 0.39858155156271463


#### Key Mean Results (use to look at cycling vs walking breakdown)

In [8]:
# TODO update via Codium to add 5th and 95th percentiles
global_mean_percentage_piped_with_access = country_means["population_piped_with_access"].sum() / country_means["country_pop_raw"].sum()
global_mean_piped_percentage_with_cycling_access = country_means["population_piped_with_cycling_access"].sum() / country_means["population_piped_with_access"].sum()
global_mean_piped_percentage_with_walking_access = country_means["population_piped_with_walking_access"].sum() / country_means["population_piped_with_access"].sum()
global_mean_piped_percentage_with_only_cycling_access = (country_means["population_piped_with_access"].sum() - country_means["population_piped_with_walking_access"].sum()) / country_medians["population_piped_with_access"].sum()


# Print values
print(f"Global mean percentage piped with access: {global_mean_percentage_piped_with_access}")
print(f"Global mean percentage piped with cycling access: {global_mean_piped_percentage_with_cycling_access}")
print(f"Global mean percentage piped with walking access: {global_mean_piped_percentage_with_walking_access}")
print(f"Global mean percentage piped with only cycling access: {global_mean_piped_percentage_with_only_cycling_access}")

Global mean percentage piped with access: 0.38998753515565754
Global mean percentage piped with cycling access: 0.4612488577644812
Global mean percentage piped with walking access: 0.8389380275938462
Global mean percentage piped with only cycling access: 0.1575892345737465


#### Most important countries for bicycles

In [10]:
# TODO Change to means so walking/cycling split makes sense
# Calculate top 10 countries by lowest value for global_mean_piped_percentage_with_walking_access
country_medians["percentage_piped_with_walking_access"] = country_medians["population_piped_with_walking_access"] / country_medians["population_piped_with_access"]
country_medians["percentage_piped_with_cycling_access"] = country_medians["population_piped_with_cycling_access"] / country_medians["population_piped_with_access"]
country_medians["percentage_piped_of_total_access"] = country_medians["population_piped_with_access"] / country_medians["country_pop_with_water"]
country_medians["population_piped_with_only_cycling_access"] = country_medians["population_piped_with_access"] - country_medians["population_piped_with_walking_access"]
country_medians["percentage_piped_with_only_cycling_access"] = country_medians["population_piped_with_only_cycling_access"] / country_medians["population_piped_with_access"]

top10_piped_with_walking = country_medians.nsmallest(50, "percentage_piped_with_walking_access")

# print values
# print(f"Top 10 countries by lowest value for global_mean_piped_percentage_with_walking_access: {top10_piped_with_walking[['Entity', 'percentage_piped_with_walking_access', 'percentage_piped_with_cycling_access']]}")
# print(tabulate(top10_piped_with_walking[['Entity', 'percentage_piped_with_walking_access', 'percentage_piped_with_cycling_access', "percent_with_water", "population_piped_with_only_cycling_access"]], headers='keys', tablefmt='psql'))

# OR rank by cycling
bottom_piped_with_walking = country_medians.nlargest(30, "percentage_piped_with_only_cycling_access")
print(tabulate(bottom_piped_with_walking[['Entity', 'percentage_piped_with_only_cycling_access', "percentage_piped_of_total_access", "percent_with_water", "population_piped_with_only_cycling_access"]], headers='keys', tablefmt='psql'))
# print(tabulate(bottom_piped_with_walking, headers='keys', tablefmt='psql'))




+-----+----------------------+---------------------------------------------+------------------------------------+----------------------+---------------------------------------------+
|     | Entity               |   percentage_piped_with_only_cycling_access |   percentage_piped_of_total_access |   percent_with_water |   population_piped_with_only_cycling_access |
|-----+----------------------+---------------------------------------------+------------------------------------+----------------------+---------------------------------------------|
|   4 | United Arab Emirates |                                    1        |                        1           |            0.0313029 |                              3127.5         |
|  71 | Hong Kong            |                                    1        |                        0.243235    |           18.7134    |                            343786           |
|  93 | Kuwait               |                                    1        |         

In [11]:
# Piped water access by country
# create bubble chart from df_countries comprising: access to water, piped water, and using population as the size of the bubble
# create a new column for the size of the bubble

# create bubble size column as log 20 of country_pop_raw
country_medians["bubble_size"] = country_medians["country_pop_raw"] **0.5

px.scatter(
    country_medians, 
    y="percent_without_water", 
    x="Nat Piped", 
    size="bubble_size", 
    color="region", 
    hover_name="Entity", 
    # title="Access to Water vs. Piped Water vs. Population",
).update_layout(
    yaxis_title="Country population without access to water (%)",
    xaxis_title="Country piped water access (%)",
    legend_title="Region",
    width=1200,
    height=800,
    font=dict(size=25),
    # xaxis_tickangle=-45,
    # yaxis=dict(tickfont=dict(size=2)),
    # title=dict(font=dict(size=20)),
)

In [1]:
# now using scipy
from scipy.stats import spearmanr
correlation_scipy = spearmanr(country_medians["percent_without_water"], country_medians["Nat Piped"])
correlation_5th_scipy = spearmanr(country_5th_percentiles["percent_without_water"], country_5th_percentiles["Nat Piped"])
correlation_95th_scipy = spearmanr(country_95th_percentiles["percent_without_water"], country_95th_percentiles["Nat Piped"])
print(f"Spearman correlation between percent_without_water and Nat Piped (median) using scipy: {correlation_scipy}")
print(f"Spearman correlation between percent_without_water and Nat Piped (5th) using scipy: {correlation_5th_scipy}")
print(f"Spearman correlation between percent_without_water and Nat Piped (95th) using scipy: {correlation_95th_scipy}")

NameError: name 'country_medians' is not defined