In [None]:

import pathlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import weightedstats as ws


# consider using this for weighted stats: http://www.ccgalberta.com/pygeostat/welcome.html



In [None]:
def weighted_mean(var, wts):
    """Calculates the weighted mean"""
    return np.average(var, weights=wts)

def weighted_median(df, val, weight):
    """Calculates the weighted median
    ArithmeticError
    If the sum of the weights is zero, or if the weights are not positive.
    """
    df_sorted = df.sort_values(val)
    cumsum = df_sorted[weight].cumsum()
    cutoff = df_sorted[weight].sum() / 2.
    return df_sorted[cumsum >= cutoff][val].iloc[0]

def weighted_variance(var, wts):
    """Calculates the weighted variance"""
    return np.average((var - weighted_mean(var, wts))**2, weights=wts)


def weighted_skew(var, wts):
    """Calculates the weighted skewness"""
    return (np.average((var - weighted_mean(var, wts))**3, weights=wts) /
            weighted_variance(var, wts)**(1.5))

def weighted_kurtosis(var, wts):
    """Calculates the weighted skewness"""
    return (np.average((var - weighted_mean(var, wts))**4, weights=wts) /
            weighted_variance(var, wts)**(2))

In [None]:

def load_data(data_file: str) -> pd.DataFrame:
    print 
    '''
    Load data from /data directory
    '''
    PATH = pathlib.Path().resolve()
    DATA_PATH = PATH.joinpath("../data").resolve()
    return pd.read_csv(DATA_PATH.joinpath(data_file))



In [None]:


df_zones = load_data("GIS/points_5min_15min_dtw_csv.csv")
# import df_input and set index as alpha 3 
df_input = load_data("country_data_with_cycling_distance.csv")
# set index to alpha 3
df_input.set_index("alpha3", inplace=True)

## set maximum distances reachable.... change this soon because we will reuse the bike etc...

df_zones["max distance walking"] = 3.0 # temp, set max distance reachbale for all countriesl, will change to be bespoke soon
df_input["max distance cycling"] = 8 * df_input["trip_velocity_mean"] # temp, set max distance reachbale for all countriesl, will change to be bespoke soon

In [None]:

# this analysis loses some data as the overlap between the rasters is not perfect. To reduce this error, use the 30 arc second data. Too much heavy lifting for my computer to do this at the moment.

#merge df_input and df_zones on ISO_CC (so we can use the UN population data)
df_zones = df_zones.merge(df_input, left_on="ISOCODE", right_on="alpha3")
df_zones["dtw_1"] = df_zones["dtw_1"] / 1000 # turn to kms

#adjust population to account for 9 values per raster point (2.5 to 5 arc min resoltuions. 9 values per point)
df_zones["AdjPopFloat"] = df_zones["pop_count_15_1"] / 9 

# # convert population density to percent of national population on a per country basis, grouped by ISO_CC
df_zones["pop_density_perc"] = df_zones.groupby("ISOCODE")["AdjPopFloat"].apply(lambda x: x / x.sum())

# multiply population density by population on a per country basis
df_zones["pop_zone"] = df_zones["pop_density_perc"] * df_zones["Population"]

# drop rows where pop_zone is close to zero
# min_pop = 100
# df_zones = df_zones[df_zones["pop_zone"] > min_pop]


In [None]:
# sum the population in each zone
df_zones["country_pop_raw"] = df_zones.groupby("ISOCODE")["pop_zone"].transform("sum")
df_zones["country_pop_ratio"] = df_zones.groupby("ISOCODE")["AdjPopFloat"].transform("sum")


In [None]:
# create new summary dataframe with country_pop_raw, country_pop_ratio, and population
df_summary = df_zones[["ISOCODE", "country_pop_raw", "country_pop_ratio", "Population"]].drop_duplicates()

df_summary.head()

The section below calculates the population per zone that can reach water

In [18]:
# population with piped water
df_zones["zone_pop_piped"] = df_zones["pop_zone"]*df_zones["Nat Piped"]/100
df_zones["zone_pop_unpiped"] = df_zones["pop_zone"]*df_zones["Nat NonPiped"]/100

# is it possible to reach water with walking/cycling
df_zones["zone_cycling_okay"] = (df_zones["dtw_1"] < df_zones["max distance cycling"])*1 # multiply by 1 to force to binary not true/false
df_zones["zone_walking_okay"] = (df_zones["dtw_1"] < df_zones["max distance walking"])*1

# how many people can collect water in the zone
df_zones["fraction_of_zone_with_cycling_access"] = df_zones["zone_cycling_okay"]* (df_zones["PBO"])/100
df_zones["fraction_of_zone_with_walking_access"] = df_zones["zone_walking_okay"] * 1

#
df_zones["population_piped_with_cycling_access"] = df_zones["fraction_of_zone_with_cycling_access"] * df_zones["zone_pop_piped"]
df_zones["population_piped_with_walking_access"] = df_zones["fraction_of_zone_with_walking_access"] * df_zones["zone_pop_piped"]

# select the maximum between the two
df_zones["population_piped_with_access"] = df_zones[["population_piped_with_cycling_access", "population_piped_with_walking_access"]].max(axis=1)

# zone pop without water
df_zones["zone_pop_with_water"] =  df_zones["population_piped_with_access"] + df_zones["zone_pop_unpiped"]
df_zones["zone_pop_without_water"] = df_zones["pop_zone"] - df_zones["zone_pop_with_water"]


## NICE! this gives the global population that needs to move.
# next steps, re-import that data from QGIS with populations more accurate. DONE
# then assign the achievable distnace cycling on a per country basis (using the mobility model, and the different inputs, road quality.)

# import fitness data.
# come up with bounds for the vairables between countries







In [22]:
df_zones.columns

Index(['fid', 'id', 'left', 'top', 'right', 'bottom', 'Value', 'ISOCODE',
       'UNSDCODE', 'NAME0', 'CIESINCODE', 'DATATYPE', 'DATACODE', 'DATAYEAR',
       'DATALEVEL', 'SEXLEVEL', 'AGELEVEL', 'GRSTART', 'GREND', 'GRLEVEL',
       'LASTCENSUS', 'MEANUNITKM', 'dtw_1', 'pop_count_15_1', 'AdjPop',
       'AdjPopFloat', 'max distance walking', 'Unnamed: 0', 'Entity',
       'Population', 'YearPBO', 'PBO', 'Terrain Ruggedness', 'Urban %',
       'Urban Agg %', 'RoadQuality', 'Km', 'Urb %',
       'Average household size (number of members)', 'National At Least Basic',
       'National Limit (more than 30 mins)', 'National Unimproved',
       'National Surface Water', 'Nat Accesible On Premises', 'Nat Piped',
       'Nat NonPiped', 'No. HPs', 'Year of HP estimate', '2015 (C)',
       'No. HPs in 2015', 'Non-functional HPs', 'Functional HPS', 'Risk Score',
       'alpha2', 'region', 'subregion', 'borders', 'Mean BMI (male)',
       'Mean BMI (female)', 'Mean male height (cm)', 'Mean female

In [27]:
# use groupby to create dataframe of country level data from df_zones
df_country = df_zones.groupby("ISOCODE").agg({
    "country_pop_raw": "mean",
    "zone_pop_with_water":"sum",
    "population_piped_with_access":"sum"  
    "zone_cycling_okay"
}).reset_index()


In [29]:
df_country

Unnamed: 0,ISOCODE,country_pop_raw,zone_pop_with_water
0,ABW,5.359750e+05,4.287800e+03
1,AFG,3.537890e+11,2.495993e+07
2,AGO,4.363482e+11,1.403533e+07
3,AIA,4.537500e+04,1.591726e+03
4,ALB,1.310017e+09,2.102785e+06
...,...,...,...
225,WSM,9.606912e+06,7.662563e+04
226,YEM,1.197774e+11,1.133909e+07
227,ZAF,5.873590e+11,3.123438e+07
228,ZMB,1.689307e+11,1.142746e+07


In [15]:

# calculate global totals
pop_with_water = df_zones["zone_pop_with_water"].sum()
pop_without_water = df_zones["zone_pop_without_water"].sum()
# and percentage
pop_with_water_perc = pop_with_water / (pop_with_water + pop_without_water)


## fix the below code to work 
print(f"Population with water: {pop_with_water}")
print(f"Population without water: {pop_without_water}")
print(f"Percentage of population with water: {pop_with_water_perc}")





Population with water: 5371208749.869296
Population without water: 2485956758.42751
Percentage of population with water: 0.6836064155957445


In [None]:

#create new dataframe with the id, pop_zone, ISO_CC, and distance_to_water columns
df2 = df_zones[[ "ISO_CC", "pop_zone", "distance_to_water1"]]
# drop all rows with nans
df2 = df2.dropna()



# initialize empty list
d = []
# loop through each country
for i in df2["ISO_CC"].unique():
    df_country = df2[df2["ISO_CC"] == i]
    # if rows in dataframe are more than 1, calculate weighted median
    if len(df_country) > 1:
        median = weighted_median(df_country, "distance_to_water1", "pop_zone")
        # median_ws = ws.weighted_median(df_country['distance_to_water1'], weights=df_country['pop_zone'])
        df_country_np = df_country.to_numpy()
        median_ws = ws.numpy_weighted_median(df_country_np[:, 2], weights=df_country_np[:, 1])

    else:
        median = df_country["distance_to_water1"].values[0]
        median_ws = median

    d.append(
        {
            "ISO_CC": i,
            "median": median,
            "median ws": median_ws
        }
    )

df_out = pd.DataFrame(d)
df_out = df_out.merge(df_input, left_on="ISO_CC", right_on="alpha3")

# remove any rows where the median is more than 1km more than "LBY" (Libya)'s median
max_distance = df_out.loc[df_out["ISO_CC"] == "LBY", "median"].values[0] + 1
df_out = df_out[df_out["median"] < max_distance]

In [None]:

# create new dataframe with ISO_CC, Entity, median, median ws, and Km
df_compare = df_out[["ISO_CC", "Entity", "median", "median ws", "Km"]]

#create bar plot of median distance, sorted by median distance
df_compare = df_compare.sort_values("median")
fig = px.bar(df_compare, x="Entity", y="median", color="ISO_CC", title="Median Distance to Water (km)")
fig.show()

In [None]:

# create plot of df_compare
fig = px.scatter(df_compare, x="median", y="Km", color="ISO_CC")

# add trendline

fig.show()



In [None]:




a= df2["distance_to_water1"] 
b= df2["pop_zone"]
binwidth = 1
plt.hist(a,bins=np.arange(0, 20 + binwidth, binwidth), weights=b)
# plt.hist(a,)
plt.show()

In [None]:
# set meidan value in df_input
#
# for i in df_input.index:
#     df2 = df2[df2["ISO_CC"] == i]


# df_input.loc[i]["median"]

# weighted_median(df2, "distance_to_water1", "pop_zone")