In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.ops import unary_union
from shapely.geometry import Point, MultiPolygon, Polygon, LineString
import topojson as tp
import h3
import os
import json
pd.set_option('display.max_columns', None)

In [61]:
def merge_time_columns(time_df):
    time_df['row_col'] = time_df.apply(lambda row: [row['row'], row['col']], axis=1)
    time_df['x_y'] = time_df.apply(lambda row: [row['x'], row['y']], axis=1)
    time_df['x_ras_y_ras'] = time_df.apply(lambda row: [row['x_ras'], row['y_ras']], axis=1)
    time_df.drop(['row', 'col', 'x', 'y', 'x_ras', 'y_ras'], axis=1, inplace=True)
    return time_df

def join_points_poly(points_df, poly_df, join_rules):
    merged = poly_df
    joined_df = gpd.sjoin(points_df, poly_df)
    grouped = joined_df.groupby('index_right').agg(join_rules)
    merged = merged.merge(grouped, left_index=True, right_index=True)
    return merged

# create a dictionary that is the  names of the column names in hex8 and the values are an empty list
def create_style_dictionary(df):
    style_hex_dict = {}
    for col in df.columns: 
        style_hex_dict[col] = { 'fill-color': [ "'interpolate', ['linear'], ['get', '"+col+"'], dataMap['"+col+"'][3]['style_stops_99'][0], palettes[dataMap['"+col+"'][2]][0], dataMap['"+col+"'][3]['style_stops_99'][1], palettes[dataMap['"+col+"'][2]][1], dataMap['"+col+"'][3]['style_stops_99'][2], palettes[dataMap['"+col+"'][2]][2], dataMap['"+col+"'][3]['style_stops_99'][3], palettes[dataMap['"+col+"'][2]][3], dataMap['"+col+"'][3]['style_stops_99'][4], palettes[dataMap['"+col+"'][2]][4]"]}
    return style_hex_dict

In [62]:
# Read in destination files as geodataframes
path = './unsynced-data/rwanda/travel-time'

# list the files in the path directory and if they end in .parquet, add them to a list called files
files = [file for file in os.listdir(path) if file.endswith('.parquet')]

datasets = []
for file in files:
    df = pd.read_parquet(path + '/' + file)
    # Convert the DataFrame to a GeoDataFrame
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['x_ras'], df['y_ras']))
    gdf.crs = 'EPSG:4326'
    datasets.append(gdf)

# Now datasets list contains GeoDataFrames

In [63]:
# doing some basic stuff with each dataframe
# for range of number of lists in datasets print the datasets number

for d in range(len(datasets)):
    print ("starting " +files[d]) 
    datasets[d] = datasets[d].replace([np.inf, -np.inf], None)
    datasets[d] = datasets[d].replace([np.nan], None)
    datasets[d] = merge_time_columns(datasets[d])
    # appending destination type to travel time and time delta columns
    datasets[d] = datasets[d].rename(columns={col: col + files[d][14:-8] for col in datasets[d].columns if col.startswith('travel_time')})
    datasets[d] = datasets[d].rename(columns={col: col + files[d][14:-8] for col in datasets[d].columns if col.startswith('time_delta')})
    
    datasets[d]['row_col'] = datasets[d]['row_col'].astype(str)
    print ("finished with " +files[d]) 

starting travel_time_to_secondary_schools_fixed.parquet
finished with travel_time_to_secondary_schools_fixed.parquet
starting travel_time_to_semi_dense_urban_optimal.parquet
finished with travel_time_to_semi_dense_urban_optimal.parquet
starting travel_time_to_major_hospitals_optimal.parquet
finished with travel_time_to_major_hospitals_optimal.parquet
starting travel_time_to_health_posts_optimal.parquet
finished with travel_time_to_health_posts_optimal.parquet
starting travel_time_to_health_centers_optimal.parquet
finished with travel_time_to_health_centers_optimal.parquet
starting travel_time_to_all_education_facilities_fixed.parquet
finished with travel_time_to_all_education_facilities_fixed.parquet
starting travel_time_to_primary_schools_fixed.parquet
finished with travel_time_to_primary_schools_fixed.parquet
starting travel_time_to_all_health_facilities_optimal.parquet
finished with travel_time_to_all_health_facilities_optimal.parquet


In [64]:
# merging all dataframes into one
# each dataframe brings it's unique travel times and travel deltas
# I make the first df be the main one and then merge all of the rest into it.
travel_time_merged = datasets[0].set_index('row_col')
for dataset in datasets[1:]:
    # make a list of values from columns that we're going to merge into the dataframe
    dataset = dataset.set_index('row_col')
    merge_cols = []
    for c in dataset.columns:
        if c.startswith('travel_time') or c.startswith('time_delta'):
            merge_cols.append(c)
    # merge the travel_time_merged df with the datasets[d] df on the row_col column
    travel_time_merged = travel_time_merged.join(dataset[merge_cols], how='left')
    # replace NaN values in travel_time_merged with values from datasets[d] wherever they exist
    travel_time_merged = dataset[merge_cols].combine_first(travel_time_merged)
    # fill NA values with values from dataset
    travel_time_merged = travel_time_merged.fillna(dataset)

travel_time_merged = travel_time_merged.reset_index()
# convert travel_time_merged to geodataframe
travel_time_merged = gpd.GeoDataFrame(travel_time_merged, geometry='geometry')
travel_time_merged.crs = "EPSG:4326"

In [65]:
# drop x_ras_y_ras column
travel_time_merged = travel_time_merged.drop(['x_ras_y_ras'], axis=1)

In [66]:
travel_time_merged

Unnamed: 0,row_col,births,female_educational_attainment_mean,females_0_4,females_0_9,females_10_14,females_15_49,females_50_64,females_5_9,females_65_plus,geometry,male_educational_attainment_mean,males_0_4,males_0_9,males_10_14,males_15_49,males_50_64,males_5_9,males_65_plus,pop_0_4,pop_0_9,pop_10_14,pop_15_49,pop_50_64,pop_5_9,pop_65_plus,population,pregnancies,rwi,time_delta_constructed_sites_all_education_facilities_fixed,time_delta_constructed_sites_all_health_facilities_optimal,time_delta_constructed_sites_health_centers_optimal,time_delta_constructed_sites_health_posts_optimal,time_delta_constructed_sites_major_hospitals_optimal,time_delta_constructed_sites_primary_schools_fixed,time_delta_constructed_sites_secondary_schools_fixed,time_delta_constructed_sites_semi_dense_urban_optimal,time_delta_no_sites_all_education_facilities_fixed,time_delta_no_sites_all_health_facilities_optimal,time_delta_no_sites_health_centers_optimal,time_delta_no_sites_health_posts_optimal,time_delta_no_sites_major_hospitals_optimal,time_delta_no_sites_primary_schools_fixed,time_delta_no_sites_secondary_schools_fixed,time_delta_no_sites_semi_dense_urban_optimal,travel_time_all_education_facilities_fixed,travel_time_all_health_facilities_optimal,travel_time_constructed_sites_all_education_facilities_fixed,travel_time_constructed_sites_all_health_facilities_optimal,travel_time_constructed_sites_health_centers_optimal,travel_time_constructed_sites_health_posts_optimal,travel_time_constructed_sites_major_hospitals_optimal,travel_time_constructed_sites_primary_schools_fixed,travel_time_constructed_sites_secondary_schools_fixed,travel_time_constructed_sites_semi_dense_urban_optimal,travel_time_health_centers_optimal,travel_time_health_posts_optimal,travel_time_major_hospitals_optimal,travel_time_no_sites_all_education_facilities_fixed,travel_time_no_sites_all_health_facilities_optimal,travel_time_no_sites_health_centers_optimal,travel_time_no_sites_health_posts_optimal,travel_time_no_sites_major_hospitals_optimal,travel_time_no_sites_primary_schools_fixed,travel_time_no_sites_secondary_schools_fixed,travel_time_no_sites_semi_dense_urban_optimal,travel_time_primary_schools_fixed,travel_time_secondary_schools_fixed,travel_time_semi_dense_urban_optimal,underweight,x_y
0,"[10, 1897]",0.2158,4.724257,0.532922,1.102213,0.570825,1.990846,0.334852,0.569291,0.116536,POINT (30.43833 -1.05417),5.613448,0.523792,1.12397,0.606819,2.048758,0.421264,0.600178,0.120037,1.056714,2.226183,1.177644,4.039604,0.756116,1.169469,0.236573,8.436119,0.3056,-0.088528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,81.0,82.0,81.0,82.0,82.0,253.0,870.0,81.0,81.0,507.0,82.0,253.0,870.0,81.0,82.0,82.0,253.0,870.0,81.0,81.0,507.0,81.0,81.0,507.0,0.09251,"[30.4384884, -1.0540862]"
1,"[10, 1898]",0.203684,4.725142,0.503002,1.04033,0.538776,1.879072,0.316052,0.537329,0.109993,POINT (30.43917 -1.05417),5.613625,0.494384,1.060866,0.57275,1.933732,0.397613,0.566482,0.113297,0.997386,2.101196,1.111526,3.812804,0.713665,1.10381,0.22329,7.962481,0.288443,-0.085817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,79.0,80.0,79.0,80.0,80.0,251.0,868.0,79.0,79.0,505.0,80.0,251.0,868.0,79.0,80.0,80.0,251.0,868.0,79.0,79.0,505.0,79.0,79.0,505.0,0.092532,"[30.439166560383864, -1.0541664609655237]"
2,"[10, 1899]",0.373898,4.731713,0.923348,1.909709,0.989018,3.449366,0.580169,0.986361,0.201912,POINT (30.44000 -1.05417),5.61857,0.907529,1.947405,1.051382,3.549704,0.729888,1.039876,0.207977,1.830877,3.857114,2.040401,6.999069,1.310057,2.026237,0.409889,14.616529,0.529487,-0.084257,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,77.0,78.0,77.0,78.0,78.0,249.0,866.0,77.0,77.0,503.0,78.0,249.0,866.0,77.0,78.0,78.0,249.0,866.0,77.0,77.0,503.0,77.0,77.0,503.0,0.092703,"[30.439999893713928, -1.0541664609655237]"
3,"[10, 1900]",0.579211,4.735256,1.430372,2.958358,1.532103,5.343463,0.898748,1.527986,0.312784,POINT (30.44083 -1.05417),5.620417,1.405866,3.016753,1.628711,5.498899,1.13068,1.610887,0.32218,2.836238,5.975111,3.160814,10.84236,2.029428,3.138873,0.634964,22.642679,0.820236,-0.076404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74.0,75.0,74.0,75.0,75.0,246.0,863.0,74.0,74.0,500.0,75.0,246.0,863.0,74.0,75.0,75.0,246.0,863.0,74.0,74.0,500.0,74.0,74.0,500.0,0.092798,"[30.441127700000003, -1.0544563]"
4,"[10, 1901]",0.098616,4.745918,0.560873,1.160023,0.600764,2.095264,0.352415,0.599149,0.122648,POINT (30.44167 -1.05417),5.628995,0.551264,1.182921,0.638646,2.156212,0.443359,0.631657,0.126332,1.112138,2.342944,1.239409,4.251475,0.795774,1.230806,0.24898,8.878583,0.139653,-0.076883,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,72.0,73.0,72.0,73.0,73.0,244.0,861.0,72.0,72.0,498.0,73.0,244.0,861.0,72.0,73.0,73.0,244.0,861.0,72.0,72.0,498.0,72.0,72.0,498.0,0.093076,"[30.44174195001564, -1.0543765000610064]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1152839,"[999, 992]",0.178967,4.859198,0.377632,0.809466,0.395285,1.67821,0.242193,0.431834,0.112271,POINT (29.68417 -1.87833),5.03234,0.384516,0.817663,0.392323,1.613602,0.204346,0.433148,0.095817,0.762147,1.627129,0.787607,3.291812,0.44654,0.864982,0.208088,6.361176,0.25344,0.048897,0.0,0.0,0.0,0.0,68.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,68.0,0.0,0.0,0.0,13.0,80.0,13.0,80.0,114.0,80.0,448.0,13.0,65.0,245.0,114.0,80.0,380.0,13.0,80.0,114.0,80.0,448.0,13.0,65.0,245.0,13.0,65.0,245.0,0.081016,"[29.6840194, -1.8780588]"
1152840,"[999, 993]",,,,,,,,,,POINT (29.68500 -1.87833),,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,63.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0,0.0,0.0,0.0,15.0,77.0,15.0,77.0,116.0,77.0,445.0,15.0,67.0,247.0,116.0,77.0,382.0,15.0,77.0,116.0,77.0,445.0,15.0,67.0,247.0,15.0,67.0,247.0,,"[29.6846658, -1.8779288]"
1152841,"[999, 995]",0.187445,4.809318,0.39552,0.847811,0.41401,1.757708,0.253666,0.452291,0.117589,POINT (29.68667 -1.87833),4.926599,0.402731,0.856397,0.410907,1.690039,0.214026,0.453666,0.100356,0.798251,1.704208,0.824917,3.447747,0.467693,0.905957,0.217945,6.662509,0.265446,-0.101721,0.0,0.0,0.0,0.0,53.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.0,0.0,0.0,0.0,21.0,71.0,21.0,71.0,118.0,71.0,439.0,21.0,69.0,249.0,118.0,71.0,386.0,21.0,71.0,118.0,71.0,439.0,21.0,69.0,249.0,21.0,69.0,249.0,0.080347,"[29.68666656321098, -1.8783331242630903]"
1152842,"[999, 996]",0.183308,4.81043,0.386791,0.8291,0.404872,1.718915,0.248068,0.442308,0.114994,POINT (29.68750 -1.87833),4.928182,0.393842,0.837496,0.401838,1.652739,0.209303,0.443654,0.098141,0.780633,1.666595,0.80671,3.371655,0.457371,0.885962,0.213135,6.515466,0.259587,-0.108356,0.0,0.0,0.0,0.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,0.0,0.0,0.0,22.0,69.0,22.0,69.0,118.0,69.0,437.0,22.0,69.0,249.0,118.0,69.0,387.0,22.0,69.0,118.0,69.0,437.0,22.0,69.0,249.0,22.0,69.0,249.0,0.080333,"[29.6871115, -1.8780878]"


In [67]:
travel_time_merged_proof = travel_time_merged.copy()

In [68]:
# change travel_time_merged['x_y'] to a string separated by a _
travel_time_merged['x_y'] = travel_time_merged['x_y'].astype(str).str.replace('[', '').str.replace(']', '').str.replace(' ', '').str.replace(',', '_')
# change row_col to a string separated by a _
travel_time_merged['row_col'] = travel_time_merged['row_col'].astype(str).str.replace('[', '').str.replace(']', '').str.replace(' ', '').str.replace(',', '_')

In [69]:
# make rwi equal to rwi times the population value in it's row
travel_time_merged['rwi'] = travel_time_merged['rwi'] * travel_time_merged['population']
travel_time_merged['underweight'] = travel_time_merged['underweight'] * travel_time_merged['population']
# travel_time_merged['male_educational_attainment_mean'] equals the sum the value of male_educational_attainment_mean times the sum of 'males_15_49'+'males_50_64'+'males_65_plus'
travel_time_merged['male_educational_attainment_mean'] = travel_time_merged['male_educational_attainment_mean'] * (travel_time_merged['males_15_49']+travel_time_merged['males_50_64']+travel_time_merged['males_65_plus'])
travel_time_merged['female_educational_attainment_mean'] = travel_time_merged['female_educational_attainment_mean'] * (travel_time_merged['females_15_49']+travel_time_merged['females_50_64']+travel_time_merged['females_65_plus'])


In [70]:
def aggregate_lists(x):
    return '&'.join(x)
    
def male_educational_attainment_weighted_average(x):
    # Calculate the weighted average using np.average and weights based on sum of males
    total_males = travel_time_merged.loc[x.index, ['males_15_49', 'males_50_64', 'males_65_plus']].sum(axis=1)
    total_weight = total_males.sum()
    if total_weight == 0:
        # If the total weight is zero, return a default value (e.g., 0) or handle it as per your requirement.
        return 0
    non_zero_mask = (total_males > 0) & (~x.isnull()) # Mask to avoid dividing by zero
    if non_zero_mask.any():
        weighted_avg = np.average(x[non_zero_mask], weights=total_males[non_zero_mask])
        return weighted_avg
    else:
        weighted_avg = np.nan
        return weighted_avg
def female_educational_attainment_weighted_average(x):
    # Calculate the weighted average using np.average and weights based on sum of females
    total_females = travel_time_merged.loc[x.index, ['females_15_49', 'females_50_64', 'females_65_plus']].sum(axis=1)
    total_weight = total_females.sum()
    
    if total_weight == 0:
        # If the total weight is zero, return a default value (e.g., 0) or handle it as per your requirement.
        return 0
    
    non_zero_mask = (total_females > 0) & (~x.isnull())  # Mask to avoid dividing by zero
    if non_zero_mask.any():
        weighted_avg = np.average(x[non_zero_mask], weights=total_females[non_zero_mask])
        return weighted_avg
    else:
        weighted_avg = np.nan
        return weighted_avg

def weighted_average_function(values, weights):
    if weights.sum() == 0:
        return np.nan
    if weights.sum() == np.nan:
        return np.nan
    if weights.sum() == np.inf:
        return np.nan
    else:
        return np.average(values, weights=weights)


joining_rules = {
'row_col': aggregate_lists,
'x_y': aggregate_lists,
'female_educational_attainment_mean': 'sum', 
'females_0_4': 'sum', 
'females_0_9': 'sum', 
'females_10_14': 'sum', 
'females_15_49': 'sum', 
'females_50_64': 'sum', 
'females_5_9': 'sum', 
'females_65_plus': 'sum', 
'male_educational_attainment_mean': 'sum',
'males_0_4': 'sum', 
'males_0_9': 'sum', 
'males_10_14': 'sum', 
'males_15_49': 'sum', 
'males_50_64': 'sum', 
'males_5_9': 'sum', 
'males_65_plus': 'sum', 
'pop_0_4': 'sum', 
'pop_0_9': 'sum', 
'pop_10_14': 'sum', 
'pop_15_49': 'sum', 
'pop_50_64': 'sum', 
'pop_5_9': 'sum', 
'pop_65_plus': 'sum', 
'population': 'sum', 
'pregnancies': 'sum',
'births': 'sum', 
'rwi': 'sum',
'underweight': 'sum', 
'time_delta_constructed_sites_all_education_facilities_fixed': 'mean', 
'time_delta_constructed_sites_all_health_facilities_optimal': 'mean', 
'time_delta_constructed_sites_health_centers_optimal': 'mean', 
'time_delta_constructed_sites_health_posts_optimal': 'mean', 
'time_delta_constructed_sites_major_hospitals_optimal': 'mean', 
'time_delta_constructed_sites_primary_schools_fixed': 'mean', 
'time_delta_constructed_sites_secondary_schools_fixed': 'mean', 
'time_delta_constructed_sites_semi_dense_urban_optimal': 'mean', 
'time_delta_no_sites_all_education_facilities_fixed': 'mean', 
'time_delta_no_sites_all_health_facilities_optimal': 'mean', 
'time_delta_no_sites_health_centers_optimal': 'mean', 
'time_delta_no_sites_health_posts_optimal': 'mean', 
'time_delta_no_sites_major_hospitals_optimal': 'mean', 
'time_delta_no_sites_primary_schools_fixed': 'mean', 
'time_delta_no_sites_secondary_schools_fixed': 'mean', 
'time_delta_no_sites_semi_dense_urban_optimal': 'mean', 
'travel_time_all_education_facilities_fixed': 'mean', 
'travel_time_all_health_facilities_optimal': 'mean', 
'travel_time_constructed_sites_all_education_facilities_fixed': 'mean', 
'travel_time_constructed_sites_all_health_facilities_optimal': 'mean', 
'travel_time_constructed_sites_health_centers_optimal': 'mean', 
'travel_time_constructed_sites_health_posts_optimal': 'mean', 
'travel_time_constructed_sites_major_hospitals_optimal': 'mean', 
'travel_time_constructed_sites_primary_schools_fixed': 'mean', 
'travel_time_constructed_sites_secondary_schools_fixed': 'mean', 
'travel_time_constructed_sites_semi_dense_urban_optimal': 'mean', 
'travel_time_health_centers_optimal': 'mean', 
'travel_time_health_posts_optimal': 'mean', 
'travel_time_major_hospitals_optimal': 'mean', 
'travel_time_no_sites_all_education_facilities_fixed': 'mean', 
'travel_time_no_sites_all_health_facilities_optimal': 'mean', 
'travel_time_no_sites_health_centers_optimal': 'mean', 
'travel_time_no_sites_health_posts_optimal': 'mean', 
'travel_time_no_sites_major_hospitals_optimal': 'mean', 
'travel_time_no_sites_primary_schools_fixed': 'mean', 
'travel_time_no_sites_secondary_schools_fixed': 'mean', 
'travel_time_no_sites_semi_dense_urban_optimal': 'mean', 
'travel_time_primary_schools_fixed': 'mean', 
'travel_time_secondary_schools_fixed': 'mean', 
'travel_time_semi_dense_urban_optimal': 'mean',
}

In [71]:
# This is causing all values in a row to be the same.. need to fix
def make_hexagons(resolution, write_to_file):
    # make travel_time_merged_hex a geopandas dataframe that matches travel_time_merged
    travel_time_merged_hex = travel_time_merged.copy()
    travel_time_merged_hex = gpd.GeoDataFrame(travel_time_merged_hex, geometry='geometry')
    travel_time_merged_hex.crs = "EPSG:4326"

    travel_time_merged_hex['h3-index'] = None

    for idx, row in travel_time_merged_hex.iterrows():
        try:
            lat = row['geometry'].y
            lon = row['geometry'].x
            travel_time_merged_hex.at[idx, 'h3-index'] = h3.geo_to_h3(lat, lon, resolution)
        except:
            pass

    # merge all rows that match on h3-index following joining rules
    travel_time_merged_hex = travel_time_merged_hex.groupby('h3-index').agg(joining_rules)
    travel_time_merged_hex = travel_time_merged_hex.reset_index()
    # drop any rows without a h3-index
    travel_time_merged_hex = travel_time_merged_hex.dropna(subset=['h3-index'])
    # convert h3-index to polygon
    travel_time_merged_hex['geometry'] = travel_time_merged_hex['h3-index'].apply(lambda x: h3.h3_to_geo_boundary(x, True))
    # make geometry column a polygon
    travel_time_merged_hex['geometry'] = travel_time_merged_hex['geometry'].apply(lambda x: Polygon(x))
    # convert travel_time_merged_hex to geodataframe
    travel_time_merged_hex = gpd.GeoDataFrame(travel_time_merged_hex, geometry='geometry')
    travel_time_merged_hex.crs = "EPSG:4326"
    if write_to_file == True:
        travel_time_merged_hex.to_file("./synced-data/rwa_travel_time_hex-"+str(resolution)+".geojson", driver='GeoJSON', na='null')
    return travel_time_merged_hex

In [72]:
hex8 = make_hexagons(8, False)

In [73]:
# Normalizing the weighted population columns
hex8['rwi'] = hex8.apply(lambda row: row['rwi'] / row['population'] if row['population'] != 0 else np.nan, axis=1)
hex8['underweight'] = hex8.apply(lambda row: row['underweight'] / row['population'] if row['population'] != 0 else np.nan, axis=1)
hex8['male_educational_attainment_mean'] = hex8.apply(lambda row: row['male_educational_attainment_mean'] / (row['males_15_49'] + row['males_50_64'] + row['males_65_plus']) if (row['males_15_49'] + row['males_50_64'] + row['males_65_plus']) != 0 else np.nan, axis=1)
hex8['female_educational_attainment_mean'] = hex8.apply(lambda row: row['female_educational_attainment_mean'] / (row['females_15_49'] + row['females_50_64'] + row['females_65_plus']) if (row['females_15_49'] + row['females_50_64'] + row['females_65_plus']) != 0 else np.nan, axis=1)

In [74]:
# Checking to see normalizaton worked. These should be just above 8
print(hex8['female_educational_attainment_mean'].max())
print(hex8['male_educational_attainment_mean'].max())

8.725735189827622
8.966570056772278


In [75]:
# should do this after merging with hexagons so  that we don't lose precision
# change datatypes of columns
int_list = ['time_delta_constructed_sites_all_education_facilities_fixed', 'time_delta_constructed_sites_all_health_facilities_optimal', 'time_delta_constructed_sites_health_centers_optimal', 'time_delta_constructed_sites_health_posts_optimal', 'time_delta_constructed_sites_major_hospitals_optimal', 'time_delta_constructed_sites_primary_schools_fixed', 'time_delta_constructed_sites_secondary_schools_fixed', 'time_delta_constructed_sites_semi_dense_urban_optimal', 'time_delta_no_sites_all_education_facilities_fixed', 'time_delta_no_sites_all_health_facilities_optimal', 'time_delta_no_sites_health_centers_optimal', 'time_delta_no_sites_health_posts_optimal', 'time_delta_no_sites_major_hospitals_optimal', 'time_delta_no_sites_primary_schools_fixed', 'time_delta_no_sites_secondary_schools_fixed', 'time_delta_no_sites_semi_dense_urban_optimal', 'travel_time_all_education_facilities_fixed', 'travel_time_all_health_facilities_optimal', 'travel_time_constructed_sites_all_education_facilities_fixed', 'travel_time_constructed_sites_all_health_facilities_optimal', 'travel_time_constructed_sites_health_centers_optimal', 'travel_time_constructed_sites_health_posts_optimal', 'travel_time_constructed_sites_major_hospitals_optimal', 'travel_time_constructed_sites_primary_schools_fixed', 'travel_time_constructed_sites_secondary_schools_fixed', 'travel_time_constructed_sites_semi_dense_urban_optimal', 'travel_time_health_centers_optimal', 'travel_time_health_posts_optimal', 'travel_time_major_hospitals_optimal', 'travel_time_no_sites_all_education_facilities_fixed', 'travel_time_no_sites_all_health_facilities_optimal', 'travel_time_no_sites_health_centers_optimal', 'travel_time_no_sites_health_posts_optimal', 'travel_time_no_sites_major_hospitals_optimal', 'travel_time_no_sites_primary_schools_fixed', 'travel_time_no_sites_secondary_schools_fixed', 'travel_time_no_sites_semi_dense_urban_optimal', 'travel_time_primary_schools_fixed', 'travel_time_secondary_schools_fixed', 'travel_time_semi_dense_urban_optimal', 'males_0_4', 'males_0_9', 'males_10_14', 'males_15_49', 'males_50_64', 'males_5_9', 'males_65_plus', 'pop_0_4', 'pop_0_9', 'pop_10_14', 'pop_15_49', 'pop_50_64', 'pop_5_9', 'pop_65_plus', 'population', 'females_0_4', 'females_0_9', 'females_10_14', 'females_15_49', 'females_50_64', 'females_5_9', 'females_65_plus', 'births', 'pregnancies', ]
float_list = ['rwi', 'underweight', 'female_educational_attainment_mean', 'male_educational_attainment_mean']
string_list = ['x_y', 'row_col']
for col in int_list:
    hex8[col] = hex8[col].replace([np.inf, -np.inf, np.nan], np.nan).round(0).astype('Int64')
for col in float_list:
    hex8[col] = pd.to_numeric(hex8[col].replace([np.inf, -np.inf], np.nan), errors='coerce', downcast='float').round(4)


for d in hex8.dtypes:
    print(d)

object
object
object
float32
Int64
Int64
Int64
Int64
Int64
Int64
Int64
float32
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
float32
float32
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
Int64
geometry


In [76]:
# drop x_y from hex8. We won't use this in the app
hex8 = hex8.drop(['x_y'], axis=1)

In [111]:
hex8.to_file("./unsynced-data/rwanda/rwa_travel_time_hex-8-staging.geojson", driver='GeoJSON', index=False)

# Creating a hex to subregion lookup table

In [86]:
# create a dataframe with just h3-index and row_col
lookup = hex8[['h3-index', 'row_col']]

In [87]:
data_list = []

for idx, row in lookup.iterrows():
    row_col_list = row['row_col'].split('&')
    for row_col in row_col_list:
        data_list.append({'h3-index': row['h3-index'], 'row_col': row_col})

# create the DataFrame
lookup_full = pd.DataFrame(data_list, columns=['h3-index', 'row_col'])

In [88]:
lookup_full

Unnamed: 0,h3-index,row_col
0,886ad80001fffff,1334_1576
1,886ad80001fffff,1335_1573
2,886ad80001fffff,1335_1574
3,886ad80001fffff,1335_1575
4,886ad80001fffff,1335_1576
...,...,...
1150599,886adeb76dfffff,760_600
1150600,886adeb76dfffff,760_601
1150601,886adeb76dfffff,760_602
1150602,886adeb76dfffff,760_603


In [89]:
# how many unique values of h3-index are there
len(lookup_full['h3-index'].unique())

26184

In [90]:
# write lookup_full to json file
lookup_full.to_json('./synced-data/subregion-to-hex-index.json', orient='records')

In [91]:
# write lookup_full to csv file
lookup_full.to_csv('./synced-data/subregion-to-hex-index.csv', index=False)

# Generate Impact Scores

In [113]:
# run if starting here
hex8 = gpd.read_file('./unsynced-data/rwanda/rwa_travel_time_hex-8-staging.geojson')

In [114]:
school_bins = [30, 45, 60]
health_bins = [45, 90, 135]
hospital_bins = [60, 120, 180]
market_bins = [60, 120, 180]

In [115]:
scoring_rubric =[[0, 0, 0, 0], [2, 0, 0, 0], [4, 2, 0, 0], [8, 4, 2, 0]]

def calculate_impact_score(before_travel_time, after_travel_time, bins):
    # Calculate the bin indices for before and after travel times
    before_bin = np.digitize(before_travel_time, bins)
    after_bin = np.digitize(after_travel_time, bins)
    
    # Initialize an array to store impact scores
    impact_scores = np.zeros(len(before_travel_time))
    
    # Loop through each row and calculate the impact score
    for i in range(len(before_travel_time)):
        impact_scores[i] = scoring_rubric[before_bin[i]][after_bin[i]]
        if impact_scores[i] == 0 and before_travel_time[i] != after_travel_time[i] and not np.isnan(after_travel_time[i]):
            print(before_travel_time[i], after_travel_time[i], before_bin[i], after_bin[i])
            impact_scores[i] = 1

    return impact_scores

In [None]:
hex8['all_education_potential_impact'] = calculate_impact_score(hex8['travel_time_no_sites_all_education_facilities_fixed'], hex8['travel_time_all_education_facilities_fixed'], school_bins)
hex8['all_education_current_impact'] = calculate_impact_score(hex8['travel_time_no_sites_all_education_facilities_fixed'], hex8['travel_time_constructed_sites_all_education_facilities_fixed'], school_bins)
hex8['all_health_potential_impact'] = calculate_impact_score(hex8['travel_time_no_sites_all_health_facilities_optimal'], hex8['travel_time_all_health_facilities_optimal'], health_bins)
hex8['all_health_current_impact'] = calculate_impact_score(hex8['travel_time_no_sites_all_health_facilities_optimal'], hex8['travel_time_constructed_sites_all_health_facilities_optimal'], health_bins)
hex8['health_centers_potential_impact'] = calculate_impact_score(hex8['travel_time_no_sites_health_centers_optimal'], hex8['travel_time_health_centers_optimal'], health_bins)
hex8['health_centers_current_impact'] = calculate_impact_score(hex8['travel_time_no_sites_health_centers_optimal'], hex8['travel_time_constructed_sites_health_centers_optimal'], health_bins)
hex8['health_posts_potential_impact'] = calculate_impact_score(hex8['travel_time_no_sites_health_posts_optimal'], hex8['travel_time_health_posts_optimal'], health_bins)
hex8['health_posts_current_impact'] = calculate_impact_score(hex8['travel_time_no_sites_health_posts_optimal'], hex8['travel_time_constructed_sites_health_posts_optimal'], health_bins)
hex8['major_hospitals_potential_impact'] = calculate_impact_score(hex8['travel_time_no_sites_major_hospitals_optimal'], hex8['travel_time_major_hospitals_optimal'], hospital_bins)
hex8['major_hospitals_current_impact'] = calculate_impact_score(hex8['travel_time_no_sites_major_hospitals_optimal'], hex8['travel_time_constructed_sites_major_hospitals_optimal'], hospital_bins)
hex8['primary_schools_potential_impact'] = calculate_impact_score(hex8['travel_time_no_sites_primary_schools_fixed'], hex8['travel_time_primary_schools_fixed'], school_bins)
hex8['primary_schools_current_impact'] = calculate_impact_score(hex8['travel_time_no_sites_primary_schools_fixed'], hex8['travel_time_constructed_sites_primary_schools_fixed'], school_bins)
hex8['secondary_schools_potential_impact'] = calculate_impact_score(hex8['travel_time_no_sites_secondary_schools_fixed'], hex8['travel_time_secondary_schools_fixed'], school_bins)
hex8['secondary_schools_current_impact'] = calculate_impact_score(hex8['travel_time_no_sites_secondary_schools_fixed'], hex8['travel_time_constructed_sites_secondary_schools_fixed'], school_bins)
hex8['semi_dense_urban_potential_impact'] = calculate_impact_score(hex8['travel_time_no_sites_semi_dense_urban_optimal'], hex8['travel_time_semi_dense_urban_optimal'], market_bins)
hex8['semi_dense_urban_current_impact'] = calculate_impact_score(hex8['travel_time_no_sites_semi_dense_urban_optimal'], hex8['travel_time_constructed_sites_semi_dense_urban_optimal'], market_bins)

In [117]:
hex8['total_potential_impact'] = hex8['health_centers_potential_impact'] + hex8['health_posts_potential_impact'] + hex8['major_hospitals_potential_impact'] + hex8['primary_schools_potential_impact'] + hex8['secondary_schools_potential_impact'] + hex8['semi_dense_urban_potential_impact']
hex8['total_current_impact'] = hex8['health_centers_current_impact'] + hex8['health_posts_current_impact'] + hex8['major_hospitals_current_impact'] + hex8['primary_schools_current_impact'] + hex8['secondary_schools_current_impact'] + hex8['semi_dense_urban_current_impact']
hex8['total_school_current_impact'] =  hex8['primary_schools_current_impact'] + hex8['secondary_schools_current_impact']
hex8['total_school_potential_impact'] =  hex8['primary_schools_potential_impact'] + hex8['secondary_schools_potential_impact']
hex8['total_health_current_impact'] = hex8['health_centers_current_impact'] + hex8['health_posts_current_impact'] + hex8['major_hospitals_current_impact']
hex8['total_health_potential_impact'] = hex8['health_centers_potential_impact'] + hex8['health_posts_potential_impact'] + hex8['major_hospitals_potential_impact']

# Create a dataMap dictionary for the front end application

In [None]:
# Run if starting here
# hex8_path = './unsynced-data/rwanda/rwa_travel_time_hex-8-staging.geojson'
# hex8 = gpd.read_file(hex8_path) 
# hex8

In [118]:
# Run this if ammending a dictionary vs. creating a new one
# read './synced-data/dataMap.json' as json
with open('./synced-data/data-map.json', 'r') as f:
    data_map = json.load(f)



In [135]:
def update_data_map(df):
    # for key in data_map 3 onward
    for key in data_map:
        if key in ["key", "h3-index", "row_col"]:
            print("skipping " + key)
        else:
            if key not in data_map:
                print("key is not in data_map")
            else:
                data_map[key][3]["max"] = round(hex8[key].max(), 4)
                data_map[key][3]["min"] = round(hex8[key].min(), 4)
                data_map[key][3]["mean"] = round(hex8[key].mean(), 4)
                data_map[key][3]["median"] = round(hex8[key].median(), 4)
                data_map[key][3]["98_percentile"] = round(hex8[key].quantile(.98), 4)
                data_map[key][3]["99_percentile"] = round(hex8[key].quantile(.99), 4)
                data_map[key][3]["2_percentile"] = round(hex8[key].quantile(.02), 4)
                data_map[key][3]["1_percentile"] = round(hex8[key].quantile(.01), 4)
                data_map[key][3]["quantiles"] = [round(hex8[key].quantile(0), 4), round(hex8[key].quantile(0.1), 4), round(hex8[key].quantile(0.2), 4), round(hex8[key].quantile(0.3), 4), round(hex8[key].quantile(0.4), 4), round(hex8[key].quantile(0.5), 4), round(hex8[key].quantile(0.6), 4), round(hex8[key].quantile(0.7), 4), round(hex8[key].quantile(0.8), 4), round(hex8[key].quantile(0.9), 4), round(hex8[key].quantile(1), 4)]
                data_map[key][3]["style_stops_99"] = [round(hex8[key].quantile(.01), 4), round(((hex8[key].quantile(.99)-hex8[key].quantile(.01))*0.25)+hex8[key].quantile(.01), 4), round(((hex8[key].quantile(.99)-hex8[key].quantile(.01)) * 0.5)+hex8[key].quantile(.01), 4), round(((hex8[key].quantile(.99)-hex8[key].quantile(.01))*0.75)+hex8[key].quantile(.01), 4), round(hex8[key].quantile(.99), 4)]
                data_map[key][3]["style_stops_98"] = [round(hex8[key].quantile(.02), 4), round(((hex8[key].quantile(.98)-hex8[key].quantile(.02))*0.25)+hex8[key].quantile(.02), 4), round(((hex8[key].quantile(.98)-hex8[key].quantile(.02)) * 0.5)+hex8[key].quantile(.02), 4), round(((hex8[key].quantile(.98)-hex8[key].quantile(.02))*0.75)+hex8[key].quantile(.02), 4), round(hex8[key].quantile(.98), 4)]
                data_map[key][3]["style_stops_maxmin"] = [round(hex8[key].min(), 4), round(((hex8[key].max()-hex8[key].min())*0.25)+hex8[key].min(), 4), round(((hex8[key].max()-hex8[key].min())*0.5)+hex8[key].min(), 4), round(((hex8[key].max()-hex8[key].min())*0.75)+hex8[key].min(), 4), round(hex8[key].max(), 4)]
            
            data_map[key][3] = {
                k: pd.to_numeric(v) if isinstance(v, pd.Series) else v
                for k, v in data_map[key][3].items()
            }
    return data_map


In [136]:
data_map = update_data_map(hex8)
data_map

skipping key
skipping h3-index
skipping row_col


{'key': ['Menu Options',
  'hover panel text',
  'color scheme',
  'percentiles',
  'legend labels'],
 'h3-index': [['N/A'], ['H3 Index', ''], 'rainbow'],
 'row_col': [['N/A'], ['Row/Col', ''], 'rainbow'],
 'female_educational_attainment_mean': [['Demographics',
   'Female Education',
   'N/A'],
  ['Avg. Female Education', 'years'],
  'rdylgn',
  {'max': 8.7257,
   'min': 3.3976,
   'mean': 4.6791,
   'median': 4.5781,
   '98_percentile': 6.6929,
   '99_percentile': 7.2846,
   '2_percentile': 3.7639,
   '1_percentile': 3.7098,
   'quantiles': [3.3976,
    4.0175,
    4.2098,
    4.3351,
    4.46,
    4.5781,
    4.6959,
    4.8412,
    5.0483,
    5.339,
    8.7257],
   'style_stops_99': [3.7098, 4.6035, 5.4972, 6.3909, 7.2846],
   'style_stops_98': [3.7639, 4.4962, 5.2284, 5.9606, 6.6929],
   'style_stops_maxmin': [3.3976, 4.7296, 6.0617, 7.3937, 8.7257]},
  ['Less Educated', 'More Educated']],
 'females_0_4': [['Population', 'Female 0-4', 'N/A'],
  ['Population', 'females'],
  'magma

In [138]:
def custom_encoder(obj):
    if isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    else:
        raise TypeError(f"Object of type {type(obj)} is not JSON serializable")

# Serialize the dictionary to a JSON file
with open("./synced-data/data-map.json", "w") as json_file:
    json.dump(data_map, json_file, default=custom_encoder, indent=4)
    

# Writing final outputs
Writing outputs to geojson file and optimizing for size

In [6]:
# dropping row_col column to save space
hex8 = hex8.drop(['row_col'], axis=1)

In [7]:
# Write to file
output_path = "./unsynced-data/rwanda/rwa_travel_time_hex-8-staging.geojson"
hex8.to_file(output_path, driver='GeoJSON', index=False)