In [None]:
import geopandas as pd
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
from shapely.geometry import LineString, Point
import json
import os
import gc

In [3]:
# paths
local_path = "/Volumes/samsung-4tb/b2p/impact-model/no_order_1_less_than_500m_with_top_sites/model_outputs/"
path_data_path = "joined_data/travel_paths_to_all_education_facilities_fixed.parquet"
output_folder_path ="/Volumes/samsung-4tb/b2p/impact-model/cleaned_data/path_data/"

health_path = "joined_data/travel_paths_to_all_health_facilities_optimal.parquet"
edu_path = "joined_data/travel_paths_to_all_education_facilities_fixed.parquet"
market_path = "joined_data/travel_time_to_semi_dense_urban_optimal.parquet"


et_path = os.path.join(local_path, "ethiopia/")
civ_path = os.path.join(local_path, "ivory_coast/")
kenya_path = os.path.join(local_path, "kenya/")
rwanda_path = os.path.join(local_path, "rwanda/")
tanzania_path = os.path.join(local_path, "tanzania/")
uganda_path = os.path.join(local_path, "uganda/")
zambia_path = os.path.join(local_path, "zambia/")


In [3]:
# read each health data set by taking local path + each country path + health_path

et_health = pd.read_parquet(os.path.join(et_path, health_path))
print ("Ethiopia health data loaded:", et_health.shape)
civ_health = pd.read_parquet(os.path.join(civ_path, health_path))
print ("Ivory Coast health data loaded:", civ_health.shape)
kenya_health = pd.read_parquet(os.path.join(kenya_path, health_path))
print ("Kenya health data loaded:", kenya_health.shape)
rwanda_health = pd.read_parquet(os.path.join(rwanda_path, health_path))
print ("Rwanda health data loaded:", rwanda_health.shape)
uganda_health = pd.read_parquet(os.path.join(uganda_path, health_path))
print ("Uganda health data loaded:", uganda_health.shape)
zambia_health = pd.read_parquet(os.path.join(zambia_path, health_path))
print ("Zambia health data loaded:", zambia_health.shape)
tanzania_health = pd.read_parquet(os.path.join(tanzania_path, health_path))
print ("Tanzania health data loaded:", tanzania_health.shape)


Ethiopia health data loaded: (21679729, 16)
Ivory Coast health data loaded: (4965427, 16)
Kenya health data loaded: (10885694, 16)
Rwanda health data loaded: (1216780, 16)
Uganda health data loaded: (6579988, 16)


: 

In [4]:
# Start with the first dataset
combined_health = pd.read_parquet(os.path.join(et_path, health_path))
print(f"Starting with Ethiopia: {combined_health.shape}")

# List of remaining paths
remaining_paths = [
    (civ_path, "Ivory Coast"),
    (kenya_path, "Kenya"), 
    (rwanda_path, "Rwanda"),
    (uganda_path, "Uganda"),
    (zambia_path, "Tanzania"),
    (tanzania_path, "Tanzania")
]

for country_path, country_name in remaining_paths:
    try:
        # Load next dataset
        temp_df = pd.read_parquet(os.path.join(country_path, health_path))
        print(f"Loaded {country_name}: {temp_df.shape}")
        
        # Concatenate with existing data
        combined_health = pd.concat([combined_health, temp_df], ignore_index=True)
        print(f"Combined shape after {country_name}: {combined_health.shape}")
        
        # Free memory
        del temp_df
        gc.collect()
        
    except Exception as e:
        print(f"Error processing {country_name}: {e}")

Starting with Ethiopia: (21679729, 16)
Loaded Ivory Coast: (4965427, 16)
Combined shape after Ivory Coast: (26645156, 16)
Loaded Kenya: (10885694, 16)
Combined shape after Kenya: (37530850, 16)
Loaded Rwanda: (1216780, 16)
Combined shape after Rwanda: (38747630, 16)
Loaded Uganda: (6579988, 16)
Combined shape after Uganda: (45327618, 16)


: 

In [None]:
print("All health data combined:", combined_health.shape)

In [None]:
# write to parquet file
output_health_path = os.path.join(output_folder_path, "travel_paths_to_all_health_facilities_optimal.parquet")
combined_health.to_parquet(output_health_path)

In [None]:
# read each edu data set by taking local path + each country path + edu_path
et_edu = pd.read_parquet(os.path.join(et_path, edu_path))
print ("Ethiopia education data loaded:", et_edu.shape)
civ_edu = pd.read_parquet(os.path.join(civ_path, edu_path))
print ("Ivory Coast education data loaded:", civ_edu.shape)
kenya_edu = pd.read_parquet(os.path.join(kenya_path, edu_path))
print ("Kenya education data loaded:", kenya_edu.shape)
rwanda_edu = pd.read_parquet(os.path.join(rwanda_path, edu_path))
print ("Rwanda education data loaded:", rwanda_edu.shape)
tanzania_edu = pd.read_parquet(os.path.join(tanzania_path, edu_path))
print ("Tanzania education data loaded:", tanzania_edu.shape)
uganda_edu = pd.read_parquet(os.path.join(uganda_path, edu_path))
print ("Uganda education data loaded:", uganda_edu.shape)
zambia_edu = pd.read_parquet(os.path.join(zambia_path, edu_path))
print ("Zambia education data loaded:", zambia_edu.shape)

In [5]:
# read et_path as geopandas df

civ_path_df = pd.read_parquet(civ_path)
civ_path_df

Unnamed: 0,row,col,subregion,coords,exit_point_used,exit_point_index,destination_index,destination_coords,exit_point_index_path,h3_index,exit_point_path,path_to_exit_point,exit_point_used_no_sites,exit_point_path_no_sites,path_to_exit_point_no_sites
0,3653,1262,21093,"[-7.5474, 7.69195]","[-7.5474, 7.69195]",230524,2095,"[-7.47986, 7.6695]","[230524, 83597, 83598, 83599, 83600, 83601, 83...",8875a19995fffff,"[[-7.5474, 7.69195], [-7.5409, 7.69699], [-7.5...","[[-7.5474, 7.69195]]",,,
1,4463,1478,27651,"[-7.3675, 7.016667]","[-7.3744, 7.01404]",83875,1485,"[-7.39222, 7.0519]","[83875, 82970, 82971, 82972, 82973]",8875a12459fffff,"[[-7.3744, 7.01404], [-7.40107, 7.04269], [-7....","[[-7.3675, 7.016667], [-7.3683333, 7.0158334],...",,,
2,4464,1477,27651,"[-7.3683333, 7.0158334]","[-7.3744, 7.01404]",83875,1485,"[-7.39222, 7.0519]","[83875, 82970, 82971, 82972, 82973]",8875a12459fffff,"[[-7.3744, 7.01404], [-7.40107, 7.04269], [-7....","[[-7.3683333, 7.0158334], [-7.3683333, 7.01500...",,,
3,4465,1477,27651,"[-7.3683333, 7.0150003]","[-7.3744, 7.01404]",83875,1485,"[-7.39222, 7.0519]","[83875, 82970, 82971, 82972, 82973]",8875a12459fffff,"[[-7.3744, 7.01404], [-7.40107, 7.04269], [-7....","[[-7.3683333, 7.0150003], [-7.3683333, 7.01416...",,,
4,4466,1471,27651,"[-7.373462, 7.0139494]","[-7.3744, 7.01404]",83875,1485,"[-7.39222, 7.0519]","[83875, 82970, 82971, 82972, 82973]",8875a12451fffff,"[[-7.3744, 7.01404], [-7.40107, 7.04269], [-7....","[[-7.373462, 7.0139494], [-7.374401, 7.0140433...",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4964078,6807,2094,45547,"[-6.8541665, 5.0633335]","[-6.85284, 5.06473]",104919,170,"[-6.61776, 5.16575]","[104919, 261139, 104917, 104918, 208015, 10492...",8875a96f0dfffff,"[[-6.85284, 5.06473], [-6.84043, 5.06062], [-6...","[[-6.8541665, 5.0633335], [-6.8533335, 5.06416...",,"[[-6.86037, 5.05999], [-6.86074, 5.05988], [-6...","[[-6.8541665, 5.0633335], [-6.855, 5.0633335],..."
4964079,6808,2090,45547,"[-6.8575, 5.0625]","[-6.85284, 5.06473]",104919,170,"[-6.61776, 5.16575]","[104919, 261139, 104917, 104918, 208015, 10492...",8875a96f0dfffff,"[[-6.85284, 5.06473], [-6.84043, 5.06062], [-6...","[[-6.8575, 5.0625], [-6.8575, 5.0633335], [-6....",,"[[-6.86037, 5.05999], [-6.86074, 5.05988], [-6...","[[-6.8575, 5.0625], [-6.858333, 5.061667], [-6..."
4964080,6809,2089,45547,"[-6.858333, 5.061667]","[-6.85284, 5.06473]",104919,170,"[-6.61776, 5.16575]","[104919, 261139, 104917, 104918, 208015, 10492...",8875a96f0dfffff,"[[-6.85284, 5.06473], [-6.84043, 5.06062], [-6...","[[-6.858333, 5.061667], [-6.8575, 5.0625], [-6...",,"[[-6.86037, 5.05999], [-6.86074, 5.05988], [-6...","[[-6.858333, 5.061667], [-6.8591666, 5.0608335]]"
4964081,6810,2088,45547,"[-6.8591666, 5.0608335]","[-6.85284, 5.06473]",104919,170,"[-6.61776, 5.16575]","[104919, 261139, 104917, 104918, 208015, 10492...",8875a96f0dfffff,"[[-6.85284, 5.06473], [-6.84043, 5.06062], [-6...","[[-6.8591666, 5.0608335], [-6.858333, 5.061667...",,"[[-6.86037, 5.05999], [-6.86074, 5.05988], [-6...","[[-6.8591666, 5.0608335]]"


In [39]:
# what is the max median and min length of the path_to_exit_point array column
civ_path_df['path_to_exit_point'].apply(lambda x: len(x)).describe()

count    4.964083e+06
mean     1.143990e+01
std      1.119403e+01
min      0.000000e+00
25%      4.000000e+00
50%      9.000000e+00
75%      1.600000e+01
max      1.600000e+02
Name: path_to_exit_point, dtype: float64

In [8]:
civ_path_columns = civ_path_df.columns.tolist()
civ_path_columns

['row',
 'col',
 'subregion',
 'coords',
 'exit_point_used',
 'exit_point_index',
 'destination_index',
 'destination_coords',
 'exit_point_index_path',
 'h3_index',
 'exit_point_path',
 'path_to_exit_point',
 'exit_point_used_no_sites',
 'exit_point_path_no_sites',
 'path_to_exit_point_no_sites']

In [10]:
# create a new dataframe with only exit_point_path and h3_index columns
exit_point_paths = civ_path_df[['exit_point_path', 'h3_index']]

In [19]:
# convert exit_point_path to a geodataframe with the exit_point_path as a line string geometry
exit_point_paths['geometry'] = exit_point_paths['exit_point_path'].apply(
    lambda x: LineString(x) if len(x) > 1 else Point(x[0])
)

TypeError: object of type 'NoneType' has no len()

In [27]:
print (exit_point_paths['exit_point_path'][0])

for i in exit_point_paths['exit_point_path'][0]:
    print(i)

[array([-7.5474 ,  7.69195], dtype=float32)
 array([-7.5409 ,  7.69699], dtype=float32)
 array([-7.52889,  7.69456], dtype=float32)
 array([-7.52528,  7.69327], dtype=float32)
 array([-7.51482,  7.68045], dtype=float32)
 array([-7.51141,  7.67603], dtype=float32)
 array([-7.50798,  7.67117], dtype=float32)
 array([-7.50352,  7.66823], dtype=float32)
 array([-7.49819,  7.669  ], dtype=float32)
 array([-7.49356,  7.66919], dtype=float32)
 array([-7.47986,  7.6695 ], dtype=float32)]
[-7.5474   7.69195]
[-7.5409   7.69699]
[-7.52889  7.69456]
[-7.52528  7.69327]
[-7.51482  7.68045]
[-7.51141  7.67603]
[-7.50798  7.67117]
[-7.50352  7.66823]
[-7.49819  7.669  ]
[-7.49356  7.66919]
[-7.47986  7.6695 ]


In [None]:
def create_linestring(x):
    if x is None:
        return None
    
    try:
        # If x is a list/array of coordinate pairs
        if len(x) >= 2:  # Need at least 2 points for a LineString
            # Convert to list of (x, y) tuples
            coords = [(point[0], point[1]) for point in x]
            return LineString(coords)
        elif len(x) == 1:  # Single point
            return Point(x[0][0], x[0][1])
        else:
            return None
    except Exception as e:
        print(f"Error processing: {x}, Error: {e}")
        return None

# Apply the function to create geometries
exit_point_paths['geometry'] = exit_point_paths['exit_point_path'].apply(create_linestring)

# Convert to GeoDataFrame
gdf = pd.GeoDataFrame(exit_point_paths, geometry='geometry')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exit_point_paths['geometry'] = exit_point_paths['exit_point_path'].apply(create_linestring)


In [30]:
gdf

Unnamed: 0,exit_point_path,h3_index,geometry
0,"[[-7.5474, 7.69195], [-7.5409, 7.69699], [-7.5...",8875a19995fffff,"LINESTRING (-7.54740 7.69195, -7.54090 7.69699..."
1,"[[-7.3744, 7.01404], [-7.40107, 7.04269], [-7....",8875a12459fffff,"LINESTRING (-7.37440 7.01404, -7.40107 7.04269..."
2,"[[-7.3744, 7.01404], [-7.40107, 7.04269], [-7....",8875a12459fffff,"LINESTRING (-7.37440 7.01404, -7.40107 7.04269..."
3,"[[-7.3744, 7.01404], [-7.40107, 7.04269], [-7....",8875a12459fffff,"LINESTRING (-7.37440 7.01404, -7.40107 7.04269..."
4,"[[-7.3744, 7.01404], [-7.40107, 7.04269], [-7....",8875a12451fffff,"LINESTRING (-7.37440 7.01404, -7.40107 7.04269..."
...,...,...,...
4964078,"[[-6.85284, 5.06473], [-6.84043, 5.06062], [-6...",8875a96f0dfffff,"LINESTRING (-6.85284 5.06473, -6.84043 5.06062..."
4964079,"[[-6.85284, 5.06473], [-6.84043, 5.06062], [-6...",8875a96f0dfffff,"LINESTRING (-6.85284 5.06473, -6.84043 5.06062..."
4964080,"[[-6.85284, 5.06473], [-6.84043, 5.06062], [-6...",8875a96f0dfffff,"LINESTRING (-6.85284 5.06473, -6.84043 5.06062..."
4964081,"[[-6.85284, 5.06473], [-6.84043, 5.06062], [-6...",8875a96f0dfffff,"LINESTRING (-6.85284 5.06473, -6.84043 5.06062..."


In [38]:
# set to epsg:4326
gdf.set_crs(epsg=4326, inplace=True)
gdf.geometry.head(1000).explore()