In [1]:
import geopandas as gpd
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
from shapely.geometry import LineString, Point
import json
import os

In [2]:
def convert_to_geodataframe_points(df):
    # Create a copy of the dataframe to avoid modifying the original
    gdf = df.copy()
    
    # Convert the coordinates to shapely Point objects
    gdf['geometry'] = gdf['coords'].apply(lambda x: Point(x[0], x[1]))  # Note: Point(lat, lon)
    
    # Create the GeoDataFrame
    gdf = gpd.GeoDataFrame(gdf, geometry='geometry')
    
    # Set the coordinate reference system (CRS) to WGS84
    gdf.set_crs(epsg=4326, inplace=True)
    # drop coords column
    gdf = gdf.drop(columns=['coords'])
    #rename index column
    gdf.rename(columns={"index": "bridge_index"}, inplace=True)
    
    return gdf

In [5]:
def drop_unused_columns(df):
    # drop columns that are not needed
    drop_cols = ["type", "geometry", "subregion_indices"]
    for c in df.columns:
        if c.startswith("used_by_cells"):
            drop_cols.append(c)
    for c in df.columns:
        if "_fixed" in c:
            drop_cols.append(c)
    return df.drop(columns=drop_cols)

In [25]:
def convert_array_columns(df):
    df_fixed = df.copy()
    print(f"Starting conversion process for {len(df.columns)} columns")

    # Loop through each column
    complex_cols_found = 0
    for i, col in enumerate(df.columns):
        print(f"Processing column {i+1}/{len(df.columns)}: '{col}'")
        
        # Check if column contains complex data types
        has_complex = df[col].apply(lambda x: isinstance(x, (np.ndarray, list, dict))).any()
        
        if has_complex:
            complex_cols_found += 1
            print(f"  Found complex data in column '{col}' - converting...")
            
            # Sample the first complex value for debugging
            sample_idx = df[col].apply(lambda x: isinstance(x, (np.ndarray, list, dict))).idxmax()
            sample_val = df.loc[sample_idx, col]
            print(f"  Sample value before conversion: {type(sample_val)}, first few items: {str(sample_val)[:50]}...")
            
            # Convert complex values to JSON strings
            df_fixed[col] = df[col].apply(
                lambda x: json.dumps(x.tolist() if isinstance(x, np.ndarray) else x) 
                if isinstance(x, (np.ndarray, list, dict)) else x
            )
            
            # Verify conversion worked
            new_sample_val = df_fixed.loc[sample_idx, col]
            print(f"  Sample value after conversion: {type(new_sample_val)}, preview: {str(new_sample_val)[:50]}...")

    print(f"Conversion complete! Found and converted {complex_cols_found} columns with complex data types")
    
    return df_fixed

In [26]:
def process_data(file_path):
    # Load the data
    df = pd.read_parquet(file_path)
    print(f"Loaded data with shape: {df.shape}")

     # Drop unused columns
    df = drop_unused_columns(df)
    print(f"Dropped unused columns, new shape: {df.shape}")

    # Convert the data to a GeoDataFrame
    df = convert_to_geodataframe_points(df)
    print(f"Converted to GeoDataFrame with shape: {df.shape}")

    # Convert array columns to JSON strings
    df = convert_array_columns(df)
    print(f"Converted columns. New shape: {df.shape}")

    return df

In [27]:
# paths
local_path = "/Volumes/samsung-4tb/b2p/impact-model/no_order_1_less_than_500m_with_top_sites/model_outputs/"
bridge_view_path = "joined_data/bridge_view_data.parquet"
output_folder_path ="/Volumes/samsung-4tb/b2p/impact-model/cleaned_data/"

et_path = os.path.join(local_path, "ethiopia/",bridge_view_path)
civ_path = os.path.join(local_path, "ivory_coast/",bridge_view_path)
kenya_path = os.path.join(local_path, "kenya/",bridge_view_path)
rwanda_path = os.path.join(local_path, "rwanda/",bridge_view_path)
tanzania_path = os.path.join(local_path, "tanzania/",bridge_view_path)
uganda_path = os.path.join(local_path, "uganda/",bridge_view_path)
zambia_path = os.path.join(local_path, "zambia/",bridge_view_path)


In [30]:
et_bridges = process_data(et_path)

Loaded data with shape: (46246, 24)
Dropped unused columns, new shape: (46246, 9)
Converted to GeoDataFrame with shape: (46246, 9)
Starting conversion process for 9 columns
Processing column 1/9: 'bridge_index'
Processing column 2/9: 'exit_point_index'
Processing column 3/9: 'used_by_h3_for_semi_dense_urban_optimal'
  Found complex data in column 'used_by_h3_for_semi_dense_urban_optimal' - converting...
  Sample value before conversion: <class 'numpy.ndarray'>, first few items: ['886a4b769bfffff' '886a4b769bfffff' '886a4b7691ff...
  Sample value after conversion: <class 'str'>, preview: ["886a4b769bfffff", "886a4b769bfffff", "886a4b7691...
Processing column 4/9: 'used_by_h3_for_health_posts_optimal'
  Found complex data in column 'used_by_h3_for_health_posts_optimal' - converting...
  Sample value before conversion: <class 'numpy.ndarray'>, first few items: ['886a4b769bfffff' '886a4b769bfffff' '886a4b7691ff...
  Sample value after conversion: <class 'str'>, preview: ["886a4b769bfffff",

In [31]:
civ_bridges = process_data(civ_path)

Loaded data with shape: (9048, 24)
Dropped unused columns, new shape: (9048, 9)
Converted to GeoDataFrame with shape: (9048, 9)
Starting conversion process for 9 columns
Processing column 1/9: 'bridge_index'
Processing column 2/9: 'exit_point_index'
Processing column 3/9: 'used_by_h3_for_semi_dense_urban_optimal'
  Found complex data in column 'used_by_h3_for_semi_dense_urban_optimal' - converting...
  Sample value before conversion: <class 'numpy.ndarray'>, first few items: ['8874368f33fffff' '8874368f33fffff']...
  Sample value after conversion: <class 'str'>, preview: ["8874368f33fffff", "8874368f33fffff"]...
Processing column 4/9: 'used_by_h3_for_health_posts_optimal'
  Found complex data in column 'used_by_h3_for_health_posts_optimal' - converting...
  Sample value before conversion: <class 'numpy.ndarray'>, first few items: ['8874368dd9fffff' '8874368ce5fffff' '8874368ce5ff...
  Sample value after conversion: <class 'str'>, preview: ["8874368dd9fffff", "8874368ce5fffff", "8874368

In [32]:
kenya_bridges = process_data(kenya_path)

Loaded data with shape: (13191, 24)
Dropped unused columns, new shape: (13191, 9)
Converted to GeoDataFrame with shape: (13191, 9)
Starting conversion process for 9 columns
Processing column 1/9: 'bridge_index'
Processing column 2/9: 'exit_point_index'
Processing column 3/9: 'used_by_h3_for_semi_dense_urban_optimal'
  Found complex data in column 'used_by_h3_for_semi_dense_urban_optimal' - converting...
  Sample value before conversion: <class 'numpy.ndarray'>, first few items: ['887b4a0633fffff']...
  Sample value after conversion: <class 'str'>, preview: ["887b4a0633fffff"]...
Processing column 4/9: 'used_by_h3_for_health_posts_optimal'
  Found complex data in column 'used_by_h3_for_health_posts_optimal' - converting...
  Sample value before conversion: <class 'numpy.ndarray'>, first few items: ['887b4a0633fffff' '887b4a0623fffff' '887b4a0621ff...
  Sample value after conversion: <class 'str'>, preview: ["887b4a0633fffff", "887b4a0623fffff", "887b4a0621...
Processing column 5/9: 'use

In [33]:
rwanda_bridges = process_data(rwanda_path)

Loaded data with shape: (1010, 24)
Dropped unused columns, new shape: (1010, 9)
Converted to GeoDataFrame with shape: (1010, 9)
Starting conversion process for 9 columns
Processing column 1/9: 'bridge_index'
Processing column 2/9: 'exit_point_index'
Processing column 3/9: 'used_by_h3_for_semi_dense_urban_optimal'
  Found complex data in column 'used_by_h3_for_semi_dense_urban_optimal' - converting...
  Sample value before conversion: <class 'numpy.ndarray'>, first few items: ['886ada8f0bfffff' '886ada8f09fffff' '886ada8f09ff...
  Sample value after conversion: <class 'str'>, preview: ["886ada8f0bfffff", "886ada8f09fffff", "886ada8f09...
Processing column 4/9: 'used_by_h3_for_health_posts_optimal'
  Found complex data in column 'used_by_h3_for_health_posts_optimal' - converting...
  Sample value before conversion: <class 'numpy.ndarray'>, first few items: ['886ada8f0bfffff' '886ada8f09fffff' '886ada8f09ff...
  Sample value after conversion: <class 'str'>, preview: ["886ada8f0bfffff", "8

In [34]:
tanzania_bridges = process_data(tanzania_path)

Loaded data with shape: (25616, 24)
Dropped unused columns, new shape: (25616, 9)
Converted to GeoDataFrame with shape: (25616, 9)
Starting conversion process for 9 columns
Processing column 1/9: 'bridge_index'
Processing column 2/9: 'exit_point_index'
Processing column 3/9: 'used_by_h3_for_semi_dense_urban_optimal'
  Found complex data in column 'used_by_h3_for_semi_dense_urban_optimal' - converting...
  Sample value before conversion: <class 'numpy.ndarray'>, first few items: ['8896a471d1fffff' '8896a471d1fffff' '8896a471d1ff...
  Sample value after conversion: <class 'str'>, preview: ["8896a471d1fffff", "8896a471d1fffff", "8896a471d1...
Processing column 4/9: 'used_by_h3_for_health_posts_optimal'
  Found complex data in column 'used_by_h3_for_health_posts_optimal' - converting...
  Sample value before conversion: <class 'numpy.ndarray'>, first few items: ['8896a47189fffff' '8896a47189fffff' '8896a47189ff...
  Sample value after conversion: <class 'str'>, preview: ["8896a47189fffff",

In [35]:
uganda_bridges = process_data(uganda_path)

Loaded data with shape: (5951, 24)
Dropped unused columns, new shape: (5951, 9)
Converted to GeoDataFrame with shape: (5951, 9)
Starting conversion process for 9 columns
Processing column 1/9: 'bridge_index'
Processing column 2/9: 'exit_point_index'
Processing column 3/9: 'used_by_h3_for_semi_dense_urban_optimal'
  Found complex data in column 'used_by_h3_for_semi_dense_urban_optimal' - converting...
  Sample value before conversion: <class 'numpy.ndarray'>, first few items: ['886adc2951fffff' '886adc2951fffff' '886adc2951ff...
  Sample value after conversion: <class 'str'>, preview: ["886adc2951fffff", "886adc2951fffff", "886adc2951...
Processing column 4/9: 'used_by_h3_for_health_posts_optimal'
  Found complex data in column 'used_by_h3_for_health_posts_optimal' - converting...
  Sample value before conversion: <class 'numpy.ndarray'>, first few items: ['886adc2aedfffff' '886adc2aedfffff' '886adc2aedff...
  Sample value after conversion: <class 'str'>, preview: ["886adc2aedfffff", "8

In [36]:
zambia_bridges = process_data(zambia_path)

Loaded data with shape: (15212, 24)
Dropped unused columns, new shape: (15212, 9)
Converted to GeoDataFrame with shape: (15212, 9)
Starting conversion process for 9 columns
Processing column 1/9: 'bridge_index'
Processing column 2/9: 'exit_point_index'
Processing column 3/9: 'used_by_h3_for_semi_dense_urban_optimal'
  Found complex data in column 'used_by_h3_for_semi_dense_urban_optimal' - converting...
  Sample value before conversion: <class 'numpy.ndarray'>, first few items: ['88975140ebfffff']...
  Sample value after conversion: <class 'str'>, preview: ["88975140ebfffff"]...
Processing column 4/9: 'used_by_h3_for_health_posts_optimal'
  Found complex data in column 'used_by_h3_for_health_posts_optimal' - converting...
  Sample value before conversion: <class 'numpy.ndarray'>, first few items: ['88975140e1fffff' '88975140e1fffff' '88975140e1ff...
  Sample value after conversion: <class 'str'>, preview: ["88975140e1fffff", "88975140e1fffff", "88975140e1...
Processing column 5/9: 'use

In [37]:
# join all geodataframes
all_bridges = pd.concat([et_bridges, civ_bridges, kenya_bridges, rwanda_bridges, tanzania_bridges, uganda_bridges, zambia_bridges])

In [38]:
all_bridges

Unnamed: 0,bridge_index,exit_point_index,used_by_h3_for_semi_dense_urban_optimal,used_by_h3_for_health_posts_optimal,used_by_h3_for_all_health_facilities_optimal,used_by_h3_for_health_centers_optimal,used_by_h3_for_major_hospitals_optimal,used_by_h3_for_major_roads_optimal,geometry
0,302351,302352,"[""886a4b769bfffff"", ""886a4b769bfffff"", ""886a4b...","[""886a4b769bfffff"", ""886a4b769bfffff"", ""886a4b...","[""886a4b39a7fffff"", ""886a4b39a7fffff"", ""886a4b...","[""886a4b769bfffff"", ""886a4b769bfffff"", ""886a4b...",[],[],POINT (37.46229 4.84980)
1,302352,302353,"[""886a4b2b65fffff"", ""886a4b2b65fffff"", ""886a4b...","[""886a4b2b65fffff"", ""886a4b2b65fffff"", ""886a4b...","[""886a4b39a1fffff"", ""886a4b39a1fffff"", ""886a4b...","[""886a4b2b65fffff"", ""886a4b2b65fffff"", ""886a4b...",[],[],POINT (37.46595 4.85105)
2,302353,302354,"[""886a4b2b65fffff"", ""886a4b2b65fffff"", ""886a4b...","[""886a4b2b65fffff"", ""886a4b2b65fffff"", ""886a4b...","[""886a4b39a1fffff"", ""886a4b39a9fffff"", ""886a4b...","[""886a4b2b65fffff"", ""886a4b2b65fffff"", ""886a4b...",[],[],POINT (37.46508 4.85778)
3,302354,302355,"[""886a4b2b65fffff"", ""886a4b2b65fffff"", ""886a4b...","[""886a4b2b65fffff"", ""886a4b2b65fffff"", ""886a4b...","[""886a4b39e7fffff"", ""886a4b39a9fffff""]","[""886a4b2b65fffff"", ""886a4b2b65fffff"", ""886a4b...",[],"[""886a4b39e7fffff"", ""886a4b39a9fffff"", ""886a4b...",POINT (37.45727 4.86828)
4,302355,302356,"[""886a4b3937fffff"", ""886a4b3937fffff"", ""886a4b...","[""886a4b3937fffff"", ""886a4b3937fffff"", ""886a4b...","[""886a4b76d9fffff"", ""886a4b76d9fffff"", ""886a4b...","[""886a4b3937fffff"", ""886a4b3937fffff"", ""886a4b...",[],"[""886a4b76d9fffff"", ""886a4b76d9fffff"", ""886a4b...",POINT (37.45522 4.87006)
...,...,...,...,...,...,...,...,...,...
15207,105971,105972,"[""8896a9a269fffff"", ""8896a9b197fffff"", ""8896a9...","[""8896a9a269fffff"", ""8896a9b197fffff"", ""8896a9...","[""8896a9a269fffff"", ""8896a9b197fffff"", ""8896a9...","[""8896a9a269fffff"", ""8896a9b197fffff"", ""8896a9...","[""8896a9a269fffff"", ""8896a9b197fffff"", ""8896a9...",[],POINT (29.63156 -8.49877)
15208,105972,105973,"[""8896a9a345fffff"", ""8896a9a345fffff"", ""8896a9...","[""8896a9a225fffff"", ""8896a9a225fffff"", ""8896a9...","[""8896a9a345fffff"", ""8896a9a345fffff"", ""8896a9...","[""8896a9a345fffff"", ""8896a9a345fffff"", ""8896a9...","[""8896a9a225fffff"", ""8896a9a225fffff"", ""8896a9...",[],POINT (29.63656 -8.48563)
15209,105973,105974,"[""8896f4d2d3fffff"", ""8896f4d2d3fffff"", ""8896f4...","[""8896f4d2d3fffff"", ""8896f4d2d3fffff"", ""8896f4...","[""8896f4d287fffff"", ""8896f4d287fffff"", ""8896f4...","[""8896f4d287fffff"", ""8896f4d287fffff"", ""8896f4...","[""8896f4d287fffff"", ""8896f4d287fffff"", ""8896f4...",[],POINT (29.11544 -8.47639)
15210,105974,105975,"[""8896a98cb3fffff"", ""8896a98cb3fffff"", ""8896a9...","[""8896a9b91dfffff"", ""8896a9b91dfffff"", ""8896a9...","[""8896a9b91dfffff"", ""8896a9b91dfffff"", ""8896a9...","[""8896a9b90dfffff"", ""8896a9b96bfffff"", ""8896a9...","[""8896a98cb3fffff"", ""8896a98cb3fffff"", ""8896a9...",[],POINT (29.75404 -8.41852)


In [39]:
# write each individual country to geojson
et_bridges.to_file(os.path.join(output_folder_path, "ethiopia_bridges.geojson"), driver='GeoJSON')
print ("Ethiopia bridges written to geojson")
civ_bridges.to_file(os.path.join(output_folder_path, "ivory_coast_bridges.geojson"), driver='GeoJSON')
print("Ivory Coast bridges written to geojson")
kenya_bridges.to_file(os.path.join(output_folder_path, "kenya_bridges.geojson"), driver='GeoJSON')
print("Kenya bridges written to geojson")
rwanda_bridges.to_file(os.path.join(output_folder_path, "rwanda_bridges.geojson"), driver='GeoJSON')
print("Rwanda bridges written to geojson")
tanzania_bridges.to_file(os.path.join(output_folder_path, "tanzania_bridges.geojson"), driver='GeoJSON')
print("Tanzania bridges written to geojson")
uganda_bridges.to_file(os.path.join(output_folder_path, "uganda_bridges.geojson"), driver='GeoJSON')
print("Uganda bridges written to geojson")
zambia_bridges.to_file(os.path.join(output_folder_path, "zambia_bridges.geojson"), driver='GeoJSON')
print("Zambia bridges written to geojson")

Ethiopia bridges written to geojson
Ivory Coast bridges written to geojson
Kenya bridges written to geojson
Rwanda bridges written to geojson
Tanzania bridges written to geojson
Uganda bridges written to geojson
Zambia bridges written to geojson


In [40]:
# write to geojson
all_bridges = convert_array_columns(all_bridges)
all_bridges = gpd.GeoDataFrame(all_bridges, geometry='geometry')
all_bridges.set_crs(epsg=4326, inplace=True)
all_bridges.to_file(os.path.join(output_folder_path, "all_bridges.geojson"), driver='GeoJSON')
print("Done!")

Starting conversion process for 9 columns
Processing column 1/9: 'bridge_index'
Processing column 2/9: 'exit_point_index'
Processing column 3/9: 'used_by_h3_for_semi_dense_urban_optimal'
Processing column 4/9: 'used_by_h3_for_health_posts_optimal'
Processing column 5/9: 'used_by_h3_for_all_health_facilities_optimal'
Processing column 6/9: 'used_by_h3_for_health_centers_optimal'
Processing column 7/9: 'used_by_h3_for_major_hospitals_optimal'
Processing column 8/9: 'used_by_h3_for_major_roads_optimal'
Processing column 9/9: 'geometry'
Conversion complete! Found and converted 0 columns with complex data types
Done!
