In [None]:
import pandas as pd
import geopandas as gpd
from rasterio.features import rasterize
import rasterio
from rasterio.transform import from_bounds
import numpy as np
from shapely.geometry import box
import os


In [2]:
# Folder that holds the hex-level input table; the rasters produced further
# down in this notebook are written to the same folder.
output_folder_path = "/Volumes/samsung-4tb/b2p/impact-model/cleaned_data/hex_data/"

# Parquet file that is read into the GeoDataFrame to be rasterized.
input_data_path = os.path.join(
    output_folder_path,
    "all_countries_merged_hex8.parquet",
)

In [4]:

def h3_to_raster_efficient(gdf, resolution, output_path, data_columns=None):
    """Rasterize polygon attributes of a GeoDataFrame into a multi-band GeoTIFF.

    One float32 band is written per entry of ``data_columns``. Pixels not
    covered by any geometry are NaN, which is also declared as the nodata
    value of the file. Each band is rasterized and written on its own, so
    only a single ``(height, width)`` array is held in memory at a time.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Frame with a ``geometry`` column and the numeric columns to burn.
    resolution : float
        Target pixel size, in the coordinate units of ``gdf``. The grid is
        fitted to ``gdf.total_bounds`` with a truncated pixel count, so the
        effective pixel size can be slightly larger than ``resolution``.
    output_path : str
        Destination path of the GeoTIFF.
    data_columns : list of str, optional
        Columns to rasterize, one band each (in this order). Defaults to
        every column except ``'geometry'`` and ``'h3_index'``.

    Returns
    -------
    str
        ``output_path``, unchanged.

    Raises
    ------
    ValueError
        If ``resolution`` is not positive, ``gdf`` has no rows, or there are
        no columns to rasterize.
    KeyError
        If a requested column is not present in ``gdf``.
    """
    if resolution <= 0:
        raise ValueError(f"resolution must be positive, got {resolution!r}")
    if len(gdf) == 0:
        # total_bounds of an empty frame is NaN and cannot define a grid.
        raise ValueError("gdf has no rows; nothing to rasterize")

    # Determine data columns
    if data_columns is None:
        data_columns = [col for col in gdf.columns if col not in ('geometry', 'h3_index')]
    else:
        data_columns = list(data_columns)
    if not data_columns:
        raise ValueError("no data columns to rasterize")

    # Fail before the output file is created rather than half-way through.
    missing = [col for col in data_columns if col not in gdf.columns]
    if missing:
        raise KeyError(f"columns not found in gdf: {missing}")

    # Grid geometry: extent from the data, pixel counts from the resolution.
    min_x, min_y, max_x, max_y = gdf.total_bounds

    # Keep at least one pixel per axis so an extent smaller than the
    # resolution still yields a valid raster instead of a 0-sized one.
    width = max(1, int((max_x - min_x) / resolution))
    height = max(1, int((max_y - min_y) / resolution))

    transform = from_bounds(min_x, min_y, max_x, max_y, width, height)

    # The geometries are identical for every band; materialize them once.
    geometries = list(gdf.geometry)

    # NOTE: the file is opened before rasterizing, so a failure while burning
    # a band leaves a partially written file at output_path.
    with rasterio.open(
        output_path,
        'w',
        driver='GTiff',
        height=height,
        width=width,
        count=len(data_columns),
        dtype=np.float32,
        crs=gdf.crs,
        transform=transform,
        nodata=np.nan
    ) as dst:
        # Raster bands are 1-indexed.
        for band_number, col in enumerate(data_columns, start=1):
            band_data = rasterize(
                shapes=list(zip(geometries, gdf[col])),
                out_shape=(height, width),
                transform=transform,
                fill=np.nan,
                all_touched=False,
                dtype=np.float32
            )
            dst.write(band_data, band_number)
            dst.set_band_description(band_number, col)

    return output_path

In [5]:
def h3_to_raster_chunked(gdf, resolution, output_path, data_columns=None, chunk_size=1000):
    """Rasterize polygon attributes into a multi-band GeoTIFF, chunk by chunk.

    For every column a single float32 band buffer (initialised to NaN) is
    allocated and the geometries are burned into it ``chunk_size`` rows at a
    time, directly in place. Rows whose value is missing (NaN/None) are not
    burned, so they never overwrite a value written by another row. NaN is
    declared as the nodata value of the file.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Frame with a ``geometry`` column and the numeric columns to burn.
    resolution : float
        Target pixel size, in the coordinate units of ``gdf``. The grid is
        fitted to ``gdf.total_bounds`` with a truncated pixel count, so the
        effective pixel size can be slightly larger than ``resolution``.
    output_path : str
        Destination path of the GeoTIFF.
    data_columns : list of str, optional
        Columns to rasterize, one band each (in this order). Defaults to
        every column except ``'geometry'`` and ``'h3_index'``.
    chunk_size : int, optional
        Number of rows rasterized per call (default 1000).

    Returns
    -------
    str
        ``output_path``, unchanged.

    Raises
    ------
    ValueError
        If ``resolution`` is not positive, ``chunk_size`` is smaller than 1,
        ``gdf`` has no rows, or there are no columns to rasterize.
    KeyError
        If a requested column is not present in ``gdf``.
    """
    if resolution <= 0:
        raise ValueError(f"resolution must be positive, got {resolution!r}")
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size!r}")

    n_rows = len(gdf)
    if n_rows == 0:
        # total_bounds of an empty frame is NaN and cannot define a grid.
        raise ValueError("gdf has no rows; nothing to rasterize")

    # Determine data columns
    if data_columns is None:
        data_columns = [col for col in gdf.columns if col not in ('geometry', 'h3_index')]
    else:
        data_columns = list(data_columns)
    if not data_columns:
        raise ValueError("no data columns to rasterize")

    # Fail before the output file is created rather than half-way through.
    missing = [col for col in data_columns if col not in gdf.columns]
    if missing:
        raise KeyError(f"columns not found in gdf: {missing}")

    # Grid geometry: extent from the data, pixel counts from the resolution.
    min_x, min_y, max_x, max_y = gdf.total_bounds

    # Keep at least one pixel per axis so an extent smaller than the
    # resolution still yields a valid raster instead of a 0-sized one.
    width = max(1, int((max_x - min_x) / resolution))
    height = max(1, int((max_y - min_y) / resolution))

    transform = from_bounds(min_x, min_y, max_x, max_y, width, height)

    # Ceil division; the chunk layout is the same for every band.
    num_chunks = (n_rows + chunk_size - 1) // chunk_size

    # Create the raster file first
    with rasterio.open(
        output_path,
        'w',
        driver='GTiff',
        height=height,
        width=width,
        count=len(data_columns),
        dtype=np.float32,
        crs=gdf.crs,
        transform=transform,
        nodata=np.nan
    ) as dst:
        # Process each data column separately
        for band_idx, col in enumerate(data_columns):
            print(f"Processing column: {col} (band {band_idx+1}/{len(data_columns)})")

            # The only full-size array kept alive for this band.
            band_data = np.full((height, width), np.nan, dtype=np.float32)

            for chunk_idx, start_idx in enumerate(range(0, n_rows, chunk_size)):
                end_idx = min(start_idx + chunk_size, n_rows)
                chunk_gdf = gdf.iloc[start_idx:end_idx]

                print(f"  Processing chunk {chunk_idx+1}/{num_chunks} ({start_idx}:{end_idx})")

                # Drop missing values so they cannot overwrite pixels that
                # already hold data.
                shapes = [
                    (geom, value)
                    for geom, value in zip(chunk_gdf.geometry, chunk_gdf[col])
                    if pd.notna(value)
                ]

                # rasterize() rejects an empty shape list, so skip such chunks.
                if shapes:
                    # Burn straight into the band buffer: no per-chunk
                    # full-size temporary and no mask-based merge.
                    rasterize(
                        shapes=shapes,
                        out=band_data,
                        transform=transform,
                        all_touched=False
                    )

            # Write the band to the raster file (bands are 1-indexed)
            dst.write(band_data, band_idx + 1)
            dst.set_band_description(band_idx + 1, col)

    return output_path

In [6]:
# Read the parquet file at input_data_path into a GeoDataFrame.
hex_vector = gpd.read_parquet(input_data_path)
# Bare last expression so the notebook renders a preview of the frame.
hex_vector

Unnamed: 0,h3_index,population,pop_0_4,females_0_4,males_0_4,pop_5_9,females_5_9,males_5_9,pop_10_14,females_10_14,...,travel_time_no_sites_all_education,travel_time_major_hospitals,travel_time_no_sites_major_hospitals,travel_time_primary_schools,time_delta_no_sites_primary_schools,travel_time_all_education,time_delta_no_sites_all_education,travel_time_no_sites_health_posts,geometry,country_name
0,887512209bfffff,5,1,0,0,0,0,0,0,0,...,,0,0.0,1607,,1599,,358.0,"POLYGON ((-6.50082 7.36543, -6.50476 7.36330, ...",civ
1,8875ae4635fffff,22,3,2,1,3,1,1,2,1,...,884.0,0,0.0,952,0.0,884,0.0,341.0,"POLYGON ((-8.16296 6.42187, -8.16688 6.41973, ...",civ
2,88753244dbfffff,40,7,3,3,6,3,3,4,2,...,367.0,1154,1154.0,367,0.0,367,0.0,271.0,"POLYGON ((-5.39022 9.68681, -5.39424 9.68467, ...",civ
3,8875ab8c3bfffff,8,1,0,0,1,0,0,0,0,...,1426.0,897,897.0,0,0.0,1426,0.0,181.0,"POLYGON ((-7.33689 5.25591, -7.34074 5.25381, ...",civ
4,8875ad3897fffff,25,4,2,2,3,1,1,3,1,...,559.0,998,998.0,956,0.0,559,0.0,272.0,"POLYGON ((-6.22794 6.03084, -6.23181 6.02875, ...",civ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2064788,8896315601fffff,2,0,0,0,0,0,0,0,0,...,479.0,996,996.0,479,0.0,479,0.0,953.0,"POLYGON ((31.07198 -14.27235, 31.07398 -14.267...",zmb
2064789,889606a35dfffff,10,1,0,0,1,0,0,1,0,...,206.0,0,0.0,206,0.0,206,0.0,748.0,"POLYGON ((29.93108 -13.61891, 29.93309 -13.614...",zmb
2064790,88961c1aa1fffff,15,2,1,1,2,1,1,2,1,...,233.0,1066,1066.0,233,0.0,233,0.0,292.0,"POLYGON ((29.26507 -11.22173, 29.26708 -11.216...",zmb
2064791,889631126bfffff,60,10,4,5,8,4,4,7,3,...,84.0,256,256.0,84,0.0,84,0.0,121.0,"POLYGON ((30.90233 -14.56360, 30.90433 -14.558...",zmb


In [None]:
# Columns of hex_vector to export as raster bands. The order of this list is
# the band order when several columns are written into one file.
data_cols = [
    # columns that are not travel_time_* / time_delta_* measures
    "population",
    "births",
    "pregnancies",
    "rwi",
    "underweight",
    "female_educational_attainment_mean",
    "male_educational_attainment_mean",
] + [
    # travel_time_* and time_delta_* columns
    "travel_time_no_sites_all_health",
    "time_delta_no_sites_semi_dense_urban",
    "travel_time_health_posts",
    "travel_time_major_roads",
    "travel_time_no_sites_secondary_schools",
    "travel_time_secondary_schools",
    "travel_time_no_sites_health_centers",
    "travel_time_no_sites_major_roads",
    "time_delta_no_sites_secondary_schools",
    "time_delta_no_sites_all_health",
    "travel_time_health_centers",
    "time_delta_no_sites_health_centers",
    "time_delta_no_sites_major_roads",
    "travel_time_semi_dense_urban",
    "time_delta_no_sites_major_hospitals",
    "travel_time_all_health",
    "travel_time_no_sites_primary_schools",
    "travel_time_no_sites_semi_dense_urban",
    "time_delta_no_sites_health_posts",
    "travel_time_no_sites_all_education",
    "travel_time_major_hospitals",
    "travel_time_no_sites_major_hospitals",
    "travel_time_primary_schools",
    "time_delta_no_sites_primary_schools",
    "travel_time_all_education",
    "time_delta_no_sites_all_education",
    "travel_time_no_sites_health_posts",
]

In [8]:
# Destination of the GeoTIFF, placed inside output_folder_path.
output_file_path = os.path.join(output_folder_path, "all_countries_merged_hex8.tif")

In [10]:
# Export every column in data_cols as one band of a single GeoTIFF, using a
# pixel size of 0.1 in the coordinate units of hex_vector.
h3_to_raster_efficient(
    hex_vector,
    resolution=0.1,
    output_path=output_file_path,
    data_columns=data_cols,
)


'/Volumes/samsung-4tb/b2p/impact-model/cleaned_data/hex_data/all_countries_merged_hex8.tif'

In [11]:
# Export two selected columns into a separate two-band GeoTIFF, using a
# pixel size of 0.005 in the coordinate units of hex_vector.
h3_to_raster_efficient(
    hex_vector,
    resolution=0.005,
    output_path=os.path.join(output_folder_path, "all_countries_merged_hex8_005.tif"),
    data_columns=[
        "travel_time_major_roads",
        "travel_time_no_sites_all_education",
    ],
)


'/Volumes/samsung-4tb/b2p/impact-model/cleaned_data/hex_data/all_countries_merged_hex8_005.tif'

In [None]:
# Write one single-band GeoTIFF per column in data_cols (pixel size 0.005);
# the column name is embedded in the output file name.
for d in data_cols:
    h3_to_raster_efficient(
        hex_vector,
        resolution=0.005,
        output_path=os.path.join(
            output_folder_path,
            f"all_countries_merged_hex8_{d}.tif",
        ),
        data_columns=[d],
    )