# Spatial Data Collection and Preprocessing

This notebook collects and preprocesses spatial data from OpenStreetMap (OSM).

## Data Sources

In this section, we provide links to the diverse array of data sources used in our analysis. These sources cover a wide range of domains crucial for understanding food security dynamics, including meteorological data repositories, governmental databases, and market price indices. By using these reputable and diverse sources, we ensure the richness and reliability of the data underpinning our analysis. The data are publicly available from the following sources:

### Hospital and School Data
Collected from [OpenStreetMap](https://www.openstreetmap.org/).

OSMnx library: [Documentation](https://osmnx.readthedocs.io/en/stable/)

[Education](https://wiki.openstreetmap.org/wiki/Map_features#Education)
[Healthcare](https://wiki.openstreetmap.org/wiki/Map_features#Healthcare)

[Reference: number of hospitals in Tanzania by region (Statista)](https://www.statista.com/statistics/1297787/number-of-hospitals-in-tanzania-by-region/)

### Import Libraries

In [17]:
#!pip install osmnx

In [18]:
import pandas as pd
import os
from pandas import read_csv
import geopandas as gpd
import osmnx as ox
from geopandas import read_file
import matplotlib.pyplot as plt
%matplotlib inline
import geopandas as gpd
import osmnx as ox
import pandas as pd
import os

## Helper Function

## Visualization Function

## Extract Data From OpenStreet Map

In [19]:
def extract_amenities_with_bbox(districts_shapefile, output_directory):
    """
    Count OSM health and education amenities per district and save the results.

    For each amenity type (hospital, clinic, school, college, university) the
    features inside the shapefile's bounding box are downloaded from
    OpenStreetMap, spatially joined to the district polygons, and counted.

    Files written to ``output_directory``:
    - ``<count_column>_within_districts.geojson`` for every amenity type
    - ``data_hospital_education.csv`` and ``data_hospital_education.xlsx``
      with one row per shapefile row and one count column per amenity type

    Parameters:
    - districts_shapefile (str): The path to the shapefile containing district polygons.
    - output_directory (str): The directory where the output CSV, Excel, and GeoJSON files will be saved.

    Raises:
    - ValueError: If the shapefile has neither a 'region' nor a 'province'
      column, or has no 'district' column.
    """

    # Load the district shapefile
    districts = gpd.read_file(districts_shapefile)

    # Determine the column name for the region or province
    if 'region' in districts.columns:
        region_col = 'region'
    elif 'province' in districts.columns:
        region_col = 'province'
    else:
        raise ValueError("The shapefile must contain either a 'region' or 'province' column.")

    if 'district' not in districts.columns:
        raise ValueError("The shapefile must contain a 'district' column.")

    # The GeoJSON files are written inside the loop below, so the directory
    # has to exist before the first amenity is processed.
    os.makedirs(output_directory, exist_ok=True)

    # Bounding box of the entire shapefile
    minx, miny, maxx, maxy = districts.total_bounds

    # OSM amenity tag value -> name of the count column in the output table
    amenities = {
        'hospital': 'no_of_hospitals',
        'clinic': 'no_of_clinics',
        'school': 'no_of_schools',
        'college': 'no_of_colleges',
        'university': 'no_of_universities'
    }

    # Districts are identified by (region, district) so that two districts
    # sharing a name in different regions do not receive each other's counts.
    district_keys = [region_col, 'district']
    district_amenity_counts = districts[district_keys].copy()

    for amenity, output_column in amenities.items():
        # Download the amenity data from OSM; the bounds are passed positionally
        # in the order maxy, miny, maxx, minx.
        # NOTE(review): the positional bbox form depends on the installed OSMnx
        # version - confirm against the version in use.
        amenity_data = ox.features_from_bbox(maxy, miny, maxx, minx, tags={'amenity': amenity})

        # Ensure both GeoDataFrames have the same CRS
        if districts.crs != amenity_data.crs:
            amenity_data = amenity_data.to_crs(districts.crs)

        # Keep only the geometry column and only point features.
        # NOTE: amenities whose geometry is not a Point are excluded from the counts.
        amenity_data = amenity_data[['geometry']]
        amenity_data = amenity_data[amenity_data.geometry.type == 'Point']

        # Spatial join to find amenities within districts
        amenities_within_districts = gpd.sjoin(amenity_data, districts, how='inner', predicate='within')

        # Save the amenities within districts to a GeoJSON file
        geojson_output_path = f"{output_directory}/{output_column}_within_districts.geojson"
        amenities_within_districts.to_file(geojson_output_path, driver='GeoJSON')
        print(f"GeoJSON file for {amenity} is saved to: {geojson_output_path}")

        # Count amenities per (region, district); dropna=False keeps districts
        # whose region value is missing.
        amenities_count = (
            amenities_within_districts
            .groupby(district_keys, dropna=False)
            .size()
            .reset_index(name=output_column)
        )

        # Merge the counts with the main dataframe
        district_amenity_counts = district_amenity_counts.merge(amenities_count, on=district_keys, how='left')

    # Districts without any match have NaN counts after the left merges; turn
    # those into 0 and store counts as integers. Only the count columns are
    # touched so missing region/district names are not overwritten with 0.
    count_columns = list(amenities.values())
    district_amenity_counts[count_columns] = district_amenity_counts[count_columns].fillna(0).astype(int)

    # Save the amenities count to a CSV file
    csv_output_path = f"{output_directory}/data_hospital_education.csv"
    district_amenity_counts.to_csv(csv_output_path, index=False)

    # Save the amenities count to an Excel file
    excel_output_path = f"{output_directory}/data_hospital_education.xlsx"
    district_amenity_counts.to_excel(excel_output_path, index=False)

    print(f"CSV file of amenities count is saved to: {csv_output_path}")
    print(f"Excel file of amenities count is saved to: {excel_output_path}")


In [46]:
def extract_waterways(district_shapefile_path, output_directory):
    """
    Count the OSM waterways intersecting each district and normalize by area.

    Water features inside the shapefile's bounding box are downloaded from
    OpenStreetMap in a single request, reduced to LineString geometries, and
    counted per district. Counts are divided by the district area (km^2).

    Files written to ``output_directory``:
    - ``data_waterways.csv`` and ``data_waterways.xlsx``: one row per
      (region, district) with the count, area, and count per km^2
    - ``waterways_within_districts.geojson``: the waterways matched to the
      districts (a waterway crossing several districts appears once per district)

    Parameters:
    - district_shapefile_path (str): The path to the shapefile containing district polygons.
    - output_directory (str): The directory where the output CSV, Excel, and GeoJSON files will be saved.

    Raises:
    - ValueError: If the shapefile has neither a 'region' nor a 'province'
      column, or contains no polygon districts.
    """

    # Load the district shapefile
    districts_gdf = gpd.read_file(district_shapefile_path)

    if 'region' in districts_gdf.columns:
        region_column_name = 'region'
    elif 'province' in districts_gdf.columns:
        region_column_name = 'province'
    else:
        raise ValueError("Region column not found. Please check the column names in the shapefile.")

    # Bounding box of the entire shapefile
    minx, miny, maxx, maxy = districts_gdf.total_bounds

    water_tags = {
        'waterway': True,  # general tag for all types of waterways
        'natural': ['water', 'wetland'],  # natural water features
    }

    # Download the water features from OSM; the bounds are passed positionally
    # in the order maxy, miny, maxx, minx.
    # NOTE(review): the positional bbox form depends on the installed OSMnx
    # version - confirm against the version in use.
    waterways = ox.features_from_bbox(maxy, miny, maxx, minx, tags=water_tags)

    # Ensure both GeoDataFrames have the same CRS
    if districts_gdf.crs != waterways.crs:
        waterways = waterways.to_crs(districts_gdf.crs)

    # Keep only the columns needed below. MultiPolygon districts (for example
    # districts made of several separate parts) are kept as well; filtering on
    # 'Polygon' alone silently drops them from every output.
    districts_gdf = districts_gdf[[region_column_name, 'geometry', 'district']]
    districts_gdf = districts_gdf[districts_gdf.geometry.type.isin(['Polygon', 'MultiPolygon'])]
    if districts_gdf.empty:
        raise ValueError("No polygon districts found in the shapefile.")

    # Only LineString features are counted; other geometry types returned for
    # the tags above are discarded here.
    waterways = waterways[['geometry']]
    waterways = waterways[waterways.geometry.type == 'LineString']

    # Areas need a projected CRS in metres. EPSG:3395 (World Mercator) is not
    # an equal-area projection, so areas are approximate and the distortion
    # grows with distance from the equator.
    if districts_gdf.crs.is_geographic:
        districts_gdf = districts_gdf.to_crs(epsg=3395)
    if waterways.crs.is_geographic:
        waterways = waterways.to_crs(epsg=3395)

    # Per-district result rows and the matched waterways for the GeoJSON output
    results = []
    waterways_geojson_list = []

    # Every district row yields exactly one result row, so districts without
    # waterways are reported with a count of 0.
    for _, row in districts_gdf.iterrows():
        district_geom = row['geometry']
        district_area_km2 = district_geom.area / 1e6  # m^2 -> km^2

        # Waterways that intersect the district
        waterways_within_district = waterways[waterways.intersects(district_geom)]
        waterways_geojson_list.append(waterways_within_district)

        num_waterways = len(waterways_within_district)

        # Normalize the number of waterways by area (waterways per km^2)
        waterways_per_km2 = num_waterways / district_area_km2 if district_area_km2 != 0 else 0

        results.append({
            region_column_name: row[region_column_name],
            'district': row['district'],
            'no_of_waterways': num_waterways,
            'area_km2': district_area_km2,
            'waterways_per_km2': waterways_per_km2
        })

    results_df = pd.DataFrame(results)

    # Collapse rows sharing the same (region, district): counts are summed, the
    # first area is kept, and the per-km^2 values are averaged.
    grouped_df = results_df.groupby([region_column_name, 'district']).agg({
        'no_of_waterways': 'sum',
        'area_km2': 'first',
        'waterways_per_km2': 'mean'
    }).reset_index()

    # Make sure the output directory exists before anything is written
    os.makedirs(output_directory, exist_ok=True)

    # Define output file paths
    output_csv_path = os.path.join(output_directory, "data_waterways.csv")
    output_excel_path = os.path.join(output_directory, "data_waterways.xlsx")
    output_geojson_path = os.path.join(output_directory, "waterways_within_districts.geojson")

    # Save the tabular results
    grouped_df.to_csv(output_csv_path, index=False)
    grouped_df.to_excel(output_excel_path, index=False)

    # Combine the per-district GeoDataFrames and save them as one GeoJSON file
    waterways_within_districts = gpd.GeoDataFrame(pd.concat(waterways_geojson_list, ignore_index=True))
    waterways_within_districts.to_file(output_geojson_path, driver='GeoJSON')

    print(f"CSV file of waterways data is saved to: {output_csv_path}")
    print(f"Excel file of waterways data is saved to: {output_excel_path}")
    print(f"GeoJSON file of waterways within districts is saved to: {output_geojson_path}")


In [1]:
import os
import pandas as pd
import geopandas as gpd
import osmnx as ox

def extract_waterways_with_missing(district_shapefile_path, missing_districts_csv_path, output_directory):
    """
    Count and normalize OSM waterways for a listed subset of districts.

    Only the districts whose name appears in the 'district' column of the
    CSV are processed. For each of them the water features inside the
    district's own bounding box are downloaded from OpenStreetMap (one request
    per district), reduced to LineString geometries, and counted. Counts are
    divided by the district area (km^2).

    Files written to ``output_directory``:
    - ``data_waterways_missing.csv`` and ``data_waterways_missing.xlsx``
    - ``waterways_within_districts_missing.geojson``

    Parameters:
    - district_shapefile_path (str): The path to the shapefile containing district polygons.
    - missing_districts_csv_path (str): The path to the CSV file containing the missing districts.
    - output_directory (str): The directory where the output CSV, Excel, and GeoJSON files will be saved.

    Raises:
    - ValueError: If the shapefile has neither a 'region' nor a 'province'
      column, or none of the listed districts is present in the shapefile.
    """

    # Load the district shapefile
    districts_gdf = gpd.read_file(district_shapefile_path)

    # Load the missing districts CSV
    missing_districts_df = pd.read_csv(missing_districts_csv_path)
    missing_districts_list = missing_districts_df['district'].tolist()

    # Filter districts_gdf to include only the missing districts
    districts_gdf = districts_gdf[districts_gdf['district'].isin(missing_districts_list)]

    if 'region' in districts_gdf.columns:
        region_column_name = 'region'
    elif 'province' in districts_gdf.columns:
        region_column_name = 'province'
    else:
        raise ValueError("Region column not found. Please check the column names in the shapefile.")

    if districts_gdf.empty:
        raise ValueError("None of the districts listed in the CSV were found in the shapefile.")

    # Areas must be measured in a projected CRS (metres); in a geographic CRS
    # `.area` is in squared degrees and dividing by 1e6 does not give km^2.
    # EPSG:3395 (World Mercator) is not an equal-area projection, so the areas
    # are approximate.
    if districts_gdf.crs is not None and districts_gdf.crs.is_geographic:
        projected_geoms = districts_gdf.geometry.to_crs(epsg=3395)
    else:
        projected_geoms = districts_gdf.geometry
    areas_km2 = (projected_geoms.area / 1e6).tolist()

    # Per-district result rows and the matched waterways for the GeoJSON output
    results = []
    waterways_geojson_list = []

    # Define tags to search for waterways in OSM
    water_tags = {
        'waterway': True,  # general tag for all types of waterways
        'natural': ['water', 'wetland'],  # natural water features
    }

    # Every district row yields exactly one result row, so districts without
    # waterways are reported with a count of 0.
    for (_, row), district_area_km2 in zip(districts_gdf.iterrows(), areas_km2):
        district_geom = row['geometry']
        district_name = row['district']

        # Bounding box of the district in the shapefile's own coordinates.
        # NOTE(review): the bounds are sent to OSM unchanged, which presumes the
        # shapefile is in longitude/latitude - confirm for the input data.
        minx, miny, maxx, maxy = district_geom.bounds

        # Download the water features from OSM; the bounds are passed
        # positionally in the order maxy, miny, maxx, minx.
        # NOTE(review): the positional bbox form depends on the installed OSMnx
        # version - confirm against the version in use.
        waterways = ox.features_from_bbox(maxy, miny, maxx, minx, tags=water_tags)

        # Ensure both GeoDataFrames have the same CRS
        if districts_gdf.crs != waterways.crs:
            waterways = waterways.to_crs(districts_gdf.crs)

        # Only LineString features are counted
        waterways = waterways[waterways.geometry.type == 'LineString']

        # Waterways that intersect the district
        waterways_within_district = waterways[waterways.intersects(district_geom)]
        waterways_geojson_list.append(waterways_within_district)

        num_waterways = len(waterways_within_district)

        # Normalize the number of waterways by area (waterways per km^2)
        waterways_per_km2 = num_waterways / district_area_km2 if district_area_km2 != 0 else 0

        results.append({
            region_column_name: row[region_column_name],
            'district': district_name,
            'no_of_waterways': num_waterways,
            'area_km2': district_area_km2,
            'waterways_per_km2': waterways_per_km2
        })

    results_df = pd.DataFrame(results)

    # Collapse rows sharing the same (region, district): counts are summed, the
    # first area is kept, and the per-km^2 values are averaged.
    grouped_df = results_df.groupby([region_column_name, 'district']).agg({
        'no_of_waterways': 'sum',
        'area_km2': 'first',
        'waterways_per_km2': 'mean'
    }).reset_index()

    # Make sure the output directory exists before anything is written
    os.makedirs(output_directory, exist_ok=True)

    # Define output file paths
    output_csv_path = os.path.join(output_directory, "data_waterways_missing.csv")
    output_excel_path = os.path.join(output_directory, "data_waterways_missing.xlsx")
    output_geojson_path = os.path.join(output_directory, "waterways_within_districts_missing.geojson")

    # Save the tabular results
    grouped_df.to_csv(output_csv_path, index=False)
    grouped_df.to_excel(output_excel_path, index=False)

    # Combine the per-district GeoDataFrames and save them as one GeoJSON file
    waterways_within_districts = gpd.GeoDataFrame(pd.concat(waterways_geojson_list, ignore_index=True))
    waterways_within_districts.to_file(output_geojson_path, driver='GeoJSON')

    print(f"CSV file of waterways data is saved to: {output_csv_path}")
    print(f"Excel file of waterways data is saved to: {output_excel_path}")
    print(f"GeoJSON file of waterways within districts is saved to: {output_geojson_path}")


### Extract Hospital, School, Marketplace, and Waterway Data for Tanzania

In [3]:
# Tanzania run: every input and output path lives under the country data directory
data_dir = 'tanzania_data/'
shapefile = os.path.join(data_dir, 'shapefiles/tz_districts.shp')
missing = os.path.join(data_dir, 'hospital_school_data/missing_districts.csv')
output_directory = os.path.join(data_dir, 'hospital_school_data/new/')

# Alternative extractions, currently disabled:
# extract_waterways(shapefile, output_directory)  # waterways for all districts
# extract_amenities_with_bbox(shapefile, output_directory)

# Waterways for the districts listed in the missing-districts CSV only
extract_waterways_with_missing(shapefile, missing, output_directory)

  waterways = ox.features_from_bbox(maxy, miny, maxx, minx, tags=water_tags)
  waterways = ox.features_from_bbox(maxy, miny, maxx, minx, tags=water_tags)
  waterways = ox.features_from_bbox(maxy, miny, maxx, minx, tags=water_tags)
  waterways = ox.features_from_bbox(maxy, miny, maxx, minx, tags=water_tags)
  waterways = ox.features_from_bbox(maxy, miny, maxx, minx, tags=water_tags)
  waterways = ox.features_from_bbox(maxy, miny, maxx, minx, tags=water_tags)
  waterways = ox.features_from_bbox(maxy, miny, maxx, minx, tags=water_tags)
  waterways = ox.features_from_bbox(maxy, miny, maxx, minx, tags=water_tags)
  waterways = ox.features_from_bbox(maxy, miny, maxx, minx, tags=water_tags)
  waterways = ox.features_from_bbox(maxy, miny, maxx, minx, tags=water_tags)
  waterways = ox.features_from_bbox(maxy, miny, maxx, minx, tags=water_tags)
  waterways = ox.features_from_bbox(maxy, miny, maxx, minx, tags=water_tags)
  waterways = ox.features_from_bbox(maxy, miny, maxx, minx, tags=water_tags)

  waterways = ox.features_from_bbox(maxy, miny, maxx, minx, tags=water_tags)
  waterways = ox.features_from_bbox(maxy, miny, maxx, minx, tags=water_tags)
  waterways = ox.features_from_bbox(maxy, miny, maxx, minx, tags=water_tags)
  waterways = ox.features_from_bbox(maxy, miny, maxx, minx, tags=water_tags)
  waterways = ox.features_from_bbox(maxy, miny, maxx, minx, tags=water_tags)
  waterways = ox.features_from_bbox(maxy, miny, maxx, minx, tags=water_tags)
  waterways = ox.features_from_bbox(maxy, miny, maxx, minx, tags=water_tags)


UnboundLocalError: local variable 'response' referenced before assignment

### Extract Hospital and School Data For Rwanda

In [48]:
# Rwanda run: country label and the data directory all paths are built from
region_name = 'Rwanda'
data_dir = 'rwanda_data/'

# District shapefile (input) and the directory that receives the results
shapefile = os.path.join(data_dir, 'shapefiles/rw_district.shp')
output_directory = os.path.join(data_dir, 'hospital_school_data')

# Extractions, currently disabled:
# extract_waterways(shapefile, output_directory)
# extract_amenities_with_bbox(shapefile, output_directory)