### Workbook for comparing population counts for villages surrounding Rwanda bridges
Weeks of April 21 & 28, 2025 |
Author: Adele Birkenes

In [14]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, LineString, Polygon
import rasterio
import os
import numpy as np

Task 1: Read in needs assessment data as dataframe and Rwanda village boundaries as geodataframe

In [15]:
# Specify file path for needs assessment data
needs_assessment_fp = os.path.join("../../synced-data/population-exploration/", "Rwanda Sites with All Population Fields_Exported 2025.04.11.csv")

# Read the .csv file into a dataframe with an encoding that accommodates special characters
needs_assessment = pd.read_csv(needs_assessment_fp, encoding='ISO-8859-1')

# Print first 3 rows of needs assessment data
print(needs_assessment.head(3))

  Bridge Opportunity: CaseSafeID Bridge Opportunity: Project Code  \
0             006f100000a86CpAAI                          1007327   
1             006f100000a86CpAAI                          1007327   
2             006f100000a86CtAAI                          1007331   

  Bridge Opportunity: Country Bridge Opportunity: Opportunity Name  \
0                      Rwanda            Rwanda - Gikana - 1007327   
1                      Rwanda            Rwanda - Gikana - 1007327   
2                      Rwanda        Rwanda - Nyamigende - 1007331   

  Bridge Opportunity: Stage Bridge Opportunity: Sub-Stage  \
0                  Rejected                     Technical   
1                  Rejected                     Technical   
2                  Rejected                           NaN   

  Bridge Opportunity: Close Date Bridge Opportunity: Level 1 Government  \
0                       11/30/18                       Eastern Province   
1                       11/30/18               

In [16]:
# Specify file path for Rwanda village boundaries
Rwanda_village_boundaries_fp = os.path.join("../../synced-data/population-exploration/Rwanda Village Boundaries", "Village.shp")

# Read the shapefile into a geodataframe
Rwanda_village_boundaries = gpd.read_file(Rwanda_village_boundaries_fp)

# Print CRS of geodataframe
print(f'The CRS of Rwanda_village_boundaries is {Rwanda_village_boundaries.crs}\n')

# Print first 3 rows of village boundaries data
print(Rwanda_village_boundaries.head(3))

The CRS of Rwanda_village_boundaries is PROJCS["TM_Rwanda",GEOGCS["ITRF2005",DATUM["International_Terrestrial_Reference_Frame_2005",SPHEROID["GRS 1980",6378137,298.257222101,AUTHORITY["EPSG","7019"]],AUTHORITY["EPSG","6896"]],PRIMEM["Greenwich",0],UNIT["Degree",0.0174532925199433]],PROJECTION["Transverse_Mercator"],PARAMETER["latitude_of_origin",0],PARAMETER["central_meridian",30],PARAMETER["scale_factor",0.9999],PARAMETER["false_easting",500000],PARAMETER["false_northing",5000000],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH]]

     District Village_ID Cell_ID Sector_ID Distr_ID Prov_ID        Name  \
0  Nyarugenge   11010102  110101      1101       11       1     Gihanga   
1  Nyarugenge   11010103  110101      1101       11       1  Iterambere   
2  Nyarugenge   11010104  110101      1101       11       1       Izuba   

        Cell  Sector                      Province  \
0  Akabahizi  Gitega  Kigali Town/Umujyi wa Kigali   
1  Akabahizi  Git

Task 2: Filter bridges to those that have been completed and create new fields to list the villages associated with each bridge and their summed population counts (according to the needs assessments)

In [17]:
# Filter bridges to those that have been completed
completed_bridges = needs_assessment[needs_assessment['Bridge Opportunity: Stage'] == 'Complete'].copy()

# Combine village names from six columns into one list per bridge
village_columns = ['Community Served 1', 'Community Served 2', 'Community Served 3', 'Community Served 4', 'Community Served 5', 'Community Served 6']
completed_bridges['Village List'] = completed_bridges[village_columns].apply(lambda row: [village for village in row if pd.notna(village)], axis=1)

# Sum population counts for all villages served by each bridge
population_columns = ['Individuals served in Community 1', 'Individuals served in Community 2',
                     'Individuals served in Community 3', 'Individuals served in Community 4',
                     'Individuals served in Community 5', 'Individuals Served in Community 6'] #note that case is different for community 6
completed_bridges['Total Needs Assessment Pop'] = completed_bridges[population_columns].apply(lambda row: row.sum(), axis=1)

# Display the results
print(completed_bridges[['Village List', 'Total Needs Assessment Pop']].head())

# Double-check summing - Display rows where the total sum does not match the sum of the six population columns
mismatched_rows = completed_bridges[completed_bridges['Total Needs Assessment Pop'] != completed_bridges[population_columns].sum(axis=1)]
print(f'\nMismatched rows: {mismatched_rows}')

                                      Village List  Total Needs Assessment Pop
24  [Kiziba, Bishingwe, Kabukoko, Ntazi, Mugonero]                      2946.0
31        [Cyabami, Rugaragara, Butunda, Kadehero]                      2554.0
71               [Nyabizi 1, Nyabizi 2, Nyabizi 3]                      1854.0
85    [Gisenyi, Mwambi, Nyakigezi, Mubuga, Kibaya]                      5142.0
86                               [Kibaya, Gisenyi]                      1230.0

Mismatched rows: Empty DataFrame
Columns: [Bridge Opportunity: CaseSafeID, Bridge Opportunity: Project Code, Bridge Opportunity: Country, Bridge Opportunity: Opportunity Name, Bridge Opportunity: Stage, Bridge Opportunity: Sub-Stage, Bridge Opportunity: Close Date, Bridge Opportunity: Level 1 Government, Bridge Opportunity: Level 2 Government, Bridge Opportunity: Level 3 Government, Bridge Opportunity: Level 4 Government, Bridge Name, Bridge Opportunity: River Name, Bridge Opportunity: Research Initiative, Bridge Opport

Task 3: Create dataframe of communities included in needs assessments, where each row is a community (admin 5) and contains the community's associated population

In [18]:
# Add unique identifier to each row in needs assessment dataframe to allow for matching of communities and their population counts
# Note: Cannot use CaseSafeID, as some CaseSafeIDs correspond to multiple rows in dataset
needs_assessment['Row ID'] = needs_assessment.index

# Select community columns
community_columns = ['Community Served 1', 'Community Served 2', 'Community Served 3', 
                     'Community Served 4', 'Community Served 5', 'Community Served 6']

# Melt the needs assessment dataframe to create a long format table, where each row corresponds to a community served by a bridge
communities_served = needs_assessment.melt(
    id_vars=needs_assessment.columns[:21].tolist() + ['Row ID'],  # Preserve all fields up to "Bridge Opportunity: Population Estimate 5000m" and include "Row ID"
    value_vars=community_columns, 
    var_name='Community Column', 
    value_name='Community Name')

# Display the first few rows of the melted dataframe
print(communities_served.head())

# Select individuals served count columns
individuals_served_columns = ['Individuals served in Community 1', 'Individuals served in Community 2', 
                              'Individuals served in Community 3', 'Individuals served in Community 4', 
                              'Individuals served in Community 5', 'Individuals Served in Community 6']

# Create a separate melted dataframe for individuals served counts
individuals_served = needs_assessment.melt(
    id_vars=['Bridge Opportunity: CaseSafeID', 'Row ID'],
    value_vars=individuals_served_columns,
    var_name='Individuals Served Column',
    value_name='Individuals Served')

# Display the first few rows of the melted individuals served dataframe
print(individuals_served.head())

  Bridge Opportunity: CaseSafeID Bridge Opportunity: Project Code  \
0             006f100000a86CpAAI                          1007327   
1             006f100000a86CpAAI                          1007327   
2             006f100000a86CtAAI                          1007331   
3             006f100000a86CtAAI                          1007331   
4             006f100000efEkkAAE                          1014568   

  Bridge Opportunity: Country Bridge Opportunity: Opportunity Name  \
0                      Rwanda            Rwanda - Gikana - 1007327   
1                      Rwanda            Rwanda - Gikana - 1007327   
2                      Rwanda        Rwanda - Nyamigende - 1007331   
3                      Rwanda        Rwanda - Nyamigende - 1007331   
4                      Rwanda      Rwanda - Nyirarukobwa - 1014568   

  Bridge Opportunity: Stage Bridge Opportunity: Sub-Stage  \
0                  Rejected                     Technical   
1                  Rejected               

In [19]:
# Create new columns for community number in both dataframes (extracting the numbers from the dataframes' respective community columns)
communities_served['Community Number'] = communities_served['Community Column'].str.extract(r'(\d+)').astype(str)
individuals_served['Community Number'] = individuals_served['Individuals Served Column'].str.extract(r'(\d+)').astype(str)

# Merge the two dataframes on row ID and community number
communities_served = communities_served.merge(
    individuals_served,
    on=['Row ID', 'Community Number'],
    how='left')

# Drop rows where 'Community Name' is NaN
communities_served = communities_served.dropna(subset=['Community Name'])

# Display the resulting table
print(communities_served.head())

  Bridge Opportunity: CaseSafeID_x Bridge Opportunity: Project Code  \
0               006f100000a86CpAAI                          1007327   
2               006f100000a86CtAAI                          1007331   
3               006f100000a86CtAAI                          1007331   
4               006f100000efEkkAAE                          1014568   
5               006f100000efEkpAAE                          1014569   

  Bridge Opportunity: Country Bridge Opportunity: Opportunity Name  \
0                      Rwanda            Rwanda - Gikana - 1007327   
2                      Rwanda        Rwanda - Nyamigende - 1007331   
3                      Rwanda        Rwanda - Nyamigende - 1007331   
4                      Rwanda      Rwanda - Nyirarukobwa - 1014568   
5                      Rwanda            Rwanda - Kibaza - 1014569   

  Bridge Opportunity: Stage Bridge Opportunity: Sub-Stage  \
0                  Rejected                     Technical   
2                  Rejected   

Task 4: Join needs assessment communities with Rwanda admin boundaries

| Admin level | Name in needs assessment                  | Name in admin boundaries |
|-------------|-------------------------------------------|--------------------------|
| 5           | Community Name                         | Name                     |
| 4           | Bridge Opportunity: Level 4 Government | Cell                   |
| 3           | Bridge Opportunity: Level 3 Government | Sector                 |
| 2           | Bridge Opportunity: Level 2 Government | District               |
| 1           | Bridge Opportunity: Level 1 Government | Province               |

In [20]:
# Create copies of the communities served dataframe and village boundaries geodataframe
communities_served_copy = communities_served.copy()
village_boundaries_copy = Rwanda_village_boundaries.copy()

# Rename columns in village boundaries geodataframe
village_boundaries_copy = village_boundaries_copy.rename(columns={'Name': 'Admin 5',
                                                                  'Cell': 'Admin 4',
                                                                  'Sector': 'Admin 3',
                                                                  'District': 'Admin 2',
                                                                  'Province': 'Admin 1' })

# Rename columns in communities served dataframe
communities_served_copy = communities_served_copy.rename(columns={'Community Name': 'Admin 5',
                                                                  'Bridge Opportunity: Level 4 Government': 'Admin 4',
                                                                  'Bridge Opportunity: Level 3 Government': 'Admin 3',
                                                                  'Bridge Opportunity: Level 2 Government': 'Admin 2',
                                                                  'Bridge Opportunity: Level 1 Government': 'Admin 1'})

Combo 1: Joining on admin 3-5

In [26]:
# Perform a join on admin 3-5
join_combo_1 = village_boundaries_copy.merge(communities_served_copy, 
                                             on=['Admin 3', 'Admin 4', 'Admin 5'],
                                             how='left')

# Identify rows in the communities served dataframe that were not matched in the join
unmatched_communities_combo_1 = communities_served_copy[~communities_served_copy['Admin 3'].isin(join_combo_1['Admin 3']) | 
                                                        ~communities_served_copy['Admin 4'].isin(join_combo_1['Admin 4']) | 
                                                        ~communities_served_copy['Admin 5'].isin(join_combo_1['Admin 5'])]

# Print number of matched & unmatched communities
print(f'Number of matched communities:\n{join_combo_1.shape[0]}')
print(f'Number of unmatched communities:\n{unmatched_communities_combo_1.shape[0]}')

Number of matched communities:
15021
Number of unmatched communities:
3980


Combo 2: Joining on admin 3, 5

In [27]:
# Perform a join on admin 3, 5
join_combo_2 = village_boundaries_copy.merge(communities_served_copy, 
                                             on=['Admin 3', 'Admin 5'],
                                             how='left')

# Identify rows in the communities served dataframe that were not matched in the join
unmatched_communities_combo_2 = communities_served_copy[~communities_served_copy['Admin 3'].isin(join_combo_2['Admin 3']) | 
                                                        ~communities_served_copy['Admin 5'].isin(join_combo_2['Admin 5'])]

# Print number of matched & unmatched communities
print(f'Number of matched communities:\n{join_combo_2.shape[0]}')
print(f'Number of unmatched communities:\n{unmatched_communities_combo_2.shape[0]}')

Number of matched communities:
15561
Number of unmatched communities:
2475


Combo 3: Joining on admin 2-3, 5

In [28]:
# Perform a join on admin 3, 5
join_combo_3 = village_boundaries_copy.merge(communities_served_copy, 
                                             on=['Admin 2', 'Admin 3', 'Admin 5'],
                                             how='left')

# Identify rows in the communities served dataframe that were not matched in the join
unmatched_communities_combo_3 = communities_served_copy[~communities_served_copy['Admin 2'].isin(join_combo_3['Admin 2']) |
                                                        ~communities_served_copy['Admin 3'].isin(join_combo_3['Admin 3']) | 
                                                        ~communities_served_copy['Admin 5'].isin(join_combo_3['Admin 5'])]

# Print number of matched & unmatched communities
print(f'Number of matched communities:\n{join_combo_3.shape[0]}')
print(f'Number of unmatched communities:\n{unmatched_communities_combo_3.shape[0]}')

Number of matched communities:
15547
Number of unmatched communities:
2475


Task 5: Explore results of the joins and identify unmatched communities

Based on these three join combinations, joining on admin 3 & 5 and on admin 2-3 & 5 yields the highest number of matched communities. In both cases, the unmatched community count is 2475. Therefore, the next step is to identify the unmatched communities and determine if they can be matched to the admin boundaries.

Going forward, we will use combo #3 since it includes the highest admin level included in the join combos (admin 2) and thus ensures greater accuracy in matching communities.

In [30]:
# Check for admin 2 mismatches in the unmatched communities from combo 3
admin_2_mismatches = unmatched_communities_combo_3[
    ~unmatched_communities_combo_3['Admin 2'].isin(village_boundaries_copy['Admin 2'])
]
print(f"Number of Admin 2 mismatches: {len(admin_2_mismatches)}")

# Check for admin 3 mismatches in the unmatched communities from combo 3
admin_3_mismatches = unmatched_communities_combo_3[
    ~unmatched_communities_combo_3['Admin 3'].isin(village_boundaries_copy['Admin 3'])
]
print(f"Number of Admin 3 mismatches: {len(admin_3_mismatches)}")

# Check for admin 5 mismatches in the unmatched communities from combo 3
admin_5_mismatches = unmatched_communities_combo_3[
    ~unmatched_communities_combo_3['Admin 5'].isin(village_boundaries_copy['Admin 5'])
]
print(f"Number of Admin 5 mismatches: {len(admin_5_mismatches)}")

Number of Admin 2 mismatches: 0
Number of Admin 3 mismatches: 1861
Number of Admin 5 mismatches: 1045


In [32]:
# Check whether the admin 5 names in the needs assessment data actually match with the admin 3 names in the admin boundaries data
admin_5_to_3_matching = admin_5_mismatches[admin_5_mismatches['Admin 5'].isin(village_boundaries_copy['Admin 3'])]

print(f"Number of Admin 5 names that match Admin 3 names: {len(admin_5_to_3_matching)}")

# Check whether the admin 3 names in the needs assessment data actually match with the admin 5 names in the admin boundaries data
admin_3_to_5_matching = admin_3_mismatches[admin_3_mismatches['Admin 3'].isin(village_boundaries_copy['Admin 5'])]

print(f"Number of Admin 3 names that match Admin 5 names: {len(admin_3_to_5_matching)}")

Number of Admin 5 names that match Admin 3 names: 23
Number of Admin 3 names that match Admin 5 names: 173
